In [35]:
def read_20ng_and_convert(file_path='20ng.csv'):
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd

    df = pd.read_csv(file_path)
    df['index'] = df['index'].str.replace(r'\d+$', '', regex=True)
    classes = ["alt.atheism", "sci.med", "sci.electronics", "comp.graphics", "talk.politics.guns", "sci.crypt"]
    df = df[df['index'].isin(classes)]

    vectorizer = TfidfVectorizer(use_idf=False)
    sparse_matrix = vectorizer.fit_transform(df['content'])

    sparse_df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=vectorizer.get_feature_names_out())
    sparse_df['index'] = df['index'].to_numpy()
    return sparse_df


In [36]:
df_20ng = read_20ng_and_convert()

In [43]:
df_20ng_x = df_20ng.drop('index', axis=1)
df_20ng_y = df_20ng['index'].to_numpy()

In [44]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_20ng_y = label_encoder.fit_transform(df_20ng_y)


In [45]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)


In [46]:
df_20ng_x_num = df_20ng_x.to_numpy()
lasso.fit(df_20ng_x, df_20ng_y)

In [47]:
coefficients = lasso.coef_

In [48]:
coefficients

array([-0.,  0., -0., ..., -0., -0., -0.])

In [49]:
import numpy as np
top_features_df = df_20ng_x.iloc[:, np.argsort(np.abs(coefficients))[-200:]]


In [50]:
top_features_df.shape

(6000, 200)

## Classifier

In [55]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(top_features_df, df_20ng_y, test_size=0.2, random_state=42)

In [56]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)
dt_accuracy

0.2875