# PROBLEM 3 : Pairwise Feature selection for text

## Chi2 features selection

In [10]:
def read_20ng_and_convert(file_path='20ng.csv'):
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd

    df = pd.read_csv(file_path)
    df['index'] = df['index'].str.replace(r'\d+$', '', regex=True)
    classes = ["alt.atheism", "sci.med", "sci.electronics", "comp.graphics", "talk.politics.guns", "sci.crypt"]
    df = df[df['index'].isin(classes)]

    vectorizer = TfidfVectorizer(use_idf=False)
    sparse_matrix = vectorizer.fit_transform(df['content'])

    sparse_df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=vectorizer.get_feature_names_out())
    sparse_df['index'] = df['index'].to_numpy()
    return sparse_df


In [11]:
df_20ng = read_20ng_and_convert()

In [12]:
df_20ng_x = df_20ng.drop('index', axis=1)
df_20ng_y = df_20ng['index'].to_numpy()

In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_20ng_y = label_encoder.fit_transform(df_20ng_y)


In [7]:
df_20ng_x = df_20ng_x.to_numpy()

In [14]:
from sklearn.feature_selection import SelectKBest, chi2

chi = SelectKBest(score_func=chi2, k=200)
top_200_features = chi.fit_transform(df_20ng_x, df_20ng_y)

top_feature_indices = chi.get_support(indices=True)

chi2_scores = chi.scores_
p_values = chi.pvalues_

In [15]:
chi2_scores

array([1.15409258, 3.89985732, 4.43586859, ..., 0.186339  , 0.186339  ,
       0.55901699])

In [16]:
p_values

array([0.94920501, 0.5639224 , 0.48850958, ..., 0.99925391, 0.99925391,
       0.9898027 ])

In [19]:
df_20ng_x.columns[top_feature_indices]

Index(['3d', 'abortion', 'ac', 'access', 'acm', 'algorithm', 'alt', 'amanda',
       'amp', 'animation',
       ...
       'was', 'we', 'weapons', 'were', 'windows', 'wiretap', 'wpd', 'wwc',
       'yeast', 'you'],
      dtype='object', length=200)

In [21]:
df_20ng_with_top_features = df_20ng_x.iloc[:, top_feature_indices]

In [22]:
df_20ng_with_top_features.shape

(6000, 200)

## Classifiers

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_20ng_with_top_features, df_20ng_y, test_size=0.2, random_state=42)

### Decision Classifier

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)
dt_accuracy

0.995

### L2-reg Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
lr_accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9808333333333333

For decision performace is the same but in this case the number of features are less. For l2-reg logisitic regression, model after chi2 features increases the performance from 0.975 to 0.98.

## Mutual Information features selection

In [29]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Assuming df_20ng_x = features (without 'index' column)
# and df_20ng_y = target labels

mi = SelectKBest(score_func=mutual_info_classif, k=200)
X_selected = mi.fit_transform(df_20ng_x, df_20ng_y)

# To get indices and names of top features
top_feature_indices = mi.get_support(indices=True)



In [32]:
df_20ng_with_top_features = df_20ng_x.iloc[:, top_feature_indices]

## Classifiers

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_20ng_with_top_features, df_20ng_y, test_size=0.2, random_state=42)

### Decision Classifier

In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)
dt_accuracy

0.9966666666666667

### L2-reg Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
lr_accuracy

0.9591666666666666