## PROBLEM 3: Pairwise Feature Selection for Text
On 20NG, run feature selection using scikit-learn's built-in "chi2" criterion to select the top 200 features. Rerun the classification task and compare performance with HW3A-PB1. Then repeat the whole pipeline with the "mutual-information" criterion.

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import wordnet
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

#### Preprocessing 20NG

In [None]:
# Load the pickled 20NG training split from the mounted Drive.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# only point this at a pickle you created or otherwise trust.
# Presumably `data` is a dict-like object exposing 'data' (raw documents) and
# 'target' (labels), since those keys are read further down -- confirm.
file_path = '/content/drive/MyDrive/USML/HW 3A/20NG/train_20NG_wo_header_footer_quotes.pkl'

with open(file_path, mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

# Caches shared by every preprocess() call. Both are filled lazily so that
# defining this cell never touches the NLTK corpora.
_STOP_WORD_MEMO = []   # holds the English stop-word frozenset once it is built
_WORDNET_MEMO = {}     # token -> True/False: does WordNet know this token?


def _english_stop_words():
  """Return the NLTK English stop-word set, building it only on first use."""
  if not _STOP_WORD_MEMO:
    _STOP_WORD_MEMO.append(frozenset(stopwords.words('english')))
  return _STOP_WORD_MEMO[0]


def _is_valid_word(word):
  """Return True when WordNet has at least one synset for `word` (memoised)."""
  known = _WORDNET_MEMO.get(word)
  if known is None:
    # The same tokens recur across thousands of documents; query WordNet once each.
    known = bool(wordnet.synsets(word))
    _WORDNET_MEMO[word] = known
  return known


def preprocess(text):
  """Normalise one raw document into a space-joined string of kept tokens.

  Steps, in order:
    1. Delete every character that is not an ASCII letter, a digit, a space
       or a newline. Characters are removed, not replaced by a space, so
       "don't" becomes "dont" and tab-separated words are fused together.
    2. Collapse runs of newlines into a single space.
    3. Lower-case the text and tokenise it with NLTK's word_tokenize.
    4. Drop English stop words, then drop tokens with no WordNet synset.

  Args:
    text: the raw document as a string.

  Returns:
    The surviving tokens joined by single spaces (may be an empty string).
  """
  text = re.sub(r'[^a-zA-Z0-9 \n]', '', text)
  text = re.sub(r'\n+', ' ', text)
  text = text.lower()

  stop_words = _english_stop_words()
  # Stop-word test first: it is the cheap check and short-circuits the WordNet lookup.
  return ' '.join(
    token for token in word_tokenize(text)
    if token not in stop_words and _is_valid_word(token)
  )

# Clean every raw document, then turn the cleaned corpus into a sparse TF-IDF matrix.
preprocessed_data = [preprocess(text) for text in data['data']]
tng_labels = data['target']

# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split performed later in the notebook, so the vocabulary and IDF
# weights also reflect the held-out documents.
vectorizer = TfidfVectorizer()
tng_data = vectorizer.fit_transform(preprocessed_data)

# Sparsity = share of matrix cells that hold no stored value.
n_docs, n_terms = tng_data.shape
num_nonzero_elements = tng_data.nnz
total_elements = n_docs * n_terms
sparsity_ratio = 1 - (num_nonzero_elements / total_elements)
print(f"Sparsity ratio: {sparsity_ratio:.4f}")
print("Shape: ",tng_data.shape)

Sparsity ratio: 0.9983
Shape:  (11314, 32863)


In [None]:
# Hold out 20% of the documents for evaluation; the fixed seed keeps the split repeatable.
(tng_train_data, tng_test_data,
 tng_train_lbl, tng_test_lbl) = train_test_split(
    tng_data,
    tng_labels,
    test_size=0.2,
    random_state=42,
)

#### Chi2

In [None]:
# Rank terms by their chi-squared statistic against the labels (training rows
# only) and keep the 200 highest-scoring columns.
k_chi2_best = SelectKBest(score_func=chi2, k=200)
k_chi2_best.fit(tng_train_data, tng_train_lbl)
tng_chi2 = k_chi2_best.transform(tng_train_data)

#### Mutual Information Gain

In [None]:
def _term_presence_mutual_info(X, y):
    """Score each term by the mutual information between its presence and the label.

    mutual_info_classif treats the features of a sparse matrix as discrete
    (that is what discrete_features='auto' resolves to for sparse input), so on
    raw TF-IDF weights every distinct weight is handled as its own category.
    That rewards terms for merely having many different weights instead of for
    separating the classes. Reducing each column to a 0/1 "term occurs in the
    document" indicator gives the discrete estimator a feature it is meant for.

    Args:
        X: sparse document-term matrix handed over by SelectKBest.
        y: class labels, one per row of X.

    Returns:
        Array of mutual-information scores, one per column of X.
    """
    presence = (X != 0).astype(np.int8)
    return mutual_info_classif(presence, y, discrete_features=True, random_state=42)


# Scores come from term presence; transform() still returns the original
# TF-IDF columns of the 200 selected terms.
k_mi_best = SelectKBest(score_func=_term_presence_mutual_info, k=200)
tng_mutual_info = k_mi_best.fit_transform(tng_train_data, tng_train_lbl)

### Logistic Regression on 20NG

In [None]:
# L2-regularised logistic regression on the chi2-selected features.
# `multi_class` is deliberately not passed: it is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial
# model when the target has more than two classes.
log_reg_chi2 = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
)
log_reg_chi2.fit(tng_chi2, tng_train_lbl)

In [None]:
# L2-regularised logistic regression on the mutual-information-selected features.
# `multi_class` is deliberately not passed: it is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial
# model when the target has more than two classes.
log_reg_mi = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
)
log_reg_mi.fit(tng_mutual_info, tng_train_lbl)

#### Classification performance

In [None]:
# Performance of Logistic Regression in HW3A-PB1


Accuracy: 0.7163
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.67      0.68        97
           1       0.65      0.69      0.67       104
           2       0.68      0.65      0.67       115
           3       0.65      0.65      0.65       123
           4       0.79      0.55      0.65       126
           5       0.72      0.82      0.77       106
           6       0.69      0.72      0.71       109
           7       0.75      0.73      0.74       139
           8       0.50      0.80      0.62       122
           9       0.71      0.78      0.74       102
          10       0.88      0.79      0.83       108
          11       0.92      0.83      0.87       125
          12       0.62      0.65      0.64       114
          13       0.76      0.78      0.77       119
          14       0.76      0.83      0.79       127
          15       0.72      0.77      0.75       122
          16       0.72      0.77      0

In [None]:
# Evaluate the chi2 logistic-regression model on the held-out split.
# The test matrix is reduced with the selector that was fitted on the training rows.
tng_test_chi2 = k_chi2_best.transform(tng_test_data)

print('Performance after chi2')
tng_chi2_pred = log_reg_chi2.predict(tng_test_chi2)
accuracy = accuracy_score(tng_test_lbl, tng_chi2_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification report
# zero_division=0 reports 0.0 for a class that is never predicted, which is
# also what the default does, but without emitting UndefinedMetricWarning.
# A precision of 0.00 in the table therefore can mean "class never predicted".
class_report = classification_report(tng_test_lbl, tng_chi2_pred, zero_division=0)
print("Classification Report:")
print(class_report)

Performance after chi2
Accuracy: 0.5117
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.32      0.43        97
           1       0.56      0.46      0.51       104
           2       0.68      0.60      0.64       115
           3       0.52      0.54      0.53       123
           4       0.80      0.32      0.45       126
           5       0.63      0.60      0.62       106
           6       0.63      0.56      0.59       109
           7       0.78      0.56      0.65       139
           8       0.87      0.48      0.61       122
           9       0.58      0.52      0.55       102
          10       0.74      0.62      0.67       108
          11       0.84      0.74      0.78       125
          12       0.11      0.67      0.18       114
          13       0.82      0.50      0.62       119
          14       0.82      0.57      0.67       127
          15       0.63      0.69      0.66       122
          16      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Evaluate the mutual-information logistic-regression model on the held-out split.
# The test matrix is reduced with the selector that was fitted on the training rows.
tng_test_mutual_info = k_mi_best.transform(tng_test_data)

print('Performance after Mutual Information: ')
tng_mi_pred = log_reg_mi.predict(tng_test_mutual_info)
accuracy = accuracy_score(tng_test_lbl, tng_mi_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification report
# zero_division=0 reports 0.0 for a class that is never predicted, which is
# also what the default does, but without emitting UndefinedMetricWarning.
# A precision of 0.00 in the table therefore can mean "class never predicted".
class_report = classification_report(tng_test_lbl, tng_mi_pred, zero_division=0)
print("Classification Report:")
print(class_report)

Performance after Mutual Information: 
Accuracy: 0.2554
Classification Report:
              precision    recall  f1-score   support

           0       0.14      0.03      0.05        97
           1       0.26      0.36      0.30       104
           2       0.62      0.57      0.60       115
           3       0.33      0.24      0.28       123
           4       0.21      0.07      0.11       126
           5       0.24      0.39      0.30       106
           6       0.31      0.63      0.41       109
           7       0.31      0.06      0.10       139
           8       0.13      0.15      0.14       122
           9       0.15      0.23      0.18       102
          10       0.13      0.37      0.19       108
          11       0.38      0.45      0.41       125
          12       0.26      0.20      0.23       114
          13       0.18      0.18      0.18       119
          14       0.24      0.17      0.19       127
          15       0.39      0.52      0.45       122
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree on 20NG

In [None]:
# Four class-balanced decision trees: {chi2, MI} feature sets x {depth capped
# at 100, unlimited depth}. A fixed random_state makes the trees (and so the
# depths and scores reported below) repeatable from run to run; without it
# scikit-learn's randomised feature ordering can change the fitted tree.

# chi2-selected features
tree_clf_tng_low_depth_chi2 = DecisionTreeClassifier(class_weight='balanced', max_depth=100, random_state=42)
tree_clf_tng_low_depth_chi2.fit(tng_chi2, tng_train_lbl)

tree_clf_tng_max_depth_chi2 = DecisionTreeClassifier(class_weight='balanced', random_state=42)
tree_clf_tng_max_depth_chi2.fit(tng_chi2, tng_train_lbl)

# mutual-information-selected features
tree_clf_tng_low_depth_mi = DecisionTreeClassifier(class_weight='balanced', max_depth=100, random_state=42)
tree_clf_tng_low_depth_mi.fit(tng_mutual_info, tng_train_lbl)

tree_clf_tng_max_depth_mi = DecisionTreeClassifier(class_weight='balanced', random_state=42)
tree_clf_tng_max_depth_mi.fit(tng_mutual_info, tng_train_lbl)

In [None]:
# Show the fitted depth of both chi2 trees: the "low-depth" tree was built with
# max_depth=100, the "max-depth" tree with no depth limit.
print(f"Depth of low-depth chi2 tree: {tree_clf_tng_low_depth_chi2.get_depth()} \nDepth of max-depth chi2 tree: {tree_clf_tng_max_depth_chi2.get_depth()}")

Depth of low-depth chi2 tree: 100 
Depth of max-depth chi2 tree: 280


In [None]:
# Show the fitted depth of both mutual-information trees: the "low-depth" tree
# was built with max_depth=100, the "max-depth" tree with no depth limit.
print(f"Depth of low-depth MI tree: {tree_clf_tng_low_depth_mi.get_depth()} \nDepth of max-depth MI tree: {tree_clf_tng_max_depth_mi.get_depth()}")

Depth of low-depth MI tree: 100 
Depth of max-depth MI tree: 337


#### Classification performance

###### Low Depth Decision Tree

In [None]:
# Performance of Low Depth Decision Tree in HW3A-PB1

Accuracy: 0.4565
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.31      0.33        97
           1       0.34      0.38      0.35       104
           2       0.49      0.50      0.50       115
           3       0.34      0.39      0.36       123
           4       0.46      0.32      0.38       126
           5       0.50      0.54      0.52       106
           6       0.52      0.50      0.51       109
           7       0.25      0.60      0.35       139
           8       0.58      0.50      0.54       122
           9       0.44      0.45      0.45       102
          10       0.62      0.52      0.57       108
          11       0.69      0.62      0.66       125
          12       0.40      0.32      0.36       114
          13       0.61      0.55      0.58       119
          14       0.62      0.46      0.53       127
          15       0.54      0.50      0.52       122
          16       0.48      0.40      0.

In [None]:
# Score the depth-capped chi2 tree on the held-out split.
# Compute everything first, then print in the usual order.
tng_test_chi2 = k_chi2_best.transform(tng_test_data)
tng_chi2_pred = tree_clf_tng_low_depth_chi2.predict(tng_test_chi2)
accuracy = accuracy_score(tng_test_lbl, tng_chi2_pred)
class_report = classification_report(tng_test_lbl, tng_chi2_pred)

print('Performance after chi2')
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(class_report)

Performance after chi2
Accuracy: 0.4141
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.25      0.32        97
           1       0.43      0.28      0.34       104
           2       0.57      0.35      0.43       115
           3       0.49      0.35      0.41       123
           4       0.54      0.25      0.34       126
           5       0.63      0.57      0.60       106
           6       0.52      0.45      0.48       109
           7       0.72      0.42      0.53       139
           8       0.74      0.47      0.57       122
           9       0.65      0.29      0.41       102
          10       0.80      0.51      0.62       108
          11       0.83      0.68      0.75       125
          12       0.32      0.11      0.17       114
          13       0.86      0.43      0.57       119
          14       0.75      0.46      0.57       127
          15       0.57      0.41      0.48       122
          16      

In [None]:
# Score the depth-capped mutual-information tree on the held-out split.
# Compute everything first, then print in the usual order.
tng_test_mutual_info = k_mi_best.transform(tng_test_data)
tng_mi_pred = tree_clf_tng_low_depth_mi.predict(tng_test_mutual_info)
accuracy = accuracy_score(tng_test_lbl, tng_mi_pred)
class_report = classification_report(tng_test_lbl, tng_mi_pred)

print('Performance after Mutual Information: ')
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(class_report)

Performance after Mutual Information: 
Accuracy: 0.1732
Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.08      0.10        97
           1       0.10      0.12      0.11       104
           2       0.57      0.47      0.51       115
           3       0.16      0.16      0.16       123
           4       0.13      0.10      0.11       126
           5       0.20      0.24      0.22       106
           6       0.25      0.25      0.25       109
           7       0.13      0.07      0.09       139
           8       0.04      0.03      0.04       122
           9       0.11      0.48      0.18       102
          10       0.12      0.11      0.11       108
          11       0.30      0.22      0.26       125
          12       0.13      0.13      0.13       114
          13       0.12      0.10      0.11       119
          14       0.13      0.09      0.11       127
          15       0.32      0.30      0.31       122
  

###### Max Depth Decision Tree

In [None]:
# Performance of High Depth Decision Tree in HW3A-PB1

Accuracy: 0.4724
Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.40      0.40        97
           1       0.39      0.38      0.39       104
           2       0.44      0.52      0.48       115
           3       0.39      0.41      0.40       123
           4       0.47      0.30      0.37       126
           5       0.48      0.54      0.51       106
           6       0.50      0.53      0.52       109
           7       0.31      0.60      0.41       139
           8       0.57      0.52      0.55       122
           9       0.49      0.50      0.49       102
          10       0.59      0.51      0.54       108
          11       0.67      0.65      0.66       125
          12       0.38      0.33      0.36       114
          13       0.51      0.57      0.54       119
          14       0.60      0.45      0.51       127
          15       0.56      0.51      0.53       122
          16       0.46      0.44      0.

In [None]:
# Score the unlimited-depth chi2 tree on the held-out split.
# Compute everything first, then print in the usual order.
tng_test_chi2 = k_chi2_best.transform(tng_test_data)
tng_chi2_pred = tree_clf_tng_max_depth_chi2.predict(tng_test_chi2)
accuracy = accuracy_score(tng_test_lbl, tng_chi2_pred)
class_report = classification_report(tng_test_lbl, tng_chi2_pred)

print('Performance after chi2')
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(class_report)

Performance after chi2
Accuracy: 0.4295
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.29      0.34        97
           1       0.40      0.32      0.35       104
           2       0.53      0.36      0.42       115
           3       0.46      0.37      0.41       123
           4       0.57      0.27      0.37       126
           5       0.56      0.59      0.58       106
           6       0.50      0.49      0.49       109
           7       0.68      0.45      0.54       139
           8       0.69      0.47      0.56       122
           9       0.58      0.37      0.46       102
          10       0.75      0.51      0.61       108
          11       0.81      0.68      0.74       125
          12       0.26      0.13      0.18       114
          13       0.81      0.44      0.57       119
          14       0.69      0.48      0.57       127
          15       0.55      0.43      0.49       122
          16      

In [None]:
# Score the unlimited-depth mutual-information tree on the held-out split.
# Compute everything first, then print in the usual order.
tng_test_mutual_info = k_mi_best.transform(tng_test_data)
tng_mi_pred = tree_clf_tng_max_depth_mi.predict(tng_test_mutual_info)
accuracy = accuracy_score(tng_test_lbl, tng_mi_pred)
class_report = classification_report(tng_test_lbl, tng_mi_pred)

print('Performance after Mutual Information: ')
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(class_report)

Performance after Mutual Information: 
Accuracy: 0.1710
Classification Report:
              precision    recall  f1-score   support

           0       0.09      0.08      0.09        97
           1       0.16      0.18      0.17       104
           2       0.54      0.45      0.49       115
           3       0.16      0.17      0.16       123
           4       0.12      0.11      0.11       126
           5       0.22      0.28      0.25       106
           6       0.23      0.25      0.24       109
           7       0.11      0.18      0.13       139
           8       0.04      0.03      0.04       122
           9       0.15      0.18      0.16       102
          10       0.11      0.11      0.11       108
          11       0.27      0.22      0.24       125
          12       0.11      0.11      0.11       114
          13       0.12      0.13      0.13       119
          14       0.12      0.09      0.11       127
          15       0.27      0.26      0.26       122
  