## PROBLEM 4: L1 feature selection on text
Run a strong L1-regularized regression (library) on 20NG, and select 200 features (words) based on the absolute value of the regression coefficients. Then reconstruct the dataset with only these features, and rerun any of the classification tasks.

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import wordnet
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import Lasso

#### Preprocessing 20NG

In [None]:
# 20NG dataset: pickled file read from the mounted Google Drive. Per the file name it is
# the training split with headers, footers and quotes removed. The loaded object is
# indexed below with the keys 'data' (raw documents) and 'target' (labels).
file_path = '/content/drive/MyDrive/USML/HW 3A/20NG/train_20NG_wo_header_footer_quotes.pkl'

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(file_path, 'rb') as file:
    data = pickle.load(file)

# Caches shared by every preprocess() call. They are filled lazily, so defining the
# function does not touch the NLTK corpora.
_STOP_WORDS = None      # frozenset of English stop words, loaded on first use
_VALID_WORD_CACHE = {}  # word -> bool: does WordNet return at least one synset?

def _is_valid_word(word):
  """Return True if WordNet has at least one synset for `word` (memoised per word)."""
  known = _VALID_WORD_CACHE.get(word)
  if known is None:
    known = bool(wordnet.synsets(word))
    _VALID_WORD_CACHE[word] = known
  return known

def preprocess(text):
  """Clean one raw document and return it as a space-separated string of kept tokens.

  Steps, in order:
    1. drop every character that is not a letter, digit, space or newline
    2. collapse runs of newlines into a single space
    3. lower-case
    4. tokenise with NLTK's word_tokenize
    5. remove English stop words
    6. remove tokens for which WordNet returns no synset

  The stop-word list is loaded once and WordNet lookups are cached per word,
  instead of being repeated for every document / token occurrence. The returned
  string is the same as without the caches.

  Args:
    text: raw document text (str).

  Returns:
    str: the surviving tokens joined by single spaces ('' if none survive).
  """
  global _STOP_WORDS
  if _STOP_WORDS is None:
    _STOP_WORDS = frozenset(stopwords.words('english'))

  # Punctuation is stripped *before* tokenising, so contractions lose their
  # apostrophe (e.g. "can't" -> "cant") and are then treated as ordinary tokens.
  text = re.sub(r'[^a-zA-Z0-9 \n]', '', text)
  text = re.sub(r'\n+', ' ', text) # removing new line
  text = text.lower() # lower casing
  tokens = word_tokenize(text)
  tokens = [word for word in tokens if word not in _STOP_WORDS] # removing stop words
  filtered_tokens = [word for word in tokens if _is_valid_word(word)] # removing non-english word

  return ' '.join(filtered_tokens)

# Clean every raw document with preprocess() before vectorising
preprocessed_data = [preprocess(doc) for doc in data['data']]

# TF-IDF document-term matrix built from the cleaned documents
vectorizer = TfidfVectorizer()
tng_data = vectorizer.fit_transform(preprocessed_data) # SHAPE: (11314, 32863) in the recorded run

tng_labels = data['target']

# Sparsity = share of matrix cells that are zero
total_elements = tng_data.shape[0] * tng_data.shape[1]
num_nonzero_elements = tng_data.nnz
sparsity_ratio = 1 - (num_nonzero_elements / total_elements)
print(f"Sparsity ratio: {sparsity_ratio:.4f}")
print("Shape: ",tng_data.shape)

Sparsity ratio: 0.9983
Shape:  (11314, 32863)


In [None]:
# Hold out 20% of the documents as a test set; the fixed random_state keeps the split reproducible.
# NOTE(review): the TF-IDF vectorizer was fitted on all documents before this split, so the
# vocabulary and IDF weights also reflect the test documents. Consider fitting it on the
# training split only and using transform() on the test split.
tng_train_data, tng_test_data, tng_train_lbl, tng_test_lbl = train_test_split(tng_data, tng_labels, test_size=0.2, random_state=42)

#### Feature Selection using L1

In [None]:
# Lasso is linear regression with an L1 penalty; a larger alpha drives more coefficients to zero.
# It is fitted on the training split only.
# NOTE(review): alpha=0.0027 looks hand-tuned so that exactly 200 coefficients stay non-zero
# (see the count printed by the next cell); re-tune it if the data or the split changes.
# NOTE(review): the integer class ids are used directly as a numeric regression target,
# which imposes an ordering on the newsgroups; confirm this is the intended setup.
l1_reg = Lasso(alpha=0.0027)
l1_reg.fit(tng_train_data, tng_train_lbl)

In [None]:
# Number of features the L1 penalty kept, i.e. coefficients that are not zero.
# np.count_nonzero replaces the manual counting loop and its throwaway globals.
print(np.count_nonzero(l1_reg.coef_))

200


In [None]:
# Features selected by L1
# Vocabulary terms as an array, so they can be indexed with an array of column indices
feature_names = np.asarray(vectorizer.get_feature_names_out())

# Magnitude of each feature's Lasso coefficient
coef = l1_reg.coef_
coef_abs = np.absolute(coef)

# argsort is ascending, so the last 200 positions hold the largest magnitudes
top_200_indices = coef_abs.argsort()[-200:]

# Words for those columns, ordered by increasing |coefficient|; displayed as the cell output
top_200_features = feature_names[top_200_indices]
top_200_features

array(['product', 'new', 'orbit', 'fine', 'msg', 'mail', 'real', 'server',
       'islamic', 'define', 'hp', 'tiff', 'life', 'rosicrucian', 'offer',
       'area', 'believe', 'application', 'launch', 'tried', 'manager',
       'american', 'cant', 'get', 'research', 'person', 'display',
       'guess', 'also', 'hardware', 'nec', 'tv', 'desktop', 'live',
       'widget', 'fast', 'faster', 'best', 'many', 'colors', 'polygon',
       'world', 'low', 'satan', 'linux', 'bought', 'quran', 'war',
       'never', 'turkey', 'disk', 'political', 'screen', 'work', 'font',
       'board', 'hello', 'may', 'authority', 'truth', 'got', 'case',
       'mode', 'cpu', 'days', 'fonts', 'means', 'stop', 'running',
       'circuit', 'evidence', 'cards', 'food', 'bike', 'deletion', 'city',
       'available', 'years', 'doctor', 'pc', 'police', 'states', 'access',
       'said', 'population', 'try', 'motif', 'president', 'advance',
       'peace', 'shipping', 'federal', 'please', 'drives', 'port', 'man',
    

In [None]:
# Reconstructing the dataset: keep only the columns of the selected features
# (the resulting column order follows top_200_indices).
tng_train_data_selected = tng_train_data[:, top_200_indices]

### Logistic Regression on 20NG

In [None]:
# L2-regularised logistic regression trained on the 200 selected features.
# `multi_class='multinomial'` is no longer passed: the parameter is deprecated since
# scikit-learn 1.5, and with the lbfgs solver and more than two classes the default
# already fits a multinomial model, so the fitted classifier is unchanged.
log_reg = LogisticRegression(penalty='l2', max_iter=1000, solver='lbfgs')
log_reg.fit(tng_train_data_selected, tng_train_lbl)

#### Classification performance

In [None]:
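# Baseline for comparison: performance of Logistic Regression in HW3A-PB1
# (the output below is pasted from that problem; it is not recomputed in this notebook)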


Accuracy: 0.7163
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.67      0.68        97
           1       0.65      0.69      0.67       104
           2       0.68      0.65      0.67       115
           3       0.65      0.65      0.65       123
           4       0.79      0.55      0.65       126
           5       0.72      0.82      0.77       106
           6       0.69      0.72      0.71       109
           7       0.75      0.73      0.74       139
           8       0.50      0.80      0.62       122
           9       0.71      0.78      0.74       102
          10       0.88      0.79      0.83       108
          11       0.92      0.83      0.87       125
          12       0.62      0.65      0.64       114
          13       0.76      0.78      0.77       119
          14       0.76      0.83      0.79       127
          15       0.72      0.77      0.75       122
          16       0.72      0.77      0

In [None]:
# Apply the same column selection to the held-out documents and predict their labels
tng_test_l1_200 = tng_test_data[:, top_200_indices]
tng_l1_pred = log_reg.predict(tng_test_l1_200)

# Score the predictions against the true test labels
accuracy = accuracy_score(tng_test_lbl, tng_l1_pred)
class_report = classification_report(tng_test_lbl, tng_l1_pred)

# Report overall accuracy followed by the per-class breakdown
print('Performance after L1 feature selection')
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(class_report)

Performance after L1 feature selection
Accuracy: 0.4658
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.24      0.36        97
           1       0.52      0.52      0.52       104
           2       0.63      0.61      0.62       115
           3       0.52      0.54      0.53       123
           4       0.67      0.38      0.48       126
           5       0.62      0.71      0.66       106
           6       0.49      0.57      0.53       109
           7       0.75      0.41      0.53       139
           8       0.79      0.30      0.44       122
           9       0.14      0.62      0.23       102
          10       0.20      0.34      0.25       108
          11       0.47      0.55      0.51       125
          12       0.31      0.25      0.27       114
          13       0.53      0.50      0.51       119
          14       0.82      0.50      0.62       127
          15       0.58      0.66      0.62       122
  