In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
import warnings

warnings.filterwarnings('ignore')

## Data

In [2]:
# Fetch the vectorized 20 newsgroups data (subset "all") directly as a
# feature matrix X and a label vector y.
X, y = fetch_20newsgroups_vectorized(
    return_X_y=True,
    subset="all",
)

In [3]:
# Keep only the leading n_samples rows so the fits below run quickly;
# lower n_samples further for an even faster run.
n_samples = 5000
X, y = X[:n_samples], y[:n_samples]

In [4]:
# Report the dimensions of the (sub-sampled) feature matrix and labels.
for label, arr in (('Shape of X', X), ('Shape of y', y)):
    print(label, arr.shape)

Shape of X (5000, 130107)
Shape of y (5000,)


In [5]:
# The models below are fitted with the SAGA solver.
# SAGA supports both l1 and l2 regularization, and both the multinomial
# and one-vs-rest (OvR) formulations.
solver = "saga"

In [6]:
# Hold out 10% of the samples for evaluation. The split is stratified on
# the labels and seeded so that it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [7]:
# Summarize the training split: row count, feature count, and the number
# of distinct labels found in the full (pre-split) label vector y.
train_samples, n_features = X_train.shape
n_classes = len(np.unique(y))
for label, value in (
    ('Train samples :', train_samples),
    ('Train features :', n_features),
    ('Classes :', n_classes),
):
    print(label, value)

Train samples : 4500
Train features : 130107
Classes : 20


## Logistic Regression -- One vs Rest -- SAGA solver with L1 penalty

In [18]:
# Fit a one-vs-rest logistic regression with an l1 penalty for a handful of
# very small iteration budgets and report the held-out accuracy of each fit.
print('Model Logistic regression - One vs Rest - Solver: Saga ')

for i in [1, 2, 3, 5]:
    # SAGA shuffles the training data, so random_state is pinned to make each
    # reported accuracy reproducible across runs. `solver` is the
    # notebook-level setting defined in the configuration cell.
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases — confirm against the installed version.
    lr_ovr = LogisticRegression(
        solver=solver,
        multi_class='ovr',
        penalty='l1',
        max_iter=i,
        random_state=42,
    ).fit(X_train, y_train)

    y_pred = lr_ovr.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f'max_iter:{i} Accuracy score {acc}')

Model Logistic regression - One vs Rest - Solver: Saga 
max_iter:1 Accuracy score 0.558
max_iter:2 Accuracy score 0.576
max_iter:3 Accuracy score 0.634
max_iter:5 Accuracy score 0.632


## Logistic Regression -- multinomial (multiclass) model -- SAGA solver with L1 penalty

In [21]:
# Fit a multinomial logistic regression with an l1 penalty for a handful of
# very small iteration budgets and report the held-out accuracy of each fit.
print('Model Logistic regression - multinomial (multiclass) model - Solver: Saga ')

for i in [1, 2, 3, 5]:
    # The model gets its own name so it no longer overwrites the one-vs-rest
    # model (`lr_ovr`) fitted in the previous section.
    # SAGA shuffles the training data, so random_state is pinned to make each
    # reported accuracy reproducible across runs. `solver` is the
    # notebook-level setting defined in the configuration cell.
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases — confirm against the installed version.
    lr_multinomial = LogisticRegression(
        solver=solver,
        multi_class='multinomial',
        penalty='l1',
        max_iter=i,
        random_state=42,
    ).fit(X_train, y_train)

    y_pred = lr_multinomial.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f'max_iter:{i} Accuracy score {acc}')

Model Logistic regression - multinomial (multiclass) model - Solver: Saga 
max_iter:1 Accuracy score 0.446
max_iter:2 Accuracy score 0.582
max_iter:3 Accuracy score 0.59
max_iter:5 Accuracy score 0.634
