In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
import time
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
import numpy as np

In [2]:
df = pd.read_csv('../Data/CleanedData.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer(max_features=5000)

In [4]:
X= tfidf.fit_transform(df['transformed text'])
y= df['subject'].values

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y, test_size=0.2, random_state=42)

In [6]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [7]:
def objective(params):
    # Define hyperparameters
    penalty = params['penalty']
    C = params['C']
    solver = params['solver']
    max_iter = params['max_iter']
    
    # Exclude 'l1' penalty if the solver is 'lbfgs'
    if solver == 'lbfgs' and penalty == 'l1':
        return {'loss': 0, 'status': STATUS_OK}
    
    # Create a Logistic Regression classifier with the specified hyperparameters
    clf = LogisticRegression(penalty=penalty, C=C, solver=solver, max_iter=max_iter,  random_state=42)
    
    # Train the classifier
    clf.fit(X_resampled,y_resampled)
    
    # Make predictions on the test set
    y_pred = clf.predict(X_test)
    
    # Calculate F1 score
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return {'loss': -f1, 'status': STATUS_OK}

In [8]:
space = {
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'C': hp.loguniform('C', np.log(1e-3), np.log(1e3)),
    'max_iter': hp.choice('max_iter', [100, 200, 300, 400, 500]),  # Higher max_iter values
    'solver': hp.choice('solver', ['liblinear', 'lbfgs', 'saga']),
}

In [9]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,  # Number of optimization iterations
            trials=trials,
            rstate=np.random.default_rng(42))

# Get the best hyperparameters and the corresponding loss
best_hyperparams = space_eval(space, best)
best_loss = -trials.best_trial['result']['loss']

# Print the best hyperparameters and corresponding loss
print("Best Hyperparameters:")
print(best_hyperparams)
print("Best Weighted F1 Score (Loss):", best_loss)

  4%|▏     | 2/50 [18:45<8:40:21, 650.45s/trial, best loss: -0.7343512536103644]




 52%|██▌  | 26/50 [47:30<1:57:40, 294.21s/trial, best loss: -0.7614495774663261]




 62%|███  | 31/50 [1:00:29<50:13, 158.58s/trial, best loss: -0.7649167943176249]




 64%|█▉ | 32/50 [1:25:44<2:48:55, 563.10s/trial, best loss: -0.7649167943176249]




100%|█████| 50/50 [1:32:13<00:00, 110.68s/trial, best loss: -0.7649167943176249]
Best Hyperparameters:
{'C': 0.6857776654818881, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
Best Weighted F1 Score (Loss): 0.7649167943176249


In [11]:
lr = LogisticRegression(C= 0.6857776654818881,penalty = 'l1',solver = 'saga')
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
acc = accuracy_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred, average ='weighted')
print('Accuracy: ', acc)
print('F1 Score: ', f1)

Accuracy:  0.7881739130434783
F1 Score:  0.7660676025880255




In [None]:
from sklearn.svm import SVC
sv = SVC()
sv.fit(X_resampled,y_resampled)
y_pred = sv.predict(X_test)
acc = accuracy_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred, average ='weighted')
print('Accuracy: ', acc)
print('F1 Score: ', f1)

(84288, 5000)