In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
import time
import pandas as pd
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('data/CleanedData.csv')

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=5000)

In [4]:
x= cv.fit_transform(df['transformed text']).toarray()
y= df['subject'].values

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [7]:
model_params= {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 10],
            'penalty': ['l1', 'l2'],
             'solver': ['liblinear', 'saga'],      
                  }
    }
}

In [8]:
scores= []
start = time.time()
for model_name, mp in model_params.items():
    clf= GridSearchCV(mp['model'], mp['params'], cv=2, verbose = 4, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
best_model = clf.best_estimator_
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
lass_report = classification_report(y_test, y_pred)
    
print(f"{clf_name} - Test Accuracy:", accuracy)
print(f"{clf_name} - Confusion Matrix:\n", conf_matrix)
print(f"{clf_name} - Classification Report:\n", class_report)
print("\n")

end = time.time()
print("The time of execution of above program is :",
      (end-start) * 10**3, "ms")

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV 1/2] END C=0.001, penalty=l1, solver=liblinear;, score=0.648 total time=   1.5s
[CV 2/2] END C=0.001, penalty=l1, solver=liblinear;, score=0.660 total time=   1.5s




[CV 1/2] END ..C=0.001, penalty=l1, solver=saga;, score=0.655 total time= 3.6min




[CV 2/2] END ..C=0.001, penalty=l1, solver=saga;, score=0.646 total time= 3.6min
[CV 1/2] END C=0.001, penalty=l2, solver=liblinear;, score=0.752 total time=   2.4s
[CV 2/2] END C=0.001, penalty=l2, solver=liblinear;, score=0.762 total time=   2.5s




[CV 1/2] END ..C=0.001, penalty=l2, solver=saga;, score=0.761 total time= 2.7min




[CV 2/2] END ..C=0.001, penalty=l2, solver=saga;, score=0.767 total time= 2.6min
[CV 1/2] END C=0.01, penalty=l1, solver=liblinear;, score=0.739 total time=   1.4s
[CV 2/2] END C=0.01, penalty=l1, solver=liblinear;, score=0.747 total time=   1.5s




[CV 1/2] END ...C=0.01, penalty=l1, solver=saga;, score=0.733 total time= 3.6min




[CV 2/2] END ...C=0.01, penalty=l1, solver=saga;, score=0.747 total time= 3.6min
[CV 1/2] END C=0.01, penalty=l2, solver=liblinear;, score=0.765 total time=   4.6s
[CV 2/2] END C=0.01, penalty=l2, solver=liblinear;, score=0.774 total time=   4.3s




[CV 1/2] END ...C=0.01, penalty=l2, solver=saga;, score=0.765 total time= 2.6min




[CV 2/2] END ...C=0.01, penalty=l2, solver=saga;, score=0.771 total time= 2.6min
[CV 1/2] END C=0.1, penalty=l1, solver=liblinear;, score=0.770 total time=   1.7s
[CV 2/2] END C=0.1, penalty=l1, solver=liblinear;, score=0.778 total time=   1.7s




[CV 1/2] END ....C=0.1, penalty=l1, solver=saga;, score=0.764 total time= 5.5min




[CV 2/2] END ....C=0.1, penalty=l1, solver=saga;, score=0.775 total time= 5.0min
[CV 1/2] END C=0.1, penalty=l2, solver=liblinear;, score=0.751 total time=  11.3s




[CV 2/2] END C=0.1, penalty=l2, solver=liblinear;, score=0.759 total time=  12.0s




[CV 1/2] END ....C=0.1, penalty=l2, solver=saga;, score=0.765 total time= 2.7min




[CV 2/2] END ....C=0.1, penalty=l2, solver=saga;, score=0.769 total time= 2.8min
[CV 1/2] END .C=1, penalty=l1, solver=liblinear;, score=0.737 total time=   6.6s
[CV 2/2] END .C=1, penalty=l1, solver=liblinear;, score=0.745 total time=   5.4s




[CV 1/2] END ......C=1, penalty=l1, solver=saga;, score=0.765 total time= 9.3min




[CV 2/2] END ......C=1, penalty=l1, solver=saga;, score=0.770 total time= 9.7min




[CV 1/2] END .C=1, penalty=l2, solver=liblinear;, score=0.733 total time=  24.6s




[CV 2/2] END .C=1, penalty=l2, solver=liblinear;, score=0.744 total time=  20.5s




[CV 1/2] END ......C=1, penalty=l2, solver=saga;, score=0.766 total time= 2.7min




[CV 2/2] END ......C=1, penalty=l2, solver=saga;, score=0.768 total time= 2.9min
[CV 1/2] END C=10, penalty=l1, solver=liblinear;, score=0.700 total time=  36.6s
[CV 2/2] END C=10, penalty=l1, solver=liblinear;, score=0.707 total time=  34.2s




[CV 1/2] END .....C=10, penalty=l1, solver=saga;, score=0.765 total time= 8.3min




[CV 2/2] END .....C=10, penalty=l1, solver=saga;, score=0.769 total time= 8.2min




[CV 1/2] END C=10, penalty=l2, solver=liblinear;, score=0.736 total time=  19.9s




[CV 2/2] END C=10, penalty=l2, solver=liblinear;, score=0.745 total time=  18.9s




[CV 1/2] END .....C=10, penalty=l2, solver=saga;, score=0.765 total time= 2.7min




[CV 2/2] END .....C=10, penalty=l2, solver=saga;, score=0.768 total time= 2.7min


NameError: name 'clf_name' is not defined

In [9]:
df= pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,LogisticRegression,0.774,"{'C': 0.1, 'penalty': 'l1', 'solver': 'libline..."


In [13]:
best_model = clf.best_estimator_
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [17]:
print('Accuracy score',accuracy)
print('Confusion Matrix:\n',conf_matrix)
print('Report:\n',class_report)

Accuracy score 0.7895652173913044
Confusion Matrix:
 [[  21   20    0   48  192    6]
 [   5 1717    2   34  116    2]
 [   1    7  139    6   11    1]
 [  12   91    2  154  568    8]
 [  38  116    3  270 2930  118]
 [   0    0    0    0  138 1849]]
Report:
               precision    recall  f1-score   support

           0       0.27      0.07      0.12       287
           1       0.88      0.92      0.90      1876
           2       0.95      0.84      0.89       165
           3       0.30      0.18      0.23       835
           4       0.74      0.84      0.79      3475
           5       0.93      0.93      0.93      1987

    accuracy                           0.79      8625
   macro avg       0.68      0.63      0.64      8625
weighted avg       0.76      0.79      0.77      8625

