In [3]:
# Importing Libraries
import json
import string
import random
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn import svm
from sklearn.svm import SVC

# Loading Dataset
df = pd.read_csv('Dataset_Cognitive_Distortions.tsv', sep='\t', header=0)
#df.shape

# Divide data between training and test data
X = df['Phrase']
y = df['Cognitive Distortion']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .8)


MULTINOMIAL NAIVE BAYES

In [None]:
textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=0)),
    ('mnb', MultinomialNB())
])

# Hyperparameters to tune
params = {'smote__k_neighbors': [2,3,4,5,6,7,8,9,10],
          'mnb__alpha': [0.01, 0.1, 0.3, 0.5, 1.0]
         }

# Hyperprameters tuning
multinomial_nb_grid = GridSearchCV(estimator=textclassifier, param_grid=params, n_jobs=10, cv=10, verbose=5)
multinomial_nb_grid.fit(X_train, y_train)

# Print results with 3 decimals
print('Train Accuracy : %.3f'%multinomial_nb_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%multinomial_nb_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%multinomial_nb_grid.best_score_)
print('Best Parameters : ',multinomial_nb_grid.best_params_)

y_pred = multinomial_nb_grid.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

Fitting 10 folds for each of 45 candidates, totalling 450 fits
Train Accuracy : 0.994
Test Accuracy : 0.689
Best Accuracy Through Grid Search : 0.683
Best Parameters :  {'mnb__alpha': 0.1, 'smote__k_neighbors': 10}
[[ 3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  5  0  1  0  0  0  0  0  0  0  0  0  1  0  0]
 [ 0  0  5  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  2  1  7  0  0  0  0  0  1  0  2  0  0  0  0]
 [ 0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  1  0  1  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  6  0  1  0  2  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  9  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  5  0  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  2  0  0  4  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  1  0  1  1  4  1  1  2  0]
 [ 0  1  0  1  0  1  0  0  0  0  0  0  5  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  5  0  0]
 [ 0  0  0  0  0  0  0  1  1  0  0  0  1  0  3  0]
 [ 0  0  0  0  0  0 

MULTINOMIAL LOGISTIC REGRESSION

In [4]:
textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=0)),
    ('mlg', LogisticRegression( multi_class='multinomial', # Set algorithm to multinomial since this is a multi class problem
                                random_state=0, # Set the seed for the algorithm to make sure there is consistency across multiple runs
                                warm_start = True, # To make the model converge faster
                                l1_ratio = 0.5)) # Set to the elastic net regularization
])

params = {'smote__k_neighbors': [2,3,4,5,6,7,8,9,10],
          'mlg__penalty': ['l1', 'l2', 'elasticnet', 'none' ],
          'mlg__C': [0.01, 0.1, 0.3, 0.5, 1.0,],
          'mlg__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'] # List of compatible solver for multi_class = multinomial
         }

multinomial_lg_grid = GridSearchCV(estimator=textclassifier, param_grid=params, n_jobs=10, cv=10, verbose=5)
multinomial_lg_grid.fit(X_train, y_train)

print('Train Accuracy : %.3f'%multinomial_lg_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%multinomial_lg_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%multinomial_lg_grid.best_score_)
print('Best Parameters : ',multinomial_lg_grid.best_params_)

y_pred = multinomial_lg_grid.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


2700 fits failed out of a total of 7200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
450 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/imblearn/pipeline.py", line 297, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _

Train Accuracy : 0.998
Test Accuracy : 0.647
Best Accuracy Through Grid Search : 0.687
Best Parameters :  {'mlg__C': 0.01, 'mlg__penalty': 'none', 'mlg__solver': 'saga', 'smote__k_neighbors': 2}
[[ 3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  0  1  0  0  2  0  0  0  0  0  0  1  0  0]
 [ 0  0  5  0  0  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  2  1  7  0  0  0  0  0  1  0  2  0  0  0  0]
 [ 0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  1  0  2  0  1  1  0  0  0]
 [ 0  0  0  0  0  0  0  4  0  1  0  4  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  9  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  0  5  0  0  0  0  0  0]
 [ 0  0  0  1  1  0  0  0  0  0  5  0  1  0  1  0]
 [ 0  0  0  0  0  0  1  1  0  1  1  4  1  1  1  0]
 [ 1  0  0  0  0  0  0  0  0  0  1  0  6  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  5  0  0]
 [ 0  0  1  1  0  0  0  1  0  0  0  0  1  0  2  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0

