In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Loading the Data

In [2]:
# Number of leading rows used for the model comparison below. Only a subset of
# the loaded rows is used; raise this value to train on more data.
N_SAMPLES = 10000

X = pd.read_csv('data/x_train.csv')
y = pd.read_csv('data/y_train.csv')
X = X['text_lemmatized']
y = y['scoreSentiment']

# Rows whose text is missing are removed from both X and y, using the same
# index labels so the two series stay aligned.
na_indices = X[X.isna()].index
X = X.drop(na_indices)
y = y.drop(na_indices)

# Encode the string labels as integers. `y` keeps the original strings and
# `Y` holds the numeric codes used for training.
replacement_dict = {"POSITIVE": 1, "NEGATIVE": 0}
Y = y.map(replacement_dict)

# Print the lengths of X and Y after dropping rows
print(len(X), len(Y))

X_subset = X[:N_SAMPLES]
Y_subset = Y[:N_SAMPLES]

# Series.map yields NaN for any label that is not a key of replacement_dict.
# Stop here with a clear message instead of failing later inside an estimator.
unmapped = Y_subset.isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} label(s) in the first {N_SAMPLES} rows are not in "
        f"{list(replacement_dict)}: {y[:N_SAMPLES][unmapped].unique().tolist()}"
    )

X_train, X_test, y_train, y_test = train_test_split(X_subset, Y_subset, test_size=0.2, random_state=42)

142787 142787


# Choosing the best model

We are going to run our data through different models, each combined with two vectorizers, and see which combination has the best accuracy; the F1 score of the winning combination is then reported.

In [3]:
from itertools import product
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import plotly.express as px
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

def generate_report(y_test, predict, class_labels=("NEGATIVE", "POSITIVE")):
    """Print a classification report and show the confusion matrix as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation set.
    predict : array-like
        Labels predicted for the same samples, in the same order.
    class_labels : sequence of str, optional
        Display names of the classes, listed in the sorted order of the label
        values found in ``y_test`` / ``predict``. The default matches the
        encoding NEGATIVE -> 0, POSITIVE -> 1.

    Returns
    -------
    None
        The report is printed and the figure is displayed.
    """
    # The display names are an explicit argument rather than being re-derived
    # from a notebook-global series, so the function only depends on its inputs.
    class_labels = list(class_labels)

    print(classification_report(y_test, predict, target_names=class_labels))

    confusion_matrix_data = confusion_matrix(y_test, predict)
    fig = px.imshow(
        confusion_matrix_data,
        color_continuous_scale='Blues',
        x=class_labels,
        y=class_labels,
        labels={'x': "predicted", 'y': "true label"}
    )
    fig.show()

# Seed shared by every estimator that accepts a random_state.
RANDOM_STATE = 42

# Candidate classifiers as (name, unfitted estimator) pairs. The order is
# significant: candidates are evaluated in this order.
models = [
    ('SVC', SVC(random_state=RANDOM_STATE)),
    ('KNN', KNeighborsClassifier()),
    ('XGBoost', XGBClassifier(random_state=RANDOM_STATE)),
    ('LogisticRegression', LogisticRegression()),
    ('RandomForest', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('NaiveBayes', MultinomialNB()),
    ('GradientBoosting', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('DecisionTree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('SVM_Poly', SVC(kernel='poly', random_state=RANDOM_STATE)),
    ('KNN_Weighted', KNeighborsClassifier(weights='distance')),
]

# Text vectorizers as (name, unfitted vectorizer) pairs.
vectorizers = [
    ('BoW', CountVectorizer()),
    ('TF-IDF', TfidfVectorizer()),
]

# Track the best (model, vectorizer) combination by accuracy on the test split.
best_model = None
best_score = float('-inf')
best_vectorizer = None
best_pipeline = None

for model, vectorizer in product(models, vectorizers):
    model_name, model_instance = model
    vectorizer_name, vectorizer_instance = vectorizer

    # Clone the prototypes so each pipeline owns its estimators. Without this,
    # the same model object is shared by its BoW and TF-IDF pipelines, and the
    # pipeline stored as "best" would later hold a model refitted on the other
    # vectorizer's features.
    pipeline = Pipeline([
        ('Vectorize', clone(vectorizer_instance)),
        ('Model', clone(model_instance))
    ])

    pipeline.fit(X_train, y_train)

    # Pipeline.score delegates to the final estimator's score method.
    score = pipeline.score(X_test, y_test)

    # Strict comparison: on a tie, the earlier combination is kept.
    if score > best_score:
        best_score = score
        best_model = model_name
        best_vectorizer = vectorizer_name
        best_pipeline = pipeline

print(f'Best model: {best_model}')
print(f'Best vectorizer: {best_vectorizer}')

# best_pipeline is already fitted on X_train with its own estimators, so no
# refit is needed. Predict once and reuse the result for every report.
best_predictions = best_pipeline.predict(X_test)

print(f'accuracy_score: {accuracy_score(y_test, best_predictions)}\n')
print(f'Classification Report:\n{classification_report(y_test, best_predictions)}')

generate_report(y_test, best_predictions)



Best model: NaiveBayes
Best vectorizer: BoW
accuracy_score: 0.7755

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.51      0.60       668
           1       0.79      0.91      0.84      1332

    accuracy                           0.78      2000
   macro avg       0.76      0.71      0.72      2000
weighted avg       0.77      0.78      0.76      2000

              precision    recall  f1-score   support

    NEGATIVE       0.74      0.51      0.60       668
    POSITIVE       0.79      0.91      0.84      1332

    accuracy                           0.78      2000
   macro avg       0.76      0.71      0.72      2000
weighted avg       0.77      0.78      0.76      2000



# Finding best hyper parameters

In [15]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for MultinomialNB.
# NOTE(review): per the scikit-learn docs, priors are not adjusted to the data
# when class_prior is given, so fit_prior presumably has no effect for the two
# explicit class_prior entries and those candidates are duplicates. Confirm
# before trimming the grid.
param_grid = {
    'alpha': [0.1, 0.5,0.8, 1.0, 1.2, 1.5],
    'fit_prior': [True, False],
    'class_prior': [None, [0.3, 0.7], [0.5, 0.5]],
}

# Look up the winning estimator and vectorizer by the names recorded during
# model selection.
index_of_model = [name for name, _ in models].index(best_model)
model_good = models[index_of_model][1]

index_of_vect = [name for name, _ in vectorizers].index(best_vectorizer)
vectorizer_good = vectorizers[index_of_vect][1]

# The grid above only contains MultinomialNB parameters. Fail with a clear
# message if another model won, instead of an "invalid parameter" error from
# inside GridSearchCV.
if not isinstance(model_good, MultinomialNB):
    raise ValueError(
        f"param_grid is defined for MultinomialNB, but the selected model is "
        f"{best_model!r} ({type(model_good).__name__}); provide a matching grid."
    )

# NOTE(review): the vectorizer is fitted on all of X_train before the
# cross-validation split, so every validation fold has already contributed to
# the fitted vocabulary. Searching over a Pipeline(vectorizer, model) would
# avoid this, at the cost of changing the keys of best_params_.
X_train_vectorized = vectorizer_good.fit_transform(X_train)

grid_search = GridSearchCV(estimator=model_good, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train_vectorized, y_train)

best_params_GRCV = grid_search.best_params_
best_model_GRCV = grid_search.best_estimator_
best_score_GRCV = grid_search.best_score_

print(f'Best params: {best_params_GRCV} \nBest score: {best_score_GRCV}')


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best params: {'alpha': 1.0, 'class_prior': None, 'fit_prior': True} 
Best score: 0.7715000000000001


Here our parameter tuning does not improve our accuracy. We will try to train the model on the full dataset and not only on the 10000-row subset (`X[:10000]`) used above.