In [1]:
import re
import pandas as pd
import numpy as np
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import plotly.express as px
from sklearn.preprocessing import LabelEncoder



# Loading data

In [2]:
# Read the feature table and the target table (paths are relative to the notebook's working directory).
X, y = (pd.read_csv(csv_path) for csv_path in ('data/x_train.csv', 'data/y_train.csv'))

In [3]:
# Narrow each table down to the single column used for modelling:
# the text column becomes the features, the sentiment column the target.
X = X.loc[:, 'text_lemmatized']
y = y.loc[:, 'scoreSentiment']

In [4]:
# Index labels of the rows whose text is missing.
na_indices = X.index[X.isna().to_numpy()]

# Remove those labels from both the features and the targets
# (assumes X and y share the same index, as they do when read from matching CSVs).
X = X.drop(index=na_indices)
y = y.drop(index=na_indices)

# Sanity check: both should report the same number of remaining rows.
print(*map(len, (X, y)))

142787 142787


# Creating and trying a first model

In [5]:
class Model:
    """Train and evaluate a two-step text-classification pipeline.

    The constructor chains ``vectorizer`` and ``model_architecture`` into a
    scikit-learn ``Pipeline`` and immediately splits ``X`` / ``y`` into a train
    and a test partition. ``fit`` trains on the train partition; ``predict``,
    ``predict_proba`` and ``report`` always operate on the test partition.
    """

    # Compiled once at class creation rather than on every preprocess() call.
    _TWITTER_HANDLE_RE = re.compile(r'@[\w_]+')
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')

    def __init__(self, X, y, model_architecture, vectorizer, random_seed=42, test_size=0.2, stratify=False) -> None:
        """Build the pipeline and create the train/test split.

        Args:
            X: Input documents passed to the vectorizer.
            y: Target labels, one per element of ``X``.
            model_architecture: Estimator used as the final pipeline step.
            vectorizer: Transformer used as the first pipeline step.
            random_seed: ``random_state`` handed to ``train_test_split``.
            test_size: ``test_size`` handed to ``train_test_split``.
            stratify: When True, ``y`` is passed as the ``stratify`` argument of
                ``train_test_split`` so both partitions keep the class
                proportions of ``y``. Defaults to False (plain random split).
        """
        self.X = X
        self.y = y
        self.model_instance = model_architecture
        self.vectorizer = vectorizer
        self.random_seed = random_seed
        self.test_size = test_size

        self.pipeline = Pipeline([
            ("Vectorizer", self.vectorizer),
            ("Model_Architecture", self.model_instance),
        ])

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.test_size,
            random_state=self.random_seed,
            stratify=self.y if stratify else None,
        )

    def _text_tools(self):
        """Return the (lemmatizer, stop-word set) pair, creating it on first use."""
        tools = getattr(self, "_cached_text_tools", None)
        if tools is None:
            # Loading the stop-word list reads from the NLTK corpus, so do it once
            # per instance instead of once per document.
            tools = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
            self._cached_text_tools = tools
        return tools

    def preprocess(self, text):
        """Clean a single raw document and return it as a space-joined string.

        Steps: strip ``@handle`` mentions and URLs, tokenize, keep only
        alphanumeric tokens (lower-cased), drop English stop words, lemmatize.

        Note: nothing in this class calls ``preprocess``; the pipeline is fitted
        on ``X`` exactly as given, so apply this yourself beforehand if needed.
        """
        lemmatizer, stop_words = self._text_tools()

        no_handle = self._TWITTER_HANDLE_RE.sub('', text)
        cleaned_text = self._URL_RE.sub('', no_handle)

        lowered = (token.lower() for token in word_tokenize(cleaned_text) if token.isalnum())
        return ' '.join(lemmatizer.lemmatize(word) for word in lowered if word not in stop_words)

    def fit(self):
        """Fit the pipeline on the train partition."""
        self.pipeline.fit(self.X_train, self.y_train)

    def predict(self):
        """Return the pipeline's predictions for the test partition."""
        return self.pipeline.predict(self.X_test)

    def predict_proba(self):
        """Return the pipeline's class-probability estimates for the test partition."""
        return self.pipeline.predict_proba(self.X_test)

    def report(self, class_labels):
        """Print a classification report and show the confusion matrix as a heatmap.

        Args:
            class_labels: Display names for the classes, used as ``target_names``
                of the report and as the axis ticks of the heatmap.
        """
        # Predict once and reuse: each predict() call re-vectorizes and re-scores
        # the whole test partition.
        predictions = self.predict()

        print(classification_report(self.y_test, predictions, target_names=class_labels))

        matrix = confusion_matrix(self.y_test, predictions)
        fig = px.imshow(
            matrix,
            color_continuous_scale='Blues',
            x=class_labels,
            y=class_labels,
            labels={'x': "predicted", 'y': "true label"},
        )
        fig.show()
        
        


In [10]:
# Baseline: TF-IDF features fed into a logistic regression, evaluated on a 20% hold-out split.
model2 = Model(X, y, LogisticRegression(), TfidfVectorizer(), random_seed=42, test_size=0.2)
model2.fit()

# Integer-encode the sentiment labels; the encoder keeps the sorted unique labels in `classes_`.
label_encoder = LabelEncoder()
df = pd.DataFrame()
df['Sentiment_enc'] = label_encoder.fit_transform(y)

# Use every class the encoder found instead of hard-coding two of them,
# so the report keeps working if the target gains or loses a class.
class_labels = label_encoder.classes_
model2.report(class_labels)

              precision    recall  f1-score   support

    NEGATIVE       0.77      0.59      0.67      9561
    POSITIVE       0.82      0.91      0.86     18997

    accuracy                           0.80     28558
   macro avg       0.79      0.75      0.76     28558
weighted avg       0.80      0.80      0.80     28558



# Conclusion

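Here we can see that the test set contains about twice as many positive samples as negative ones (18,997 vs. 9,561), and that recall is much higher for the POSITIVE class (0.91) than for the NEGATIVE class (0.59). Maybe our model is overfitting, or it is simply biased toward the majority class. We need to compare it with other models and then tune the hyperparameters.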