In [22]:
import re
import pandas as pd
import numpy as np
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import plotly.express as px
from sklearn.preprocessing import LabelEncoder



In [23]:
# Load the feature table and the label table from their CSV files.
X, y = (pd.read_csv(csv_path) for csv_path in ('data/x_train.csv', 'data/y_train.csv'))

In [25]:
# Preview the first rows of the feature table (head() shows five rows by default).
X.head()

Unnamed: 0,text_lemmatized
0,time long enough youngster brief attention spa...
1,doesnt matter cost 300 million 300 dollar good...
2,choreography precise lifelike point might wond...
3,film outoftouch attempt humor may find hunt re...
4,clumsy determination endear sometimes wildly e...


In [26]:
# Preview the first rows of the label table (head() shows five rows by default).
y.head()

Unnamed: 0,scoreSentiment
0,POSITIVE
1,NEGATIVE
2,POSITIVE
3,NEGATIVE
4,POSITIVE


In [27]:
# Reduce each table to the single column used for modelling.
# The isinstance guards make this cell safe to re-run: once X / y have been
# rebound to a Series, indexing them by column name again would raise KeyError.
if isinstance(X, pd.DataFrame):
    X = X['text_lemmatized']
if isinstance(y, pd.DataFrame):
    y = y['scoreSentiment']

In [28]:
class Model:
    """Bundle a vectorizer and an estimator into one train/evaluate workflow.

    On construction the data is split into a training and a held-out test
    partition and a two-step scikit-learn ``Pipeline`` (vectorizer followed by
    estimator) is assembled. ``fit`` trains on the training partition, while
    ``predict``, ``predict_proba`` and ``report`` always operate on the
    held-out test partition.
    """

    # Patterns used by ``preprocess``; compiled once when the class is defined
    # instead of being re-parsed on every call.
    _TWITTER_HANDLE_RE = re.compile(r'@[\w_]+')
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')

    # NLTK resources, populated lazily on first use (see ``_nlp_resources``).
    _stop_words = None
    _lemmatizer = None

    def __init__(self, X, y, model_architecture, vectorizer, random_seed=42, test_size=0.2) -> None:
        """Store the inputs, build the pipeline and split the data.

        Args:
            X: Input samples handed to the vectorizer (text documents in this
                notebook).
            y: Target labels, aligned with ``X``.
            model_architecture: Estimator used as the final pipeline step.
            vectorizer: Transformer used as the first pipeline step.
            random_seed: Seed passed to ``train_test_split`` as ``random_state``.
            test_size: Passed to ``train_test_split`` to size the test partition.
        """
        self.X = X
        self.y = y
        self.model_instance = model_architecture
        self.vectorizer = vectorizer
        self.random_seed = random_seed
        self.test_size = test_size

        # Vectorizer first, estimator second: raw samples go in, predictions come out.
        self.pipeline = Pipeline([
            ("Vectorizer", self.vectorizer),
            ("Model_Architecture", self.model_instance),
        ])

        # Hold out part of the data for evaluation; seeded for reproducibility.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.test_size,
            random_state=self.random_seed,
        )

    def _nlp_resources(self):
        """Return ``(stop_words, lemmatizer)``, building each one on first use only."""
        if self._stop_words is None:
            self._stop_words = frozenset(stopwords.words('english'))
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()
        return self._stop_words, self._lemmatizer

    @classmethod
    def _strip_handles_and_urls(cls, text):
        """Remove ``@handle`` mentions first, then URLs, from ``text``."""
        no_handle = cls._TWITTER_HANDLE_RE.sub('', text)
        return cls._URL_RE.sub('', no_handle)

    def preprocess(self, text):
        """Clean one raw text string and return it as space-joined lemmas.

        Steps: strip Twitter handles and URLs, tokenize, keep only alphanumeric
        tokens, lower-case them, drop English stop words, lemmatize.

        Args:
            text: Raw input string.

        Returns:
            The surviving lemmas joined by single spaces.
        """
        stop_words, lemmatizer = self._nlp_resources()
        cleaned_text = self._strip_handles_and_urls(text)
        # The alphanumeric filter runs on the original token, before lower-casing.
        lowered = (token.lower() for token in word_tokenize(cleaned_text) if token.isalnum())
        return ' '.join(lemmatizer.lemmatize(word) for word in lowered if word not in stop_words)

    def fit(self):
        """Fit the pipeline on the training partition."""
        self.pipeline.fit(self.X_train, self.y_train)

    def predict(self):
        """Return the pipeline's predictions for the held-out test partition."""
        return self.pipeline.predict(self.X_test)

    def predict_proba(self):
        """Return the pipeline's ``predict_proba`` output for the test partition."""
        return self.pipeline.predict_proba(self.X_test)

    def report(self, class_labels):
        """Print a classification report and show a confusion-matrix heatmap.

        Args:
            class_labels: Display names for the classes, used as
                ``target_names`` in the report and as tick labels on both
                heatmap axes. Assumed to be in the same order scikit-learn
                uses for the labels (sorted) - verify against the caller.
        """
        # Predict once and reuse the result for both the report and the matrix.
        predictions = self.predict()
        print(classification_report(self.y_test, predictions, target_names=class_labels))

        matrix = confusion_matrix(self.y_test, predictions)
        fig = px.imshow(
            matrix,
            color_continuous_scale='Blues',
            x=class_labels,
            y=class_labels,
            labels={'x': "predicted", 'y': "true label"},
        )
        fig.show()
        
        


In [5]:
# Number of entries in X before any missing rows are removed.
len(X)

10000

In [17]:
# Number of entries in y; it should match len(X).
# NOTE(review): the AttributeError rendered below mentions `reshape`, which
# len() never calls - it looks like stale output from an earlier version of
# this cell. Re-run to refresh.
len(y)


AttributeError: 'DataFrame' object has no attribute 'reshape'

In [29]:
# Index labels of the rows whose text is missing.
missing_mask = X.isna()
na_indices = X[missing_mask].index

# Remove those labels from both the texts and the sentiments so the two stay aligned.
X = X.drop(index=na_indices)
y = y.drop(index=na_indices)

# Sanity check: both lengths should be equal after the drop.
print(f"{len(X)} {len(y)}")


9431 9431


In [20]:
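# Left disabled: the pipeline below is fitted directly on the pandas objects, and
# re-running this conversion fails once X / y are already NumPy arrays (see the error below).
# X= X.to_numpy()
# y= y.to_numpy()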

AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [31]:
# Train a TF-IDF + logistic-regression model on at most the first MAX_SAMPLES rows.
MAX_SAMPLES = 10000
model2 = Model(
    X[:MAX_SAMPLES],
    y[:MAX_SAMPLES],
    LogisticRegression(),
    TfidfVectorizer(),
    random_seed=42,
    test_size=0.2,
)
model2.fit()

# Integer-encode the sentiment labels.
label_encoder = LabelEncoder()
df = pd.DataFrame()
df['Sentiment_enc'] = label_encoder.fit_transform(y)

# classes_ lists the original labels in encoded order, for however many classes
# are present (previously inverse_transform(range(2)) hard-coded two classes).
class_labels = label_encoder.classes_

# Evaluate on the held-out split: classification report plus confusion matrix.
model2.report(class_labels)

              precision    recall  f1-score   support

    NEGATIVE       0.80      0.42      0.55       638
    POSITIVE       0.76      0.95      0.84      1249

    accuracy                           0.77      1887
   macro avg       0.78      0.68      0.70      1887
weighted avg       0.77      0.77      0.75      1887

