In [1]:
!pip install spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
import joblib
from sklearn.metrics import accuracy_score



In [2]:
column_names = ["id", "topic", "sentiment", "text"]
df = pd.read_csv("/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv", names=column_names)
df.head()

Unnamed: 0,id,topic,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
df.shape

(74682, 4)

In [4]:
df = df[["sentiment", "text"]]
df.head()

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
df.sentiment.value_counts()

sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [6]:
# We are not interested in "Irrelevant" sentiment. So we're going to drop thease rows
df = df[df.sentiment != "Irrelevant"]
df.sentiment.value_counts()

sentiment
Negative    22542
Positive    20832
Neutral     18318
Name: count, dtype: int64

In [7]:
df.isna().sum()

sentiment      0
text         571
dtype: int64

In [8]:
df = df.copy()
df.dropna(inplace=True)
df.isna().sum()

sentiment    0
text         0
dtype: int64

In [9]:
sentiment_numbers = {"Neutral": 0, "Negative": 1, "Positive": 2}

df["sentiment_num"] = df.sentiment.map(sentiment_numbers)
df.head()

Unnamed: 0,sentiment,text,sentiment_num
0,Positive,im getting on borderlands and i will murder yo...,2
1,Positive,I am coming to the borders and I will kill you...,2
2,Positive,im getting on borderlands and i will kill you ...,2
3,Positive,im coming on borderlands and i will murder you...,2
4,Positive,im getting on borderlands 2 and i will murder ...,2


In [10]:
!python -m spacy download en_core_web_md
# remove stop words 
nlp = spacy.load("en_core_web_md")

def preprocess(text):
  doc = nlp(text)
  tokens = []
  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    else:
      tokens.append(token.lemma_)
  return " ".join(tokens)
df["text_preprocessed"] = df.text.apply(preprocess)
df.head()

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Unnamed: 0,sentiment,text,sentiment_num,text_preprocessed
0,Positive,im getting on borderlands and i will murder yo...,2,m get borderland murder
1,Positive,I am coming to the borders and I will kill you...,2,come border kill
2,Positive,im getting on borderlands and i will kill you ...,2,m get borderland kill
3,Positive,im coming on borderlands and i will murder you...,2,m come borderland murder
4,Positive,im getting on borderlands 2 and i will murder ...,2,m get borderland 2 murder


In [11]:
df.sentiment.value_counts()

sentiment
Negative    22358
Positive    20655
Neutral     18108
Name: count, dtype: int64

In [12]:
#splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(df.text, df.sentiment_num, test_size=0.2, random_state=42, stratify=df.sentiment_num)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(48896,) (48896,)
(12225,) (12225,)


In [13]:
column_names = ["id", "platform", "sentiment", "text"]
df_val = pd.read_csv("/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv", names=column_names)
df_val.head()

Unnamed: 0,id,platform,sentiment,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [14]:
df_val.isna().sum()

id           0
platform     0
sentiment    0
text         0
dtype: int64

In [15]:
df_val = df_val[["sentiment", "text"]]
df_val.head()

Unnamed: 0,sentiment,text
0,Irrelevant,I mentioned on Facebook that I was struggling ...
1,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,Negative,@Microsoft Why do I pay for WORD when it funct...
3,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,Neutral,Now the President is slapping Americans in the...


In [16]:
df_val.sentiment.value_counts()

sentiment
Neutral       285
Positive      277
Negative      266
Irrelevant    172
Name: count, dtype: int64

In [17]:
df_val = df_val[df_val.sentiment != "Irrelevant"]
df_val.sentiment.value_counts()

sentiment
Neutral     285
Positive    277
Negative    266
Name: count, dtype: int64

In [18]:
sentiment_numbers = {"Neutral": 0, "Negative": 1, "Positive": 2}

df_val["sentiment_num"] = df_val.sentiment.map(sentiment_numbers)
df_val.head()

Unnamed: 0,sentiment,text,sentiment_num
1,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,0
2,Negative,@Microsoft Why do I pay for WORD when it funct...,1
3,Negative,"CSGO matchmaking is so full of closet hacking,...",1
4,Neutral,Now the President is slapping Americans in the...,0
5,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,1


In [19]:
X_val = df_val["text"]
y_val = df_val["sentiment_num"]
print(X_val.shape)
print(y_val.shape)

(828,)
(828,)


In [20]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming X_train, X_val, y_train, y_val are already defined from train_test_split

# Define pipelines for Random Forest, XGBoost, and KNN with different vectorizers
clf_1 = Pipeline([('vectorizer', CountVectorizer(analyzer='word', ngram_range=(1,1))),
                  ('classifier', RandomForestClassifier())])
clf_2 = Pipeline([('vectorizer', CountVectorizer(analyzer='word', ngram_range=(1,2))),
                  ('classifier', RandomForestClassifier())])
clf_3 = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1))),
                  ('classifier', RandomForestClassifier())])

xgboost_with_CountVectorizer11 = Pipeline([('vectorizer', CountVectorizer(analyzer='word', ngram_range=(1,1))),
                                          ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))])
xgboost_with_CountVectorizer12 = Pipeline([('vectorizer', CountVectorizer(analyzer='word', ngram_range=(1,2))),
                                          ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))])
xgboost_with_TfidfVectorizer = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1))),
                                        ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))])

knn_with_CountVectorizer11 = Pipeline([('vectorizer', CountVectorizer(analyzer='word', ngram_range=(1,1))),
                                      ('classifier', KNeighborsClassifier())])
knn_with_CountVectorizer12 = Pipeline([('vectorizer', CountVectorizer(analyzer='word', ngram_range=(1,2))),
                                      ('classifier', KNeighborsClassifier())])
knn_with_TfidfVectorizer = Pipeline([('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,1))),
                                    ('classifier', KNeighborsClassifier())])

# List of all models
models = [
    clf_1, clf_2, clf_3,
    xgboost_with_CountVectorizer11, xgboost_with_CountVectorizer12, xgboost_with_TfidfVectorizer,
    knn_with_CountVectorizer11, knn_with_CountVectorizer12, knn_with_TfidfVectorizer
]

# Train each model
for model in models:
    model.fit(X_train, y_train)

import os

# Directory to save plots and models
os.makedirs("confusion_matrices", exist_ok=True)
os.makedirs("saved_models", exist_ok=True)

best_score = 0
best_model = None
best_model_name = ""
best_y_pred = None

for i, model in enumerate(models):
    y_pred_val = model.predict(X_val)

    # Get model names
    classifier_name = model.named_steps['classifier'].__class__.__name__
    vectorizer_name = model.named_steps['vectorizer'].__class__.__name__
    
    # Print classification report
    print(f"\nClassification Report for {classifier_name} with {vectorizer_name}:")
    print(classification_report(y_val, y_pred_val))
    
    # Save confusion matrix plot
    cm_val = confusion_matrix(y_val, y_pred_val)
    sns.heatmap(cm_val, annot=True, fmt="d", cmap="Blues")
    plt.title(f"Confusion Matrix: {classifier_name} + {vectorizer_name}")
    
    filename = f"{i+1}_{classifier_name}_{vectorizer_name}.png".replace(" ", "_")
    plt.savefig(os.path.join("confusion_matrices", filename))
    plt.close()
    
    # Compute accuracy or another metric
    acc = accuracy_score(y_val, y_pred_val)

    # Track best model
    if acc > best_score:
        best_score = acc
        best_model = model
        best_model_name = f"{classifier_name}_{vectorizer_name}"
        best_y_pred = y_pred_val



Classification Report for RandomForestClassifier with CountVectorizer:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       285
           1       0.97      0.98      0.97       266
           2       0.99      0.97      0.98       277

    accuracy                           0.97       828
   macro avg       0.97      0.97      0.97       828
weighted avg       0.97      0.97      0.97       828


Classification Report for RandomForestClassifier with CountVectorizer:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       285
           1       0.99      0.98      0.98       266
           2       0.98      0.97      0.98       277

    accuracy                           0.98       828
   macro avg       0.98      0.98      0.98       828
weighted avg       0.98      0.98      0.98       828


Classification Report for RandomForestClassifier with TfidfVectorizer:
              precisio

In [21]:
# Save the best model
model_path = os.path.join("saved_models", f"best_model_{best_model_name}.pkl")
joblib.dump(best_model, model_path)

print(f"\n✅ Best model saved: {model_path} (Accuracy: {best_score:.4f})")


✅ Best model saved: saved_models/best_model_RandomForestClassifier_CountVectorizer.pkl (Accuracy: 0.9771)
