In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay


In [33]:
df=pd.read_csv("twitter_training.csv")
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [34]:
df.shape

(74681, 4)

In [35]:
# Renaming columns
df.columns = ["tweet_id", "entity", "sentiment", "content"]

# Check updated dataframe
df.head()


Unnamed: 0,tweet_id,entity,sentiment,content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [36]:
print(df.isnull().sum())

tweet_id       0
entity         0
sentiment      0
content      686
dtype: int64


In [37]:
df=df.dropna()
df = df.reset_index(drop=True)

print(df.isnull().sum())

#Now we need to clean the dataset in order to make our model more efficient, like we have emojis and hashs and links in the content , this adds extra confusion to our model thus we will remove these extra fillers

In [38]:
import re

def clean_text(text):
    text = text.lower()                                    # lowercase
    text = re.sub(r"http\S+|www\S+", "", text)             # remove URLs
    text = re.sub(r"@\w+", "", text)                       # remove mentions
    text = re.sub(r"#\w+", "", text)                       # remove hashtags
    text = re.sub(r"[^a-z\s]", "", text)                   # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()               # remove extra spaces
    return text

df["clean_content"] = df["content"].apply(clean_text)
df.head()


Unnamed: 0,tweet_id,entity,sentiment,content,clean_content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands and i will murder yo...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...


In [39]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["sentiment_label"] = le.fit_transform(df["sentiment"])

print(le.classes_)  # To see which label corresponds to which number
df.head()


['Irrelevant' 'Negative' 'Neutral' 'Positive']


Unnamed: 0,tweet_id,entity,sentiment,content,clean_content,sentiment_label
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...,3
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all,3
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...,3
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands and i will murder yo...,3
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...,3


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 1. Split data
X = df["clean_content"]
y = df["sentiment_label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 2. Feature extraction
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))  # unigrams + bigrams
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 3. Train model (Logistic Regression example)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# 4. Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.66      0.48      0.56      2575
           1       0.71      0.77      0.74      4472
           2       0.63      0.63      0.63      3621
           3       0.68      0.73      0.70      4131

    accuracy                           0.68     14799
   macro avg       0.67      0.65      0.66     14799
weighted avg       0.67      0.68      0.67     14799

Confusion Matrix:
 [[1244  437  390  504]
 [ 197 3450  457  368]
 [ 245  553 2291  532]
 [ 202  412  511 3006]]


In [41]:
pip install nltk


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
import re
import string
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords + wordnet if not already
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove URLs, mentions, hashtags
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"@\w+|#\w+", '', text)
    
    # Remove punctuation & numbers
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return " ".join(tokens)

# Apply to dataset
df['clean_text'] = df.iloc[:, 3].astype(str).apply(clean_text)

df[['clean_text']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ashish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ashish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ashish\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,clean_text
0,coming border kill
1,im getting borderland kill
2,im coming borderland murder
3,im getting borderland murder
4,im getting borderland murder


In [43]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df.iloc[:, 2]   # Assuming "Positive/Negative" column is target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

clf = LogisticRegression(max_iter=1000, C=2, solver='liblinear')
clf.fit(X_train_tfidf, y_train)

y_pred = clf.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




              precision    recall  f1-score   support

  Irrelevant       0.82      0.64      0.72      2575
    Negative       0.80      0.84      0.82      4472
     Neutral       0.72      0.75      0.73      3621
    Positive       0.77      0.81      0.79      4131

    accuracy                           0.77     14799
   macro avg       0.78      0.76      0.76     14799
weighted avg       0.78      0.77      0.77     14799

Confusion Matrix:
 [[1638  283  294  360]
 [  93 3760  370  249]
 [ 142  394 2711  374]
 [ 124  274  396 3337]]


In [58]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

# Initialize SVM
svm_clf = LinearSVC(C=1.0, max_iter=5000)

# Train
svm_clf.fit(X_train_tfidf, y_train)

# Predict
y_pred_svm = svm_clf.predict(X_test_tfidf)

# Evaluate
print(classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


              precision    recall  f1-score   support

  Irrelevant       0.86      0.75      0.80      2575
    Negative       0.87      0.87      0.87      4472
     Neutral       0.83      0.80      0.81      3621
    Positive       0.78      0.87      0.82      4131

    accuracy                           0.83     14799
   macro avg       0.83      0.82      0.82     14799
weighted avg       0.83      0.83      0.83     14799

Confusion Matrix:
 [[1928  159  168  320]
 [  95 3876  188  313]
 [ 113  249 2880  379]
 [ 116  195  237 3583]]


In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Split into train/validation
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42
)

# Try a few candidate C values
for C in [0.01, 0.1, 1, 10]:
    model = LogisticRegression(C=C, max_iter=1000, solver="liblinear")
    model.fit(X_subtrain, y_subtrain)
    preds = model.predict(X_val)
    print(f"C={C}, F1={f1_score(y_val, preds, average='weighted'):.3f}")





C=0.01, F1=0.413




C=0.1, F1=0.568




C=1, F1=0.735




C=10, F1=0.821


In [60]:
# Train final model with best C
final_model = LogisticRegression(C=10, max_iter=1000, solver="liblinear")
final_model.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = final_model.predict(X_test_tfidf)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))





              precision    recall  f1-score   support

  Irrelevant       0.85      0.74      0.79      2575
    Negative       0.86      0.86      0.86      4472
     Neutral       0.82      0.79      0.80      3621
    Positive       0.78      0.87      0.82      4131

    accuracy                           0.82     14799
   macro avg       0.83      0.81      0.82     14799
weighted avg       0.83      0.82      0.82     14799

[[1909  165  175  326]
 [  98 3857  198  319]
 [ 116  271 2847  387]
 [ 115  195  241 3580]]


In [61]:
svc_model = LinearSVC(C=1, class_weight={'Positive': 2})
svc_model.fit(X_train_tfidf, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,{'Positive': 2}
,verbose,0


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Try bigrams (1,2) instead of just single words
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=50000)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [51]:
from sklearn.svm import LinearSVC

# Linear Support Vector Classifier
svc_model = LinearSVC(C=1, max_iter=2000)
svc_model.fit(X_train_tfidf, y_train)

# Predictions
y_pred_svc = svc_model.predict(X_test_tfidf)


In [52]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred_svc))
print(confusion_matrix(y_test, y_pred_svc))


              precision    recall  f1-score   support

  Irrelevant       0.91      0.82      0.86      2575
    Negative       0.90      0.90      0.90      4472
     Neutral       0.89      0.86      0.87      3621
    Positive       0.83      0.91      0.87      4131

    accuracy                           0.88     14799
   macro avg       0.88      0.87      0.88     14799
weighted avg       0.88      0.88      0.88     14799

[[2122  114  101  238]
 [  58 4032  128  254]
 [  77  164 3106  274]
 [  78  160  152 3741]]


In [63]:

import joblib



In [64]:
# Save model
joblib.dump(svc_model, "sentiment_model.pkl")

# Save vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [65]:
# Load model
loaded_model = joblib.load("sentiment_model.pkl")

# Load vectorizer
loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Example usage
sample_text = ["I love this product, it's amazing!"]
sample_vector = loaded_vectorizer.transform(sample_text)
prediction = loaded_model.predict(sample_vector)

print("Prediction:", prediction[0])


ValueError: X has 50000 features, but LinearSVC is expecting 20000 features as input.