In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import joblib

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to C:\Users\Devashish
[nltk_data]     Uniyal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Devashish
[nltk_data]     Uniyal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Devashish
[nltk_data]     Uniyal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="latin-1", header=None)
df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
print(df[['sentiment', 'text']].head(10))

  df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="latin-1", header=None)


   sentiment                                               text
0  sentiment                                               text
1          0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
2          0  is upset that he can't update his Facebook by ...
3          0  @Kenichan I dived many times for the ball. Man...
4          0    my whole body feels itchy and like its on fire 
5          0  @nationwideclass no, it's not behaving at all....
6          0                      @Kwesidei not the whole crew 
7          0                                        Need a hug 
8          0  @LOLTrish hey  long time no see! Yes.. Rains a...
9          0               @Tatiana_K nope they didn't have it 


In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
    return " ".join(tokens)

In [4]:
df["clean_text"] = df["text"].apply(clean_text)
df = df[df["clean_text"].str.strip() != ""]  # Remove empty entries after cleaning
print(df[["text", "clean_text"]].head())

                                                text  \
0                                               text   
1  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
2  is upset that he can't update his Facebook by ...   
3  @Kenichan I dived many times for the ball. Man...   
4    my whole body feels itchy and like its on fire    

                                          clean_text  
0                                               text  
1  awww thats bummer shoulda got david carr third...  
2  upset cant update facebook texting might cry r...  
3       dived many time ball managed save rest bound  
4                    whole body feel itchy like fire  


In [5]:
# Convert sentiment: 0 (negative) => 0, 4 (positive) => 1
df = df[df['sentiment'].isin([0, 4])]
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})

X = df["clean_text"]
y = df["sentiment"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")

Train samples: 1168312, Test samples: 292079


In [7]:
vectorizer = TfidfVectorizer(
    max_features=25000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    sublinear_tf=True
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("TF-IDF completed.")

TF-IDF completed.


In [8]:
selector = SelectKBest(chi2, k=15000)  # Reduced features to avoid overfitting
X_train_selected = selector.fit_transform(X_train_tfidf, y_train)
X_test_selected = selector.transform(X_test_tfidf)
print("Feature selection done.")

Feature selection done.


In [9]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
weights_dict = dict(zip(np.unique(y_train), class_weights))
print(f"Class Weights: {weights_dict}")

Class Weights: {0: 1.0975548349034447, 1: 0.918371646244643}


In [10]:
from sklearn.svm import LinearSVC

model = LinearSVC(
    class_weight=weights_dict, 
    C=0.25
)
model.fit(X_train_selected, y_train)
print("SVM training complete.")



SVM training complete.


In [11]:
y_train_pred = model.predict(X_train_selected)
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print("\nTrain Classification Report:")
print(classification_report(y_train, y_train_pred))
print("\nTrain Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))

Train Accuracy: 0.7985

Train Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.77      0.78    532234
           1       0.81      0.82      0.82    636078

    accuracy                           0.80   1168312
   macro avg       0.80      0.80      0.80   1168312
weighted avg       0.80      0.80      0.80   1168312


Train Confusion Matrix:
[[409956 122278]
 [113191 522887]]


In [12]:
y_test_pred = model.predict(X_test_selected)
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print("\nTest Classification Report:")
print(classification_report(y_test, y_test_pred))
print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Test Accuracy: 0.7905

Test Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.76      0.77    133059
           1       0.80      0.81      0.81    159020

    accuracy                           0.79    292079
   macro avg       0.79      0.79      0.79    292079
weighted avg       0.79      0.79      0.79    292079


Test Confusion Matrix:
[[101344  31715]
 [ 29466 129554]]


In [13]:
import joblib

# Saving our Model
joblib.dump(model, "svc_twitter_sentiment.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(selector, "feature_selector.pkl")
print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.
