<a href="https://www.kaggle.com/code/faryalrifaz3374/sentiment-analysis-for-tweets?scriptVersionId=264604330" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Project : Sentiment Analysis on Tweets**  

## **Author: Faryal Rifaz** 

##  **Objective**  
The objective of this project is to build a sentiment analysis tool using Python and NLTK to classify tweets as **positive, negative, or neutral**.  

---

##  Step 1: Import Libraries  

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
import string
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import emoji
nltk.download('punkt')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from keras.layers import LSTM, Dense, SimpleRNN, Embedding, Flatten, Dropout
from keras.activations import softmax
from sklearn.model_selection import train_test_split
# ignore warnings   
import warnings
warnings.filterwarnings('ignore')

## Step 2: Import Data 

In [None]:
import pandas as pd

# Both CSV files are read with header=None, so the column names are supplied explicitly.
COLUMN_NAMES = ["ID", "Topic", "Sentiment", "Text"]


def _read_split(csv_path):
    """Read one headerless dataset CSV into a DataFrame with named columns."""
    return pd.read_csv(csv_path, header=None, names=COLUMN_NAMES)


# Training dataset
train = _read_split("/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv")

# Validation dataset
valid = _read_split("/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv")

# Show top rows of the training split (kept as the last expression of the cell)
train.head()


In [None]:
# Print the pandas summary of the training DataFrame (columns, non-null counts, dtypes).
train.info()

In [None]:
# Print the pandas summary of the validation DataFrame (columns, non-null counts, dtypes).
valid.info()

In [None]:
# Descriptive statistics for the training DataFrame.
# NOTE(review): with default arguments pandas summarises numeric columns only,
# so this presumably covers just the ID column — confirm against the output.
train.describe()

In [None]:
# Descriptive statistics for the validation DataFrame.
# NOTE(review): with default arguments pandas summarises numeric columns only,
# so this presumably covers just the ID column — confirm against the output.
valid.describe()

## Step 3: Preprocessing

In this step, we clean the raw tweet texts to make them suitable for analysis.  
The preprocessing pipeline includes:
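1. Decoding HTML entities and converting text to lowercase  
2. Removing URLs, mentions, the `#` symbol (the hashtag word itself is kept), punctuation, and special characters  
3. Tokenization (splitting sentences into words)  
4. Stopword removal (removing common words like "the", "is", "at") and dropping single-character tokens  
5. Lemmatization (converting words to their base form, e.g., "games" → "game"; the lemmatizer is called with its default noun setting, so verb forms such as "running" are left unchanged)  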

This ensures that the machine learning model focuses only on meaningful information.


In [None]:
import re
import html
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used by this cell.
# 'punkt_tab' is requested in addition to the legacy 'punkt' package:
# NLTK 3.8.2 and later load the tokenizer tables from 'punkt_tab', so
# nltk.word_tokenize raises LookupError on those versions when only 'punkt'
# has been downloaded.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# stopwords & lemmatizer
# The English stopword list is stored as a set so the per-token membership
# test in clean_text is a hash lookup.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# cleaning function
def clean_text(text):
    """Normalise one raw tweet into a space-joined string of lemmatized tokens.

    Args:
        text: The raw tweet. Any value that is not a ``str`` is treated as
            having no text.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Decode HTML entities first, then lowercase.
    cleaned = html.unescape(text).lower()

    # Applied in this order; every match is replaced by a single space.
    substitutions = (
        r'http\S+|www\.\S+',   # URLs
        r'@\w+',               # @mentions
        r'#',                  # the hash sign only; the tag word is kept
        r'[^a-z0-9\s]',        # anything that is not a-z, 0-9 or whitespace
    )
    for pattern in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned)

    # Drop stopwords and single-character tokens, then lemmatize what is left.
    kept_tokens = (
        token
        for token in nltk.word_tokenize(cleaned)
        if len(token) > 1 and token not in stop_words
    )
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

# Add a 'clean_text' column to each split by running clean_text over its 'Text' column
for split_df in (train, valid):
    split_df['clean_text'] = split_df['Text'].apply(clean_text)

# Print the first 10 raw/cleaned pairs of the training split as a sanity check
preview = train[['Text', 'clean_text']].head(10)
print(preview)


## Step 4: Encode labels 

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Fit a single encoder on the labels of both splits so that train and valid
# share one label -> integer mapping.
all_sentiments = pd.concat([train['Sentiment'], valid['Sentiment']], axis=0)
le = LabelEncoder()
le.fit(all_sentiments)

# Store the encoded label next to the original one in each split
for split_df in (train, valid):
    split_df['label_enc'] = le.transform(split_df['Sentiment'])

index_to_label = dict(enumerate(le.classes_))
print("Label mapping (index -> label):", index_to_label)

# Show class distribution
for caption, split_df in (
    ("\nTrain class counts:\n", train),
    ("\nValid class counts:\n", valid),
):
    print(caption, split_df['Sentiment'].value_counts())

## Step 5: TF-IDF

In [None]:
# TF-IDF vectorization: unigrams and bigrams, vocabulary capped at 10,000 features
tfv = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)

# The vectorizer is fitted on the training text only; the validation text is
# transformed with that fitted vectorizer.
X_train = tfv.fit_transform(train['clean_text'])
X_valid = tfv.transform(valid['clean_text'])

# Targets are the label-encoded sentiment columns
y_train, y_valid = train['label_enc'], valid['label_enc']

print("\nTF-IDF matrix shapes, X_train, X_valid:", X_train.shape, X_valid.shape)

## Step 6: Compare Multiple Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Candidate classifiers keyed by the display name used in printouts and plots
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Linear SVM": LinearSVC(random_state=42),
}

# Fit every candidate on the training matrix and record its validation accuracy
results = {}
for model_name in models:
    estimator = models[model_name]
    estimator.fit(X_train, y_train)
    valid_accuracy = accuracy_score(y_valid, estimator.predict(X_valid))
    results[model_name] = valid_accuracy
    print(f"{model_name} Accuracy: {valid_accuracy:.4f}")

# Two-column table (Model, Accuracy) for easy plotting
res_df = pd.DataFrame({
    "Model": list(results.keys()),
    "Accuracy": list(results.values()),
})



## Step 7:  Visualization

In [None]:

# Bar chart of validation accuracy per model, read from res_df (columns: Model, Accuracy)
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=res_df, x="Model", y="Accuracy", palette="viridis", ax=ax)
ax.set_title("Model Comparison on Validation Set")
ax.set_ylabel("Accuracy")
plt.xticks(rotation=30)  # tilt the model names on the x axis
plt.show()


## Step 8: Evaluation

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, ConfusionMatrixDisplay
from sklearn.preprocessing import label_binarize

from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# Binarize labels for ROC: one indicator column per encoded class id
class_ids = list(range(len(le.classes_)))
y_valid_bin = label_binarize(y_valid, classes=class_ids)
n_classes = y_valid_bin.shape[1]

for name, model in models.items():
    # The comparison loop already fits every estimator in `models`; refitting
    # here (200-tree Random Forest included) only repeats that work. Fit only
    # when an estimator turns out not to be fitted yet.
    try:
        check_is_fitted(model)
    except NotFittedError:
        model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    # Accuracy
    acc = accuracy_score(y_valid, y_pred)
    print(f"\n{name} Accuracy: {acc:.4f}")

    # Classification report
    print(classification_report(y_valid, y_pred, target_names=le.classes_))

    # Confusion matrix, built from the predictions computed above.
    # `labels` pins the matrix to every encoded class so it always lines up
    # with `display_labels`.
    print(f"Confusion Matrix for {name}:")
    disp = ConfusionMatrixDisplay.from_predictions(
        y_valid,
        y_pred,
        labels=class_ids,
        display_labels=le.classes_,
        cmap='Blues',
        xticks_rotation=30,
    )
    # Title and show the figure now so it appears next to this model's text
    # output instead of being flushed by a later plt.show().
    disp.ax_.set_title(f"{name} Confusion Matrix")
    plt.show()

    # ROC curve for multiclass: one curve per class against the rest.
    # Estimators without predict_proba (LinearSVC here) fall back to
    # decision_function, whose scores roc_curve also accepts.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_valid)
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_valid)
    else:
        y_score = None

    # Plot only when there is one score column per binarized label column,
    # which the per-class indexing below relies on.
    if y_score is not None and y_score.ndim == 2 and y_score.shape[1] == n_classes:
        plt.figure(figsize=(7, 5))
        for i in range(n_classes):
            fpr, tpr, _ = roc_curve(y_valid_bin[:, i], y_score[:, i])
            plt.plot(fpr, tpr, label=f"{le.classes_[i]}")
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.title(f"{name} ROC Curve")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.legend()
        plt.show()


## Step 9: Final Model 

In [None]:
# Random Forest chosen as final model
print("Final Model Selected: Random Forest")

# Retrain on full dataset (training + validation rows).
# NOTE: fit_transform refits the existing `tfv` object in place, so from here
# on `tfv` is fitted on both splits rather than on the training text alone.
full_text = pd.concat([train['clean_text'], valid['clean_text']], axis=0)
X_full = tfv.fit_transform(full_text)
y_full = pd.concat([train['label_enc'], valid['label_enc']], axis=0)

final_model = RandomForestClassifier(n_estimators=200, random_state=42)
final_model.fit(X_full, y_full)

# Save final model and preprocessing objects
import joblib

artifacts = (
    (final_model, "final_random_forest.pkl"),
    (tfv, "final_tfidf_vectorizer.pkl"),
    (le, "final_label_encoder.pkl"),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print("\nFinal model and preprocessing objects saved successfully")


## Step 10: Conclusion

The Random Forest classifier showed high performance for tweet sentiment analysis, achieving the best accuracy among the models compared on the validation set. 

---