# Twitter Sentiment Analysis 

In [1]:
# Twitter Sentiment Analysis - Single Cell
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re

# 1) Load dataset (adjust path if needed)
# Expected columns: 'tweet_id', 'entity', 'sentiment', 'tweet_content'
df = pd.read_csv('twitter_training.csv', header=None, names=['tweet_id', 'entity', 'sentiment', 'tweet_content'])
print(f"Dataset loaded: {len(df)} rows")
print(f"Sentiment distribution:\n{df['sentiment'].value_counts()}\n")

# 2) Clean and filter
df = df.dropna(subset=['tweet_content', 'sentiment'])
df = df[df['sentiment'].isin(['Positive', 'Negative', 'Neutral', 'Irrelevant'])].copy()

# Simple text preprocessing
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)  # remove mentions
    text = re.sub(r'#', '', text)  # remove hashtag symbol
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep only letters
    return text.strip()

df['cleaned'] = df['tweet_content'].apply(clean_text)
df = df[df['cleaned'].str.len() > 5]  # filter very short tweets

print(f"After cleaning: {len(df)} samples")
print(f"Sentiment counts:\n{df['sentiment'].value_counts()}\n")

# 3) Prepare features and target
X = df['cleaned']
y = df['sentiment']

# 4) Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# 5) Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 6) Train Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# 7) Evaluate
y_pred = model.predict(X_test_vec)
acc = accuracy_score(y_test, y_pred)

print("\n")
print(f"Model Accuracy: {acc*100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 8) Prediction helper
def predict_sentiment(text):
    cleaned = clean_text(text)
    vec = vectorizer.transform([cleaned])
    pred = model.predict(vec)[0]
    proba = model.predict_proba(vec)[0]
    return pred, proba

# 9) Quick test examples
print("\n")
print("TEST EXAMPLES")

test1 = "I love this product! It's amazing and works perfectly."
pred1, prob1 = predict_sentiment(test1)
print(f"\nText: {test1}")
print(f"Sentiment: {pred1} (Confidence: {max(prob1)*100:.1f}%)")

test2 = "This is terrible. Worst experience ever!"
pred2, prob2 = predict_sentiment(test2)
print(f"\nText: {test2}")
print(f"Sentiment: {pred2} (Confidence: {max(prob2)*100:.1f}%)")

test3 = "It's okay, nothing special."
pred3, prob3 = predict_sentiment(test3)
print(f"\nText: {test3}")
print(f"Sentiment: {pred3} (Confidence: {max(prob3)*100:.1f}%)")

print("\n")
print("Model ready. Use predict_sentiment(text) for custom input.")

Dataset loaded: 74682 rows
Sentiment distribution:
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

After cleaning: 71440 samples
Sentiment counts:
sentiment
Negative      21610
Positive      19787
Neutral       17510
Irrelevant    12533
Name: count, dtype: int64

Train: 57152, Test: 14288


Model Accuracy: 63.46%

Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.71      0.36      0.48      2507
    Negative       0.63      0.79      0.70      4322
     Neutral       0.66      0.53      0.59      3502
    Positive       0.61      0.73      0.66      3957

    accuracy                           0.63     14288
   macro avg       0.65      0.60      0.61     14288
weighted avg       0.64      0.63      0.62     14288


Confusion Matrix:
[[ 905  653  266  683]
 [ 110 3412  333  467]
 [ 145  800 1864  693]
 [ 114  594  363 2886]]


TEST EXAMPLES

Text: I love this product! I

##  Test Own Text

In [3]:
# Enter your text here and run this cell
my_text = "This is absolutely fantastic! I'm so happy with the results."

# Get prediction
sentiment, probabilities = predict_sentiment(my_text)

# Display result
print("SENTIMENT ANALYSIS")
print(f"Input Text:\n  {my_text}")
print("\n")
print(f"Predicted Sentiment: {sentiment}")
print(f"Confidence: {max(probabilities)*100:.2f}%")
print("\nAll Class Probabilities:")
for label, prob in zip(model.classes_, probabilities):
    print(f"  {label}: {prob*100:.2f}%")

SENTIMENT ANALYSIS
Input Text:
  This is absolutely fantastic! I'm so happy with the results.


Predicted Sentiment: Positive
Confidence: 71.52%

All Class Probabilities:
  Irrelevant: 15.27%
  Negative: 4.06%
  Neutral: 9.16%
  Positive: 71.52%
