# Sentiment Analysis with NLP: Tweets/Reviews

This notebook demonstrates data preprocessing, model training, and insights for sentiment analysis on short texts (e.g., tweets, reviews).

## 1. Setup
Install/import required libraries. If needed, uncomment `pip install` lines.

In [None]:
# If running on a fresh environment, uncomment the line below
# %pip install numpy pandas scikit-learn matplotlib nbformat
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support


## 2. Data: Synthetic Tweets/Reviews
We create a small labeled dataset with positive, negative, and neutral examples (with slang and emojis).

In [None]:
import random, numpy as np, pandas as pd

# Seed the RNGs so the synthetic corpus is reproducible. The text generator
# below draws from the stdlib `random` module; the NumPy global seed is not
# consumed by any code visible in this notebook and is set as a precaution.
random.seed(7)
np.random.seed(7)

# Base phrases per sentiment class. Each generated text starts with exactly one
# phrase drawn from the pool that matches its label.
positive_phrases = [
    "Absolutely loved this!", "What a fantastic experience", "superb quality", "highly recommend",
    "this made my day", "brilliant work", "five stars", "chef's kiss", "awesome service",
    "worth every rupee", "top notch", "movie was amazing", "battery life is great",
    "performance is blazing fast", "UI is clean and smooth", "updates are timely",
    "camera is excellent", "support team was helpful", "value for money", "gonna buy again",
]
negative_phrases = [
    "Terrible experience", "really disappointed", "waste of money", "never again",
    "this ruined my day", "worst service ever", "one star", "bugs everywhere",
    "not worth it", "battery drains so fast", "performance is awful",
    "UI is laggy", "kept crashing", "support didn't respond",
    "overpriced junk", "quality is poor", "delivery was late", "sound is bad",
    "totally useless", "refund please",
]
neutral_phrases = [
    "It arrived yesterday", "Using it for a week now", "I watched the movie today",
    "setup took around 10 minutes", "package includes charger and cable",
    "UI changed after update", "price dropped last month", "service center is nearby",
    "received a message", "there was an announcement", "I went to the store",
    "battery shows 80 percent", "new features added", "some settings were hidden",
    "works as expected", "layout is different", "I saw the trailer", "download completed",
    "stock is available", "return window closes soon",
]

# Emoji pools per sentiment class; one emoji is appended after the base phrase.
emojis_pos = ["😍", "🤩", "🔥", "✨", "👍", "😊"]
emojis_neg = ["😡", "🤮", "👎", "💀", "😤", "😭"]
emojis_neu = ["🤔", "🧐", "ℹ️", "📦", "🕒", "📣"]

# Class-independent slang/filler tokens; a few may be appended to any text as noise.
extra_tokens = ["pls", "bro", "yaar", "lol", "fr", "ngl", "btw", "imo", "tbh", "idk", "sale", "offer"]

def synth_line(pool, emoji_pool, extra_tokens):
    """Build one synthetic short text: a base phrase, an emoji, and optional noise.

    Parameters
    ----------
    pool : sequence of str
        Candidate base phrases; one is picked at random.
    emoji_pool : sequence of str
        Candidate emojis; one is picked at random and placed after the phrase.
    extra_tokens : sequence of str
        Filler tokens. Between 0 and 2 distinct tokens (never more than are
        available) are appended after the emoji. May be empty.

    Returns
    -------
    str
        ``"<phrase> <emoji>"`` optionally followed by space-separated filler
        tokens. The result never ends with whitespace.

    Raises
    ------
    IndexError
        If ``pool`` or ``emoji_pool`` is empty (raised by ``random.choice``).
    """
    # Draw order (phrase, emoji, token count, tokens) is fixed so a seeded
    # `random` module always produces the same corpus.
    parts = [random.choice(pool), random.choice(emoji_pool)]
    if extra_tokens:
        # Cap the count at the pool size: random.sample raises ValueError
        # when asked for more items than the population holds.
        k = random.randint(0, min(2, len(extra_tokens)))
        parts.extend(random.sample(extra_tokens, k=k))
    # Joining the parts (instead of concatenating " " + noise) avoids a
    # dangling space when no filler token was drawn.
    return " ".join(parts)

def make_samples(n, phrases, emojis):
    """Generate ``n`` synthetic lines from one phrase pool and one emoji pool.

    Every line is produced by ``synth_line``; filler tokens come from the
    module-level ``extra_tokens`` list.
    """
    samples = []
    for _ in range(n):
        samples.append(synth_line(phrases, emojis, extra_tokens))
    return samples

n_per_class = 60

# Classes are generated in a fixed order (positive, negative, neutral) so the
# seeded RNG is consumed in the same sequence on every run.
class_sources = [
    ("positive", positive_phrases, emojis_pos),
    ("negative", negative_phrases, emojis_neg),
    ("neutral", neutral_phrases, emojis_neu),
]
texts, labels = [], []
for sentiment, phrase_pool, emoji_pool in class_sources:
    texts.extend(make_samples(n_per_class, phrase_pool, emoji_pool))
    labels.extend([sentiment] * n_per_class)

# Shuffle the rows reproducibly so the classes are interleaved.
df = (
    pd.DataFrame({"text": texts, "label": labels})
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
df.head()


## 3. Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows; stratifying on the label keeps the class
# proportions the same in both splits.
all_texts = df['text'].values
all_labels = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.25,
    random_state=42,
    stratify=all_labels,
)
len(X_train), len(X_test)


## 4. Pipeline: TF-IDF + Logistic Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF over unigrams and bigrams feeding a logistic-regression classifier.
# NOTE(review): scikit-learn's built-in 'english' stop list includes negations
# such as "not" and "never", so "not worth it" is reduced to "worth" — confirm
# this is acceptable for sentiment data.
# NOTE(review): the default token_pattern keeps only runs of two or more word
# characters, so the emojis in the texts do not become features — confirm intent.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english', min_df=1, max_df=0.95)),
    # `multi_class` is deliberately not passed: it is deprecated since
    # scikit-learn 1.5 (passing it emits a FutureWarning) and 'auto' was
    # already the default, so the fitted model is unchanged.
    ('clf', LogisticRegression(max_iter=1000)),
])
pipeline.fit(X_train, y_train)


## 5. Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np, pandas as pd

# Score the held-out split once; `preds` is reused by the report and
# confusion-matrix cells further down.
class_names = ['negative', 'neutral', 'positive']
preds = pipeline.predict(X_test)
acc = accuracy_score(y_test, preds)
pr, rc, f1, support = precision_recall_fscore_support(
    y_test,
    preds,
    labels=class_names,
    zero_division=0,
)

# One row per class, in the same order as `class_names`.
per_class = {'class': class_names, 'precision': pr, 'recall': rc, 'f1': f1, 'support': support}
metrics_df = pd.DataFrame(per_class)

# Summary row: the accuracy value sits in the 'precision' column and the total
# number of test rows in 'support'; recall and f1 are left empty.
summary_row = ['overall_accuracy', acc, np.nan, np.nan, len(y_test)]
metrics_df.loc[len(metrics_df)] = summary_row
metrics_df.round(3)


In [None]:
# Text report of per-class precision/recall/F1 on the test split.
report_text = classification_report(y_test, preds, zero_division=0)
print("Classification Report:\n")
print(report_text)


### Confusion Matrix

In [None]:
import matplotlib.pyplot as plt
# Rows are true labels, columns are predicted labels, both in this order.
tick_labels = ['negative', 'neutral', 'positive']
cm = confusion_matrix(y_test, preds, labels=tick_labels)

fig, ax = plt.subplots()
ax.imshow(cm, interpolation='nearest')
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(tick_labels)
ax.set_yticks([0, 1, 2])
ax.set_yticklabels(tick_labels)

# Write each count in the centre of its cell (x = column, y = row).
for (row, col), count in np.ndenumerate(cm):
    ax.text(col, row, str(count), ha='center', va='center')

fig.tight_layout()
plt.show()


## 6. Insights: Most-Informative Features

In [None]:
import numpy as np, pandas as pd
# Pull the fitted steps out of the pipeline so the vocabulary and the
# per-class coefficients can be inspected.
fitted_steps = pipeline.named_steps
tfidf = fitted_steps['tfidf']
clf = fitted_steps['clf']
feature_names = np.array(tfidf.get_feature_names_out())
# Map each class label to its row index in `clf.coef_`.
class_to_index = dict(zip(clf.classes_, range(len(clf.classes_))))

def top_features_for_class(class_index, k=15, model=None, names=None):
    """Return the ``k`` features with the largest coefficients for one class.

    Parameters
    ----------
    class_index : int
        Row of the coefficient matrix to inspect.
    k : int, default 15
        Number of features to return. Values of zero or less yield an empty
        frame; values above the vocabulary size return every feature.
    model : object with a ``coef_`` attribute, optional
        Fitted linear classifier. Defaults to the notebook-level ``clf``.
    names : array-like of str, optional
        Feature names aligned with the columns of ``model.coef_``.
        Defaults to the notebook-level ``feature_names``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``weight``, sorted by descending weight.
    """
    # Fall back to the notebook globals only when nothing was passed in, so
    # existing two-argument calls keep working.
    model = clf if model is None else model
    names = np.asarray(feature_names if names is None else names)
    coefs = np.asarray(model.coef_)[class_index]
    # Reverse first, then slice: `argsort(...)[-k:]` would return *all*
    # features for k == 0 because `[-0:]` is the full array.
    k = max(int(k), 0)
    top_idx = np.argsort(coefs)[::-1][:k]
    return pd.DataFrame({'feature': names[top_idx], 'weight': coefs[top_idx]})

# Top-15 weighted features for each sentiment class.
top_pos, top_neg, top_neu = (
    top_features_for_class(class_to_index[sentiment_name], 15)
    for sentiment_name in ('positive', 'negative', 'neutral')
)

# Print a heading, then render the matching table with rich display.
for heading, table in (
    ("Top Positive Features:", top_pos),
    ("\nTop Negative Features:", top_neg),
    ("\nTop Neutral Features:", top_neu),
):
    print(heading)
    display(table)


## 7. Try It: Predict Your Own Texts

In [None]:
# Free-form examples to classify with the fitted pipeline; edit as desired.
samples = [
    "Loved the new update, super smooth!",
    "Battery drains in 2 hours, not happy",
    "The package arrived on Tuesday and I set it up",
]
sample_preds = pipeline.predict(samples)
print(list(zip(samples, sample_preds)))
