In [5]:
# ============================
# Sentiment Analysis on CSV (IMDB format)
# - Columns expected: 'review', 'sentiment' (values: 'positive'/'negative')
# - Prints accuracy (train/test split)
# - Saves predictions for ALL rows (CSV + Excel)
# ============================
# Cell 1: Imports + NLTK
import pandas as pd
import numpy as np
import re, string, time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import LogisticRegression

# Optional (sirf agar evaluation chahiye ho to)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data (run once)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

print("✅ Libraries ready")

print("✅ Libraries ready------")



✅ Libraries ready
✅ Libraries ready------


In [6]:

# ----------------------------
# 1) Load CSV
# ----------------------------
# Change this path if needed
CSV_PATH = "IMDB Dataset.csv"   # e.g., "/content/IMDB Dataset.csv"
df = pd.read_csv(CSV_PATH)
print("✅ CSV loaded:", df.shape)
print(df.head())

# # Sanity check for expected columns
# if not {'review', 'sentiment'}.issubset(df.columns):
#     raise ValueError("CSV must have columns: 'review' and 'sentiment'")




✅ CSV loaded: (50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [12]:


# 2) Minimal cleaning
# ----------------------------
def preprocessing(tweet):
  if not isinstance(tweet, str):
    return ""

  tweet = tweet.lower()
  nopun = "".join([char for char in tweet  if not char in string.punctuation ])

  STOPWORDS = stopwords.words("english") + [
        'u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'  # Social media specific
    ]

  words = [word for word in nopun.split() if word not in STOPWORDS]
  lemmatizer = WordNetLemmatizer()
  words = [lemmatizer.lemmatize(word) for word in words]
  return " ".join(words)

df['review_clean'] = df['review'].astype(str).apply(preprocessing).apply(preprocessing)
df.head()


print("✅ Preprocessing done")
df.head()




✅ Preprocessing done


Unnamed: 0,review,sentiment,clean_tweets,label,review_clean
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode y...,1,one reviewer mentioned watching 1 oz episode y...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...,1,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1,petter matteis love time money visually stunni...


In [13]:
# ----------------------------
# 3) Map labels to 0/1
# ----------------------------


binary_mapping = {'positive': 1 , 'negative':0}
df = df[df['sentiment'].isin(binary_mapping.keys())].copy()
df['label']= df['sentiment'].map(binary_mapping)

df_train_data = df["clean_tweets"]
df_test_lable = df["label"]
print("Class counts:", df['label'].value_counts().to_dict())
print("✅ Labels ready")


Class counts: {1: 25000, 0: 25000}
✅ Labels ready


In [15]:

# ----------------------------
# 4) Train/Test split
# ----------------------------
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df['review_clean'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label'].values
)



In [16]:
# ----------------------------
# 5) Vectorize (TF-IDF)
# ----------------------------
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1,2),
    min_df=2,
    max_df=0.95
)
X_train = vectorizer.fit_transform(X_train_text)
X_val   = vectorizer.transform(X_val_text)


In [17]:

# ----------------------------
# 6) Train Logistic Regression
# ----------------------------
# 'saga' is fast for large sparse data and supports predict_proba
lr = LogisticRegression(solver='saga', max_iter=2000, n_jobs=-1)
t0 = time.time()
lr.fit(X_train, y_train)
print(f"⏱ Train time: {time.time()-t0:.2f}s")


⏱ Train time: 2.27s


In [18]:

# ----------------------------
# 7) Evaluate
# ----------------------------
y_pred = lr.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"🎯 Validation Accuracy (LR): {acc:.4f}")

print("📊 Classification Report (LR):")
print(classification_report(y_val, y_pred, target_names=['negative','positive']))



🎯 Validation Accuracy (LR): 0.8980
📊 Classification Report (LR):
              precision    recall  f1-score   support

    negative       0.91      0.88      0.90      5000
    positive       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [20]:
# ----------------------------
# 8) Final predictions for ALL rows (optional but useful)
#    Fit on ALL data, then predict for the whole CSV
# ----------------------------
X_all = vectorizer.fit_transform(df['review_clean'].values)
lr.fit(X_all, df['label'].values)

pred_all = lr.predict(X_all)
df['PredLabel'] = pd.Series(pred_all).map({0:'negative', 1:'positive'})

# Confidence (may not be perfect calibration; still useful)
try:
    prob_all = lr.predict_proba(X_all).max(axis=1)
    df['Confidence'] = prob_all
except Exception:
    pass



In [None]:
# ----------------------------
# 9) Save outputs
# ----------------------------
out_cols = ['review', 'PredLabel']
if 'Confidence' in df.columns:
    out_cols.append('Confidence')

csv_out = "Sentiment_Predictions.csv"
xlsx_out = "Sentiment_Predictions.xlsx"

df[out_cols].to_csv(csv_out, index=False, encoding='utf-8-sig')
try:
    df[out_cols].to_excel(xlsx_out, index=False)
    print(f"💾 Saved:\n - {csv_out}\n - {xlsx_out}")
except Exception as e:
    print(f"💾 Saved:\n - {csv_out}\n - (Excel skipped: {e})")

# (Colab) auto-download
try:
    from google.colab import files
    files.download(csv_out)
except Exception:
    pass
