In [None]:
# Install the two packages this notebook installs explicitly (Colab usually
# has these, but this makes sure). The leading "!" is a notebook shell escape,
# so this line only works inside IPython/Jupyter/Colab, not in a plain .py
# script. "-q" asks pip for quiet output.
!pip -q install nltk scikit-learn

# Imports
import re
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK resources used by this notebook.
for resource in ('twitter_samples', 'stopwords'):
    nltk.download(resource)

from nltk.corpus import twitter_samples

# Raw tweet strings from the positive and negative corpus files.
pos = twitter_samples.strings('positive_tweets.json')
neg = twitter_samples.strings('negative_tweets.json')

import pandas as pd

# One row per tweet: positives first (label 1), then negatives (label 0).
df = pd.DataFrame({
    'text': [*pos, *neg],
    'label': [1 for _ in pos] + [0 for _ in neg],
})

print(df.shape)
df.head()

import re

def clean_text(s: str) -> str:
    """Normalize a raw tweet into lowercase, space-separated alphabetic words.

    Steps, in order: decode HTML entities, lowercase, drop links and
    @mentions, drop the '#' of hashtags (the tag word itself is kept), delete
    apostrophes so contractions stay a single token, replace every remaining
    non-letter with a space, and collapse runs of whitespace.

    Args:
        s: Raw tweet text.

    Returns:
        The cleaned text. It is an empty string when nothing alphabetic
        remains (e.g. a tweet made only of a link, digits or emoticons).
    """
    import html  # function-scope import keeps this block self-contained

    # Tweets may carry HTML entities ("&amp;", "&lt;3"). Without decoding,
    # the letter filter below would leave junk tokens such as "amp" or "lt".
    s = html.unescape(s).lower()
    s = re.sub(r'http\S+|www\.\S+', ' ', s)   # links
    s = re.sub(r'@\w+', ' ', s)               # @mentions
    s = re.sub(r'#', ' ', s)                  # #hashtags -> remove '#'
    # Delete apostrophes instead of turning them into spaces: "don't" becomes
    # "dont" rather than the fragments "don" + "t", so negations survive as
    # one token.
    s = re.sub(r"['\u2019]", '', s)
    s = re.sub(r'[^a-z\s]', ' ', s)           # keep letters + spaces
    s = re.sub(r'\s+', ' ', s).strip()        # collapse spaces
    return s

# Store the normalized form of each tweet alongside the raw text.
df['clean'] = df['text'].map(clean_text)
df[['text', 'clean', 'label']].head()


# Hold out 20% of the rows for testing; stratifying on the label keeps the
# positive/negative ratio the same in both partitions, and the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean'],
    df['label'],
    stratify=df['label'],
    random_state=42,
    test_size=0.2,
)
(len(X_train), len(X_test))

# TF-IDF features over unigrams and bigrams: terms seen in fewer than two
# training documents are ignored, English stop words are removed, and the
# vocabulary is capped at 10,000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words='english',
    min_df=2,
)

# The vocabulary is learned from the training split only; the test split is
# transformed with that same fitted vocabulary.
Xtr = vectorizer.fit_transform(X_train)
Xte = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(Xtr, y_train)

# Score the held-out split.
pred = model.predict(Xte)
accuracy_pct = round(accuracy_score(y_test, pred) * 100, 2)
print("Accuracy:", accuracy_pct, "%")
print(classification_report(y_test, pred, target_names=['negative', 'positive']))


def predict_sentiment(text: str) -> str:
    """Classify one piece of text with the trained model.

    The text goes through the same ``clean_text`` normalization and the same
    fitted ``vectorizer`` as the training data before ``model`` scores it.

    Args:
        text: Raw input text.

    Returns:
        A string of the form ``"<label> (confidence <p>)"``, where the label
        is ``'positive'`` when the predicted class is 1 and ``'negative'``
        otherwise, and ``p`` is the model's probability for the predicted
        class, formatted to two decimals.
    """
    features = vectorizer.transform([clean_text(text)])
    predicted = model.predict(features)[0]
    # predict_proba columns follow model.classes_, so find the column of the
    # predicted class there instead of using the label value as an index
    # (which is only correct while the classes happen to be exactly [0, 1]).
    column = list(model.classes_).index(predicted)
    confidence = model.predict_proba(features)[0][column]
    label = 'positive' if predicted == 1 else 'negative'
    return f"{label} (confidence {confidence:.2f})"

# Quick smoke test on one clearly positive and one clearly negative sentence.
for sample in (
    "I absolutely love this phone!",
    "This is the worst day ever...",
):
    print(predict_sentiment(sample))

# Persist the fitted classifier and the fitted TF-IDF vectorizer as two
# separate files (relative paths, so they land in the current working
# directory). predict_sentiment uses both objects, so both files are needed
# to reuse the model later.
# NOTE(review): joblib files are presumably pickle-based — only load them from
# a trusted source; confirm against the joblib documentation.
import joblib
joblib.dump(model, 'sentiment_model.joblib')
joblib.dump(vectorizer, 'vectorizer.joblib')

# In Colab, uncomment to download both saved files to the local machine:
# from google.colab import files
# files.download('sentiment_model.joblib')
# files.download('vectorizer.joblib')



[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(10000, 2)
Accuracy: 75.25 %
              precision    recall  f1-score   support

    negative       0.74      0.79      0.76      1000
    positive       0.77      0.72      0.74      1000

    accuracy                           0.75      2000
   macro avg       0.75      0.75      0.75      2000
weighted avg       0.75      0.75      0.75      2000

positive (confidence 0.58)
negative (confidence 0.72)


['vectorizer.joblib']