In [None]:
!pip install pandas numpy nltk scikit-learn streamlit


Collecting streamlit
  Downloading streamlit-1.45.0-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [None]:
import pandas as pd

# Build the modelling frame in a single chained expression so that re-running this
# cell always starts from the file on disk and never from a half-processed `df`.
#
# NOTE(review): quoting=3 (csv.QUOTE_NONE) together with on_bad_lines='skip' makes
# the parser tolerant of malformed rows, but quote characters are kept verbatim in
# the text and rows that do not parse are silently discarded — confirm that this
# trade-off is intended for Reviews.csv.
df = (
    pd.read_csv(
        "Reviews.csv",
        on_bad_lines='skip',
        quoting=3,
        encoding='utf-8',
        low_memory=False,
    )
    # Non-numeric scores become NaN so they can be dropped in the next step.
    .assign(Score=lambda frame: pd.to_numeric(frame['Score'], errors='coerce'))
    # A row is only usable if it has both a score and some review text; a missing
    # text would otherwise break the string clean-up applied later on.
    .dropna(subset=['Score', 'Text'])
    # Binary sentiment: 1 if the integer score is > 3 (positive), else 0
    # (negative/neutral). Vectorised instead of a per-row Python lambda.
    .assign(sentiment=lambda frame: (frame['Score'].astype(int) > 3).astype(int))
    # Keep only the columns used downstream. The explicit copy makes `df` an
    # independent frame, so adding columns to it later is unambiguous.
    .loc[:, ['Text', 'sentiment']]
    .copy()
)

# Show first few rows
df.head()


Unnamed: 0,Text,sentiment
0,I have bought several of the Vitality canned d...,1
1,"""Product arrived labeled as Jumbo Salted Peanu...",0
2,If you are looking for the secret ingredient i...,0
3,Right now I'm mostly just sprouting this so my...,1
4,This is a very healthy dog food. Good for thei...,1


In [None]:
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Return ``text`` lower-cased with all ASCII punctuation removed.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example ``None`` or the
        float NaN that pandas uses for missing cells) is treated as an
        empty review instead of raising ``AttributeError``.

    Returns
    -------
    str
        The normalised text; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    # str.translate strips the punctuation in one pass instead of testing each
    # character against string.punctuation in a Python-level loop.
    return text.lower().translate(_PUNCTUATION_TABLE)

# Normalise every review once and store the result next to the raw text.
df['cleaned_review'] = df['Text'].apply(clean_text)

# Features are the cleaned reviews; labels are the binary sentiment flags.
X, y = df['cleaned_review'], df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vocabulary is learned from the training reviews only and
# then reused unchanged to encode the held-out reviews.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train a logistic-regression classifier with default settings on the TF-IDF
# features (fit() returns the fitted estimator itself).
model = LogisticRegression().fit(X_train_vec, y_train)

# Score the held-out reviews and report overall accuracy plus per-class metrics.
y_pred = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.9179663966798313
              precision    recall  f1-score   support

           0       0.84      0.66      0.74      5227
           1       0.93      0.97      0.95     24651

    accuracy                           0.92     29878
   macro avg       0.89      0.81      0.84     29878
weighted avg       0.91      0.92      0.91     29878



In [None]:
import joblib

# Persist the fitted classifier and the vectorizer it was trained with. Both files
# are needed together: the model can only score text encoded by this vectorizer.
ARTIFACTS = {
    'sentiment_model.pkl': model,
    'vectorizer.pkl': vectorizer,
}
for artifact_path, artifact in ARTIFACTS.items():
    joblib.dump(artifact, artifact_path)

# Download in Google Colab. The `google.colab` package is only importable inside a
# Colab runtime, so the import is guarded: elsewhere the files have already been
# written to the working directory and the download step is simply skipped.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; artifacts saved locally:", ", ".join(ARTIFACTS))
else:
    for artifact_path in ARTIFACTS:
        files.download(artifact_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>