# Import Libraries

In [1]:
# Core data handling
import pandas as pd
import numpy as np

# Text preprocessing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer

# Model
from sklearn.linear_model import LogisticRegression

# Evaluation
from sklearn.metrics import classification_report, accuracy_score

# Saving / loading model
import joblib

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Load Dataset

In [2]:
from google.colab import files
uploaded = files.upload()

Saving Test.csv to Test.csv
Saving Train.csv to Train.csv
Saving Valid.csv to Valid.csv


In [4]:
import pandas as pd

train_df = pd.read_csv('Train.csv')
valid_df = pd.read_csv('Valid.csv')
test_df  = pd.read_csv('Test.csv')

# quick look at the data
print(train_df.shape)
train_df.head()

(40000, 2)


Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


# Renaming Columns

In [8]:
train_df = train_df.rename(columns={'text':'review', 'label':'sentiment'})
valid_df = valid_df.rename(columns={'text':'review', 'label':'sentiment'})
test_df  = test_df.rename(columns={'text':'review', 'label':'sentiment'})

# Preprocessing

In [9]:
import re, nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # keep letters only
    text = " ".join([w for w in text.split() if w not in stop_words])
    return text

for df in [train_df, valid_df, test_df]:
    df['clean_review'] = df['review'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Vectorize text + train Logistic Regression

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['clean_review'])
y_train = train_df['sentiment']

X_valid = vectorizer.transform(valid_df['clean_review'])
y_valid = valid_df['sentiment']

# Train
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Validate
train_acc = accuracy_score(y_train, model.predict(X_train))
valid_acc = model.score(X_valid, y_valid)
print("Validation Accuracy:", valid_acc)
print("Train Accuracy:", train_acc)

Validation Accuracy: 0.884
Train Accuracy: 0.997925


# Evaluate on Test Set

In [11]:
X_test = vectorizer.transform(test_df['clean_review'])
y_test = test_df['sentiment']

test_acc = model.score(X_test, y_test)
print("Test Accuracy:", test_acc)

Test Accuracy: 0.8898


In [12]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      2495
    Positive       0.88      0.90      0.89      2505

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



#Predict New Reviews

In [13]:
new_reviews = [
    "This movie was fantastic! Loved it.",
    "I hate this film. Waste of time."
]
new_reviews_clean = [clean_text(r) for r in new_reviews]
new_X = vectorizer.transform(new_reviews_clean)
preds = model.predict(new_X)

for review, pred in zip(new_reviews, preds):
    print(f"{review} -> {'Positive' if pred==1 else 'Negative'}")

This movie was fantastic! Loved it. -> Positive
I hate this film. Waste of time. -> Negative
