In [1]:
# ----------------------------
# Step 1: Import Libraries
# ----------------------------
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# ----------------------------
# Step 2: Load Dataset
# ----------------------------
df = pd.read_csv("dataset1.csv")

# ----------------------------
# Step 3: Text Preprocessing
# ----------------------------
def preprocess_text(text):
    if isinstance(text, float):  # Handle NaN/float values
        text = ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    words = word_tokenize(text)  # Tokenization
    return " ".join(words)  # Reconstruct text

# Apply preprocessing
df["cleaned_review"] = df["review"].fillna("").apply(preprocess_text)

# Drop empty rows after cleaning
df = df[df["cleaned_review"].str.strip() != ""]

# Debug: Check dataset
print("✅ Number of reviews:", len(df))
print("✅ Sample reviews:\n", df["cleaned_review"].head())

# ----------------------------
# Step 4: Prepare Labels
# ----------------------------
df["sentiment"] = df["sentiment"].str.strip()
df["sentiment"] = df["sentiment"].map({"Positive": 1, "Negative": 0})
df = df.dropna(subset=['sentiment'])

# Final inputs and outputs
X = df["cleaned_review"]
y = df["sentiment"].astype(int)

# ----------------------------
# Step 5: Split Data
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------------------------
# Step 6: TF-IDF Vectorization
# ----------------------------
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("✅ Shape of X_train:", X_train_tfidf.shape)
print("✅ Shape of X_test:", X_test_tfidf.shape)

# ----------------------------
# Step 7: Train Logistic Regression
# ----------------------------
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)
log_reg_model.fit(X_train_tfidf, y_train)

# ----------------------------
# Step 8: Evaluate Model
# ----------------------------
y_pred = log_reg_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("\n🔹 Logistic Regression Accuracy:", accuracy)
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))


✅ Number of reviews: 10673
✅ Sample reviews:
 0    too shity it is incredible that this fucking s...
1    w game but bad on steam version nonsteam bette...
2    why is this game so boring there are no bots i...
3    games peace is too slow and weapons are too in...
5    the game is great but standstill crouching wit...
Name: cleaned_review, dtype: object
✅ Shape of X_train: (8538, 5000)
✅ Shape of X_test: (2135, 5000)

🔹 Logistic Regression Accuracy: 0.8969555035128806

🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.33      0.47       295
           1       0.90      0.99      0.94      1840

    accuracy                           0.90      2135
   macro avg       0.86      0.66      0.71      2135
weighted avg       0.89      0.90      0.88      2135

