In [1]:
# ----------------------------
# Import Libraries
# ----------------------------
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# ----------------------------
# Load Dataset
# ----------------------------
df = pd.read_csv("dataset1.csv")

# ----------------------------
# Step 1: Preprocess Text Data
# ----------------------------
def preprocess_text(text):
    if isinstance(text, float):  # Handle NaN/float values
        text = ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    words = word_tokenize(text)  # Tokenization
    return " ".join(words)  # Reconstruct text

df["cleaned_review"] = df["review"].fillna("").apply(preprocess_text)

# Fix typo: 'claned_review' â†’ 'cleaned_review'
df = df[df["cleaned_review"].str.strip() != ""]

print("Number of reviews:", len(df))
print("Sample reviews:", df["cleaned_review"].head(5))
print("Dataset size after cleaning:", df.shape)

# ----------------------------
# Step 2: Prepare Labels
# ----------------------------
df["sentiment"] = df["sentiment"].str.strip()
df["sentiment"] = df["sentiment"].map({"Positive": 1, "Negative": 0})
df = df.dropna(subset=['sentiment'])
y = df["sentiment"].astype(int)

# ----------------------------
# Step 3: Train/Test Split
# ----------------------------
X = df["cleaned_review"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------------------------
# Step 4: TF-IDF Vectorization
# ----------------------------
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Length of y_train:", len(y_train))
print("Length of y_test:", len(y_test))

# ----------------------------
# Step 5: Train Decision Tree Classifier
# ----------------------------
dt_model = DecisionTreeClassifier(max_depth=20, random_state=42)
dt_model.fit(X_train, y_train)

# ----------------------------
# Step 6: Evaluate Model
# ----------------------------
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nðŸ”¹ Model Accuracy:", accuracy)
print("\nðŸ”¹ Classification Report:\n", classification_report(y_test, y_pred))


Number of reviews: 10673
Sample reviews: 0    too shity it is incredible that this fucking s...
1    w game but bad on steam version nonsteam bette...
2    why is this game so boring there are no bots i...
3    games peace is too slow and weapons are too in...
5    the game is great but standstill crouching wit...
Name: cleaned_review, dtype: object
Dataset size after cleaning: (10673, 4)
Shape of X_train: (8538, 5000)
Shape of X_test: (2135, 5000)
Length of y_train: 8538
Length of y_test: 2135

ðŸ”¹ Model Accuracy: 0.8796252927400469

ðŸ”¹ Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.36      0.45       295
           1       0.90      0.96      0.93      1840

    accuracy                           0.88      2135
   macro avg       0.76      0.66      0.69      2135
weighted avg       0.86      0.88      0.87      2135

