In [2]:
import nltk

In [3]:
# Download NLTK's 'punkt_tab' tokenizer data into the local nltk_data directory.
# NOTE(review): presumably this is the resource word_tokenize relies on in the
# preprocessing step — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\CHRISTIN
[nltk_data]     SANTHOSH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
# "dataset1.csv" is a relative path, so it is resolved against the current
# working directory. The steps below read its "review" and "sentiment" columns.
df = pd.read_csv("dataset1.csv")

# Step 1: Preprocess Text Data
def preprocess_text(text):
    """Normalize one review into lowercase, letters-only, space-separated tokens.

    Args:
        text: Raw review value. Anything that is not a ``str`` (NaN floats,
            ``None``, integers, ...) is treated as an empty review instead of
            raising on ``.lower()``.

    Returns:
        str: The tokens returned by ``word_tokenize`` joined with single
        spaces. Only ASCII letters survive the cleaning step, so digits,
        punctuation and accented characters are removed.
    """
    if not isinstance(text, str):  # NaN/float, None, numbers, ... -> empty review
        text = ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only ASCII letters and whitespace
    words = word_tokenize(text)  # Tokenization
    return " ".join(words)  # Reconstruct text with single-space separators

# --- Text cleaning ---------------------------------------------------------
# Missing reviews are replaced by empty strings before being normalized.
df["cleaned_review"] = df["review"].fillna("").apply(preprocess_text)

# Keep only the rows whose cleaned text still has some content.
df = df.loc[df["cleaned_review"].str.strip().ne("")]

# Sanity output: row count (should be greater than 0) and the first cleaned reviews.
print("Number of reviews:", df.shape[0])
print("Sample reviews:", df["cleaned_review"].head())

# Debugging checks
print("Dataset size after cleaning:", df.shape)
print("Sample cleaned reviews:\n", df["cleaned_review"].head(5))

# Trim surrounding whitespace from the labels so the mapping further down
# matches them exactly.
df = df.assign(sentiment=df["sentiment"].str.strip())

# Persist the cleaned frame; the labels are still text at this point.
df.to_csv("preprocessed_dataset1.csv", index=False)
print("Preprocessing complete!")

# --- Label encoding --------------------------------------------------------
# "Positive" -> 1 and "Negative" -> 0; any other label value becomes NaN.
df = df.assign(sentiment=df["sentiment"].map({"Positive": 1, "Negative": 0}))

# Show which encoded values are present.
print(df["sentiment"].unique())

# Discard rows whose label did not map to 0/1.
df = df.dropna(subset=['sentiment'])

# Features are the cleaned texts, targets are the integer labels.
X = df["cleaned_review"]
y = df["sentiment"].astype(int)

# --- Train/test split ------------------------------------------------------
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- TF-IDF features -------------------------------------------------------
# The vocabulary (capped at 5000 terms) is learned on the training split only
# and then reused to transform the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Debugging checks
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Length of y_train:", len(y_train))
print("Length of y_test:", len(y_test))

# --- K-Nearest Neighbors classifier ----------------------------------------
# n_neighbors can be tuned; fit() returns the fitted estimator itself.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# --- Evaluation on the held-out split --------------------------------------
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\n🔹 Model Accuracy:", accuracy)
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))


Number of reviews: 10673
Sample reviews: 0    too shity it is incredible that this fucking s...
1    w game but bad on steam version nonsteam bette...
2    why is this game so boring there are no bots i...
3    games peace is too slow and weapons are too in...
5    the game is great but standstill crouching wit...
Name: cleaned_review, dtype: object
Dataset size after cleaning: (10673, 4)
Sample cleaned reviews:
 0    too shity it is incredible that this fucking s...
1    w game but bad on steam version nonsteam bette...
2    why is this game so boring there are no bots i...
3    games peace is too slow and weapons are too in...
5    the game is great but standstill crouching wit...
Name: cleaned_review, dtype: object
Preprocessing complete!
[0 1]
Shape of X_train: (8538, 5000)
Shape of X_test: (2135, 5000)
Length of y_train: 8538
Length of y_test: 2135

🔹 Model Accuracy: 0.8730679156908665

🔹 Classification Report:
               precision    recall  f1-score   support

           0  