In [25]:
# Basic libraries
import pandas as pd
import numpy as np

# Text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# For model training later
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:

df = pd.read_csv('Processed_Reviews.csv')  

# Check column names and first few rows
print(df.columns)
print(df.head())


Index(['Review', 'lowercased', 'urls_removed', 'html_removed',
       'emojis_removed', 'slangs_replaced', 'contractions_replaced',
       'punctuations_removed', 'numbers_removed', 'spelling_corrected',
       'stopwords_removed', 'stemmed_words', 'lemmatized', 'tokenized',
       'Label'],
      dtype='object')
                                              Review  \
0  The product arrived on time. Packaging was gre...   
1           THIS PRODUCT IS JUST AMAZING! I LOVE IT.   
2  I bought this phone for $799, and it has a 120...   
3  Wow!!! This product is awesome... but a bit ex...   
4                The laptop works perfectly fine.      

                                          lowercased  \
0  the product arrived on time. packaging was gre...   
1           this product is just amazing! i love it.   
2  i bought this phone for $799, and it has a 120...   
3  wow!!! this product is awesome... but a bit ex...   
4                the laptop works perfectly fine.      

           

In [29]:
# Set up lemmatizer and stopwords
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define clean+lemmatize function
def lemmatize_text(text):
    if not isinstance(text, str):
        return ""
    tokens = nltk.word_tokenize(text.lower())  # lowercase + tokenize
    words = [word for word in tokens if word.isalpha()]  # remove punctuation/numbers
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # remove stopwords & lemmatize
    return ' '.join(words)

# Apply it
df['lemmatized'] = df['Review'].apply(lemmatize_text)

# Check if it worked
print(df[['Review', 'lemmatized']].head())


                                              Review  \
0  The product arrived on time. Packaging was gre...   
1           THIS PRODUCT IS JUST AMAZING! I LOVE IT.   
2  I bought this phone for $799, and it has a 120...   
3  Wow!!! This product is awesome... but a bit ex...   
4                The laptop works perfectly fine.      

                                          lemmatized  
0  product arrived time packaging great quality a...  
1                               product amazing love  
2                 bought phone display totally worth  
3                  wow product awesome bit expensive  
4                         laptop work perfectly fine  


In [31]:
# Drop empty rows
df['lemmatized'] = df['lemmatized'].astype(str).str.strip()
df = df[df['lemmatized'].str.len() > 0]

print("After cleaning, rows left:", len(df))


After cleaning, rows left: 13


In [33]:
# Initialize TF-IDF
tfidf_vect = TfidfVectorizer()

# Fit and transform
X = tfidf_vect.fit_transform(df['lemmatized'])

# Define labels (target)
y = df['Label']


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vect = TfidfVectorizer()

# Apply TF-IDF vectorization to lemmatized reviews
X = tfidf_vect.fit_transform(df['lemmatized'])

# Check the shape of the resulting matrix
print(f"Shape of X (TF-IDF Features): {X.shape}")


Shape of X (TF-IDF Features): (13, 53)


In [37]:
from sklearn.model_selection import train_test_split

# Define the target variable 'y'
y = df['Label']

# Train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the split shapes
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")


Training set size: 10, Test set size: 3


In [39]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on the training set
train_score = model.score(X_train, y_train)
print(f"Training accuracy: {train_score:.4f}")


Training accuracy: 0.8000


In [41]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy:.4f}")

# Detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


Test accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

