In [9]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [10]:
import pandas as pd
df = pd.read_csv('reviews_dataset.csv')

In [11]:
# Text Preprocessing
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['ReviewContent'] = df['ReviewContent'].apply(remove_punctuation)

In [12]:
# Text Preprocessing
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['ReviewContent'] = df['ReviewContent'].apply(remove_punctuation)

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

df['ReviewContent'] = df['ReviewContent'].apply(remove_stopwords)

from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

df['ReviewContent'] = df['ReviewContent'].apply(lemmatize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Sentiment_encoded'] = label_encoder.fit_transform(df['Sentiment'])

In [14]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
import pickle
import numpy as np

# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Best parameters obtained from GridSearchCV (including class_weight)
best_params_rf = {
    'bootstrap': True,
    'class_weight': class_weights_dict,
    'max_depth': 40,
    'max_features': 'log2',
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'n_estimators': 500
}

# Instantiate the RandomForestClassifier with the best parameters
best_rf = RandomForestClassifier(
    bootstrap=best_params_rf['bootstrap'],
    class_weight=best_params_rf['class_weight'],
    max_depth=best_params_rf['max_depth'],
    max_features=best_params_rf['max_features'],
    min_samples_leaf=best_params_rf['min_samples_leaf'],
    min_samples_split=best_params_rf['min_samples_split'],
    n_estimators=best_params_rf['n_estimators'],
    random_state=42
)

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation
cross_val_scores = cross_val_score(best_rf, X_train_tfidf, y_train, cv=skf, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean()}")

# Fit the classifier to the training data
best_rf.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred_rf = best_rf.predict(X_test_tfidf)

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy (Random Forest): {accuracy_rf:.2f}")

print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))


Cross-Validation Accuracy Scores: [0.71551724 0.71551724 0.71982759 0.72413793 0.72413793 0.73275862
 0.72844828 0.72294372 0.74458874 0.71861472]
Mean Cross-Validation Accuracy: 0.7246492013733393
Accuracy (Random Forest): 0.73
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.38      0.05      0.09       125
           1       0.00      0.00      0.00        53
           2       0.68      0.31      0.42       117
           3       0.74      0.97      0.84       698

    accuracy                           0.73       993
   macro avg       0.45      0.33      0.34       993
weighted avg       0.65      0.73      0.65       993



In [5]:
# Number of missing values in 'ReviewContent' column
num_missing_values = df['ReviewContent'].isna().sum()
print(f"Number of missing values in 'ReviewContent' column: {num_missing_values}")


Number of missing values in 'ReviewContent' column: 3
