In [12]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


In [13]:
import pandas as pd
df = pd.read_csv('reviews_dataset.csv')

In [14]:
# Text Preprocessing
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['ReviewContent'] = df['ReviewContent'].apply(remove_punctuation)

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

df['ReviewContent'] = df['ReviewContent'].apply(remove_stopwords)

from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

df['ReviewContent'] = df['ReviewContent'].apply(lemmatize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Sentiment_encoded'] = label_encoder.fit_transform(df['Sentiment'])

In [16]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
import pickle
import numpy as np

# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Best parameters obtained from GridSearchCV (including class_weight)
best_params_rf = {
    'bootstrap': True,
    'class_weight': class_weights_dict,
    'max_depth': 40,
    'max_features': 'log2',
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'n_estimators': 500
}


In [17]:
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

# Define the base models
base_models = [
    ('rf', RandomForestClassifier(
        bootstrap=best_params_rf['bootstrap'],
        class_weight=best_params_rf['class_weight'],
        max_depth=best_params_rf['max_depth'],
        max_features=best_params_rf['max_features'],
        min_samples_leaf=best_params_rf['min_samples_leaf'],
        min_samples_split=best_params_rf['min_samples_split'],
        n_estimators=best_params_rf['n_estimators'],
        random_state=42
    )),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
]

# Define the meta-model
meta_model = LogisticRegression(random_state=42)

# Create the stacking classifier
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=10  # You can specify the number of cross-validation folds here
)

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation
cross_val_scores_stacking = cross_val_score(stacking_model, X_train_tfidf, y_train, cv=skf, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores (Stacking): {cross_val_scores_stacking}")
print(f"Mean Cross-Validation Accuracy (Stacking): {cross_val_scores_stacking.mean()}")

# Fit the stacking model to the training data
stacking_model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred_stacking = stacking_model.predict(X_test_tfidf)

# Evaluate model performance
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
print(f"Accuracy (Stacking): {accuracy_stacking:.2f}")

print("Classification Report (Stacking):")
print(classification_report(y_test, y_pred_stacking))

Cross-Validation Accuracy Scores (Stacking): [0.71551724 0.71551724 0.70689655 0.70258621 0.70689655 0.72413793
 0.71551724 0.70995671 0.70995671 0.72727273]
Mean Cross-Validation Accuracy (Stacking): 0.7134255112703389
Accuracy (Stacking): 0.71
Classification Report (Stacking):
              precision    recall  f1-score   support

           0       0.11      0.01      0.01       125
           1       0.00      0.00      0.00        53
           2       0.63      0.21      0.31       117
           3       0.72      0.98      0.83       698

    accuracy                           0.71       993
   macro avg       0.37      0.30      0.29       993
weighted avg       0.60      0.71      0.62       993



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
import numpy as np

# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Best parameters obtained from GridSearchCV (including class_weight)
best_params_rf = {
    'bootstrap': True,
    'class_weight': class_weights_dict,
    'max_depth': 40,
    'max_features': 'log2',
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'n_estimators': 500
}

# Define the base models
base_models_voting = [
    ('rf', RandomForestClassifier(
        bootstrap=best_params_rf['bootstrap'],
        class_weight=best_params_rf['class_weight'],
        max_depth=best_params_rf['max_depth'],
        max_features=best_params_rf['max_features'],
        min_samples_leaf=best_params_rf['min_samples_leaf'],
        min_samples_split=best_params_rf['min_samples_split'],
        n_estimators=best_params_rf['n_estimators'],
        random_state=42
    )),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
]

# Create the voting classifier (soft voting)
voting_model = VotingClassifier(estimators=base_models_voting, voting='soft')

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation
cross_val_scores_voting = cross_val_score(voting_model, X_train_tfidf, y_train, cv=skf, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores (Voting): {cross_val_scores_voting}")
print(f"Mean Cross-Validation Accuracy (Voting): {cross_val_scores_voting.mean()}")

# Fit the voting model to the training data
voting_model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred_voting = voting_model.predict(X_test_tfidf)

# Evaluate model performance
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f"Accuracy (Voting): {accuracy_voting:.2f}")

print("Classification Report (Voting):")
print(classification_report(y_test, y_pred_voting))


Cross-Validation Accuracy Scores (Voting): [0.72413793 0.70258621 0.67672414 0.69396552 0.71551724 0.71551724
 0.71551724 0.72727273 0.71428571 0.71861472]
Mean Cross-Validation Accuracy (Voting): 0.710413867741454
Accuracy (Voting): 0.71
Classification Report (Voting):
              precision    recall  f1-score   support

           0       0.29      0.04      0.07       125
           1       0.17      0.02      0.03        53
           2       0.62      0.20      0.30       117
           3       0.72      0.97      0.83       698

    accuracy                           0.71       993
   macro avg       0.45      0.31      0.31       993
weighted avg       0.63      0.71      0.63       993

