In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [4]:
# Load dataset
df = pd.read_csv('reviews_dataset.csv')

In [5]:
# Text Preprocessing
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['ReviewContent'] = df['ReviewContent'].apply(remove_punctuation)

In [6]:
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

df['ReviewContent'] = df['ReviewContent'].apply(remove_stopwords)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

df['ReviewContent'] = df['ReviewContent'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['ReviewContent'])

In [9]:
# Encode categorical variables
label_encoder = LabelEncoder()
df['Category_encoded'] = label_encoder.fit_transform(df['Category'])
df['Generation_encoded'] = label_encoder.fit_transform(df['Generation'])
df['Sentiment_encoded'] = label_encoder.fit_transform(df['Sentiment'])
df['Country_encoded'] = label_encoder.fit_transform(df['Country'])

In [29]:
# Split data into training and test sets with stratified sampling
X = tfidf_matrix
y = df['Sentiment_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [26]:
# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Define the pipeline with SMOTE and SVM
pipeline_svm = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('svc', SVC())
])

# Define parameters for hyperparameter tuning
param_grid_svm = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': ['scale', 'auto'],
    'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

# Perform RandomizedSearchCV
svm_random_search = RandomizedSearchCV(pipeline_svm, param_distributions=param_grid_svm, n_iter=20, scoring='accuracy', cv=5, verbose=1, random_state=42)
svm_random_search.fit(X_train, y_train)

# Get the best model and evaluate on test set
best_svm = svm_random_search.best_estimator_
y_pred_svm = best_svm.predict(X_test)

# Print best parameters and classification report
print("Best Parameters (SVM):", svm_random_search.best_params_)
print("Accuracy (SVM):", svm_random_search.best_score_)
print("Classification Report (SVM):")
print(classification_report(y_test, y_pred_svm))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters (SVM): {'svc__kernel': 'rbf', 'svc__gamma': 'scale', 'svc__C': 10}
Accuracy (SVM): 0.7026383406568854
Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.45      0.08      0.14       125
           1       0.10      0.02      0.03        53
           2       0.50      0.15      0.24       117
           3       0.72      0.95      0.82       698

    accuracy                           0.70       993
   macro avg       0.44      0.30      0.31       993
weighted avg       0.63      0.70      0.62       993



In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Define parameters for hyperparameter tuning
param_grid_rf = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

# Perform RandomizedSearchCV
rf_random_search = RandomizedSearchCV(rf_model, param_distributions=param_grid_rf, n_iter=100, scoring='accuracy', cv=5, verbose=1, random_state=42, n_jobs=-1)
rf_random_search.fit(X_train, y_train)

# Get the best model and evaluate on test set
best_rf = rf_random_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Print best parameters and classification report
print("Best Parameters (Random Forest):", rf_random_search.best_params_)
print("Accuracy (Random Forest):", rf_random_search.best_score_)
print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))


Fitting 5 folds for each of 100 candidates, totalling 500 fits


250 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
152 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\arjun\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\arjun\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\arjun\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\arjun\AppData\Roaming\Python\Python311\site-packages\sklearn\utils\_param_valida

Best Parameters (Random Forest): {'n_estimators': 2000, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}
Accuracy (Random Forest): 0.7173149251508155
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       125
           1       0.00      0.00      0.00        53
           2       0.65      0.21      0.31       117
           3       0.72      0.98      0.83       698

    accuracy                           0.71       993
   macro avg       0.34      0.30      0.29       993
weighted avg       0.58      0.71      0.62       993



In [20]:
import xgboost as xgb

# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameters for XGBoost
param_grid_xgb = {
    'max_depth': [3, 4, 5, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [100, 200, 300, 400],
    'colsample_bytree': [0.3, 0.4, 0.5, 0.7, 0.8, 1.0],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]
}

# Perform RandomizedSearchCV with XGBoost
xgb_random_search = RandomizedSearchCV(estimator=xgb.XGBClassifier(objective="multi:softmax", num_class=4, seed=42),
                                       param_distributions=param_grid_xgb, n_iter=50,
                                       scoring='accuracy', n_jobs=-1, cv=5, verbose=1, random_state=42)
xgb_random_search.fit(X_train, y_train)

# Get the best model and evaluate on test set
best_xgb = xgb_random_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# Print best parameters and classification report
print("Best Parameters (XGBoost):", xgb_random_search.best_params_)
print("Accuracy (XGBoost):", xgb_random_search.best_score_)
print("Classification Report (XGBoost):")
print(classification_report(y_test, y_pred_xgb))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters (XGBoost): {'subsample': 0.9, 'n_estimators': 100, 'max_depth': 8, 'learning_rate': 0.05, 'colsample_bytree': 0.5}
Accuracy (XGBoost): 0.7142855812914277
Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.35      0.05      0.08       125
           1       0.00      0.00      0.00        53
           2       0.60      0.21      0.31       117
           3       0.73      0.97      0.83       698

    accuracy                           0.71       993
   macro avg       0.42      0.31      0.31       993
weighted avg       0.63      0.71      0.63       993



In [27]:
from sklearn.naive_bayes import MultinomialNB

# Initialize Naive Bayes model
nb_model = MultinomialNB()

# No hyperparameter tuning for Naive Bayes, just fit and predict
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Print classification report
print("Classification Report (Naive Bayes):")
print(classification_report(y_test, y_pred_nb))


Classification Report (Naive Bayes):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       125
           1       0.00      0.00      0.00        53
           2       0.00      0.00      0.00       117
           3       0.70      1.00      0.83       698

    accuracy                           0.70       993
   macro avg       0.18      0.25      0.21       993
weighted avg       0.49      0.70      0.58       993



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
    'class_weight': ['balanced', None]  # Try both balanced and None
}

# Initialize SVC classifier
svc = SVC()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=svc, param_distributions=param_grid, scoring='accuracy', 
                                   cv=5, n_iter=25, random_state=42, verbose=2, n_jobs=-1)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters and estimator
best_params = random_search.best_params_
best_estimator = random_search.best_estimator_

print(f"Best Parameters: {best_params}")

# Predict on test data
y_pred = best_estimator.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'class_weight': None, 'C': 100}
Accuracy: 0.72
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.10      0.16       125
           1       0.12      0.02      0.03        53
           2       0.64      0.21      0.32       117
           3       0.73      0.96      0.83       698

    accuracy                           0.72       993
   macro avg       0.49      0.32      0.34       993
weighted avg       0.66      0.72      0.64       993



In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd

# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [class_weights_dict, 'balanced', None]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, scoring='accuracy', 
                                      cv=5, n_iter=50, random_state=42, verbose=2, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_rf.fit(X_train_tfidf, y_train)

# Best parameters and estimator
best_params_rf = random_search_rf.best_params_
best_estimator_rf = random_search_rf.best_estimator_

print(f"Best Parameters (Random Forest): {best_params_rf}")

# Predict on test data
y_pred_rf = best_estimator_rf.predict(X_test_tfidf)

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy (Random Forest): {accuracy_rf:.2f}")

print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters (Random Forest): {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40, 'class_weight': {0: 1.9891826923076923, 1: 4.675141242937853, 2: 2.121794871794872, 3: 0.3556080790717662}, 'bootstrap': True}
Accuracy (Random Forest): 0.73
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.46      0.10      0.17       125
           1       0.25      0.02      0.04        53
           2       0.70      0.33      0.45       117
           3       0.74      0.97      0.84       698

    accuracy                           0.73       993
   macro avg       0.54      0.36      0.37       993
weighted avg       0.68      0.73      0.67       993



In [36]:
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd

# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'scale_pos_weight': [class_weights_dict[i] for i in range(len(class_weights))],
    'objective': ['multi:softmax']
}

# Initialize XGBClassifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False)

# Initialize RandomizedSearchCV
random_search_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, scoring='accuracy', 
                                       cv=5, n_iter=50, random_state=42, verbose=2, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_xgb.fit(X_train_tfidf, y_train)

# Best parameters and estimator
best_params_xgb = random_search_xgb.best_params_
best_estimator_xgb = random_search_xgb.best_estimator_

print(f"Best Parameters (XGBoost): {best_params_xgb}")

# Predict on test data
y_pred_xgb = best_estimator_xgb.predict(X_test_tfidf)

# Evaluate model performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy (XGBoost): {accuracy_xgb:.2f}")

print("Classification Report (XGBoost):")
print(classification_report(y_test, y_pred_xgb))


Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



Best Parameters (XGBoost): {'subsample': 0.6, 'scale_pos_weight': 4.675141242937853, 'objective': 'multi:softmax', 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.01, 'colsample_bytree': 0.8}
Accuracy (XGBoost): 0.70
Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.14      0.01      0.02       125
           1       0.00      0.00      0.00        53
           2       0.50      0.11      0.18       117
           3       0.71      0.98      0.83       698

    accuracy                           0.70       993
   macro avg       0.34      0.28      0.26       993
weighted avg       0.58      0.70      0.60       993



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Define the parameter grid
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]
}

# Initialize MultinomialNB
nb = MultinomialNB()

# Initialize RandomizedSearchCV
random_search_nb = RandomizedSearchCV(estimator=nb, param_distributions=param_grid, scoring='accuracy', 
                                      cv=5, n_iter=50, random_state=42, verbose=2, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_nb.fit(X_train_tfidf, y_train)

# Best parameters and estimator
best_params_nb = random_search_nb.best_params_
best_estimator_nb = random_search_nb.best_estimator_

print(f"Best Parameters (Naive Bayes): {best_params_nb}")

# Predict on test data
y_pred_nb = best_estimator_nb.predict(X_test_tfidf)

# Evaluate model performance
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy (Naive Bayes): {accuracy_nb:.2f}")

print("Classification Report (Naive Bayes):")
print(classification_report(y_test, y_pred_nb))


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters (Naive Bayes): {'alpha': 0.1}
Accuracy (Naive Bayes): 0.72
Classification Report (Naive Bayes):
              precision    recall  f1-score   support

           0       0.44      0.03      0.06       125
           1       0.00      0.00      0.00        53
           2       0.72      0.22      0.34       117
           3       0.72      0.98      0.83       698

    accuracy                           0.72       993
   macro avg       0.47      0.31      0.31       993
weighted avg       0.65      0.72      0.63       993



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [class_weights_dict, 'balanced', None]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', 
                              cv=5, verbose=2, n_jobs=-1)

# Fit GridSearchCV
grid_search_rf.fit(X_train_tfidf, y_train)

# Best parameters and estimator
best_params_rf = grid_search_rf.best_params_
best_estimator_rf = grid_search_rf.best_estimator_

print(f"Best Parameters (Random Forest): {best_params_rf}")

# Predict on test data
y_pred_rf = best_estimator_rf.predict(X_test_tfidf)

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy (Random Forest): {accuracy_rf:.2f}")

print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))


Fitting 5 folds for each of 2700 candidates, totalling 13500 fits
Best Parameters (Random Forest): {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 40, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
Accuracy (Random Forest): 0.73
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.46      0.09      0.15       125
           1       0.00      0.00      0.00        53
           2       0.67      0.38      0.49       117
           3       0.74      0.96      0.83       698

    accuracy                           0.73       993
   macro avg       0.47      0.36      0.37       993
weighted avg       0.66      0.73      0.66       993



In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle
import numpy as np
import pandas as pd

# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Define the best parameters
best_params_rf = {
    'n_estimators': 500,
    'min_samples_split': 5,
    'min_samples_leaf': 1,
    'max_features': 'log2',
    'max_depth': 40,
    'class_weight': {0: 1.9891826923076923, 1: 4.675141242937853, 2: 2.121794871794872, 3: 0.3556080790717662},
    'bootstrap': True
}

# Initialize RandomForestClassifier with the best parameters
best_estimator_rf = RandomForestClassifier(**best_params_rf, random_state=42)

# Fit the model
best_estimator_rf.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred_rf = best_estimator_rf.predict(X_test_tfidf)

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy (Random Forest): {accuracy_rf:.2f}")

print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))


Accuracy (Random Forest): 0.72
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.30      0.07      0.12       125
           1       0.00      0.00      0.00        53
           2       0.64      0.30      0.41       117
           3       0.74      0.96      0.83       698

    accuracy                           0.72       993
   macro avg       0.42      0.33      0.34       993
weighted avg       0.63      0.72      0.65       993



In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd

# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [class_weights_dict, 'balanced', None]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, scoring='accuracy', 
                                      cv=5, n_iter=50, random_state=42, verbose=2, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_rf.fit(X_train_tfidf, y_train)

# Best parameters and estimator
best_params_rf = random_search_rf.best_params_
best_estimator_rf = random_search_rf.best_estimator_

print(f"Best Parameters (Random Forest): {best_params_rf}")

# Predict on test data
y_pred_rf = best_estimator_rf.predict(X_test_tfidf)

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy (Random Forest): {accuracy_rf:.2f}")

print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters (Random Forest): {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 30, 'class_weight': {0: 1.9891826923076923, 1: 4.675141242937853, 2: 2.121794871794872, 3: 0.3556080790717662}, 'bootstrap': True}
Accuracy (Random Forest): 0.73
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.50      0.14      0.22       125
           1       0.00      0.00      0.00        53
           2       0.60      0.41      0.49       117
           3       0.76      0.94      0.84       698

    accuracy                           0.73       993
   macro avg       0.46      0.37      0.39       993
weighted avg       0.66      0.73      0.67       993



In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
# Assuming the data is preprocessed and loaded into the DataFrame df

# Shuffle the dataset
df = shuffle(df, random_state=42)

# Define features and target variable
X = df['ReviewContent']
y = df['Sentiment_encoded']

# Stratified split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = {0: 1.9891826923076923, 1: 4.675141242937853, 2: 2.121794871794872, 3: 0.3556080790717662}

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [class_weights_dict, 'balanced', None]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, scoring='accuracy', 
                                      cv=5, n_iter=50, random_state=42, verbose=2, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_rf.fit(X_train_tfidf, y_train)

# Best parameters and estimator
best_params_rf = random_search_rf.best_params_
best_estimator_rf = random_search_rf.best_estimator_

print(f"Best Parameters (Random Forest): {best_params_rf}")

# Predict on test data
y_pred_rf = best_estimator_rf.predict(X_test_tfidf)

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy (Random Forest): {accuracy_rf:.2f}")

print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters (Random Forest): {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 30, 'class_weight': {0: 1.9891826923076923, 1: 4.675141242937853, 2: 2.121794871794872, 3: 0.3556080790717662}, 'bootstrap': True}
Accuracy (Random Forest): 0.72
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.39      0.11      0.17       125
           1       0.00      0.00      0.00        53
           2       0.60      0.35      0.44       117
           3       0.74      0.94      0.83       698

    accuracy                           0.72       993
   macro avg       0.43      0.35      0.36       993
weighted avg       0.64      0.72      0.66       993



In [11]:
y_pred_rf


array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3,
       3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 1,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 0, 2,
       2, 3, 3, 2, 0, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,