In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import pandas as pd
import matplotlib.pyplot as plt

"Implementing a Random Forest Model & Performing HyperParameter Tuning to find optimal hyperparameters"

# Load the preprocessed dataset and hold out 20% of the rows for the final
# evaluation; the fixed seed keeps the split reproducible between runs.
df = pd.read_csv('../data/preprocessed_data.csv')
# NOTE(review): the split is not stratified although the sentiment classes
# look imbalanced in the recorded report -- consider passing
# stratify=df['Sentiment'] (this changes the split, so results will differ).
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'], df['Sentiment'], test_size=0.2, random_state=42
)

# Fit the TF-IDF vectorizer on the training texts only and reuse that fitted
# vectorizer to transform the held-out texts.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# NOTE(review): the vectorizer is fitted before cross-validation, so every
# validation fold below has already contributed to the TF-IDF statistics.
# Wrapping vectorizer + classifier in a Pipeline would give unbiased CV scores.

# Search space: 3 * 3 * 3 * 3 = 81 combinations, each evaluated with 5-fold
# cross-validation.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# NOTE(review): plain accuracy is used for model selection; with imbalanced
# classes a metric such as 'f1_macro' may be more appropriate -- confirm.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use every available core for the search
)

grid_search.fit(X_train_vec, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# GridSearchCV refits the winning configuration on the whole training matrix
# by default (refit=True), using the same base estimator and random_state.
# Reuse that fitted model instead of training an identical forest a second
# time.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_vec)
print(classification_report(y_test, y_pred))

# Persist the classifier together with the fitted vectorizer: both are needed
# to score new raw text later.
joblib.dump(best_rf_model, '../models/best_sentiment_analysis_model.pkl')
joblib.dump(vectorizer, '../models/best_tfidf_vectorizer.pkl')

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
              precision    recall  f1-score   support

          -1       0.37      0.47      0.41        38
           0       0.00      0.00      0.00        31
           1       0.78      0.86      0.81       195

    accuracy                           0.70       264
   macro avg       0.38      0.44      0.41       264
weighted avg       0.63      0.70      0.66       264



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['../models/best_tfidf_vectorizer.pkl']