In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np


In [2]:
def loadData(fname):
    reviews = []
    labels = []
    with open(fname, 'r') as f:
        for line in f:
            review, rating = line.strip().split('\t')  
            reviews.append(review.lower())
            labels.append(int(rating))
    return reviews, labels

In [3]:
# Load training and testing data
rev_train, labels_train = loadData('reviews_train.txt')
rev_test, labels_test = loadData('reviews_test.txt')

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(rev_train)
tfidf_train = tfidf_vectorizer.transform(rev_train)
tfidf_test = tfidf_vectorizer.transform(rev_test)


In [4]:
rf_classifier = RandomForestClassifier()
param_grid_rf = {
    'n_estimators': np.arange(100, 301, 1),
    'max_depth': [None, 10, 20]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=2, n_jobs=-1)
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=2, n_jobs=-1)
grid_search_rf.fit(tfidf_train, labels_train)

In [5]:
gb_classifier = GradientBoostingClassifier()
param_grid_gb = {
    'n_estimators': np.arange(200, 401, 1),
    'learning_rate': np.arange(0.01, 0.21, 0.1)
}
grid_search_gb = GridSearchCV(gb_classifier, param_grid_gb, cv=2, n_jobs=-1)
grid_search_gb.fit(tfidf_train, labels_train)

In [6]:
lr_classifier = LogisticRegression()
param_grid_lr = {
    'C': np.arange(0, 10.01, 0.01),
    'max_iter': [100, 500, 1000]
}
grid_search_lr = GridSearchCV(lr_classifier, param_grid_lr, cv=2, n_jobs=-1)
grid_search_lr.fit(tfidf_train, labels_train)

6 fits failed out of a total of 6006.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/accuracy/Developer/BIA 660/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/accuracy/Developer/BIA 660/.venv/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/accuracy/Developer/BIA 660/.venv/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/accuracy/Developer/BIA 660/.venv/lib/python3.11/site-packages/sk

In [7]:
nb_classifier = MultinomialNB()
param_grid_nb = {
    'alpha': np.arange(0, 30.1, 0.1),
    'force_alpha': [True, False]
}
grid_search_nb = GridSearchCV(nb_classifier, param_grid_nb, cv=2, n_jobs=-1)
grid_search_nb.fit(tfidf_train, labels_train)

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


In [8]:
svc_classifier = SVC()
param_grid_svc = {
    'C': np.arange(0.1, 10.1, 0.1),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}
grid_search_svc = GridSearchCV(svc_classifier, param_grid_svc, cv=2, n_jobs=-1)
grid_search_svc.fit(tfidf_train, labels_train)

In [9]:
# Get the best models from grid search
best_rf = grid_search_rf.best_estimator_
best_gb = grid_search_gb.best_estimator_
best_lr = grid_search_lr.best_estimator_
best_nb = grid_search_nb.best_estimator_
best_svc = grid_search_svc.best_estimator_

# Create a Voting Classifier with the best models
voting_classifier = VotingClassifier(estimators=[
    ('rf', best_rf),
    ('gb', best_gb),
    ('lr', best_lr),
    ('nb', best_nb),
    ('svc', best_svc)
], voting='hard')

# Train the Voting Classifier
voting_classifier.fit(tfidf_train, labels_train)

# Make predictions on the test set
pred = voting_classifier.predict(tfidf_test)

# Calculate accuracy
accuracy = accuracy_score(pred, labels_test)
print(accuracy)

0.8866666666666667


In [None]:
# # Print the best results for each algorithm
# print("Random Forest - Best Parameters:", grid_search_rf.best_params_)
# print("Random Forest - Best Accuracy:", grid_search_rf.best_estimator_.predict(tfidf_test))

# print("\nGradient Boosting - Best Parameters:", grid_search_gb.best_params_)
# print("Gradient Boosting - Best Accuracy:", grid_search_gb.best_estimator_.predict(tfidf_test))

# print("\nLogistic Regression - Best Parameters:", grid_search_lr.best_params_)
# print("Logistic Regression - Best Accuracy:", grid_search_lr.best_estimator_.predict(tfidf_test))

# print("\nMultinomial Naive Bayes - Best Parameters:", grid_search_nb.best_params_)
# print("Multinomial Naive Bayes - Best Accuracy:", grid_search_nb.best_estimator_.predict(tfidf_test))

# print("\nSupport Vector Classification  - Best Parameters:", grid_search_svc.best_params_)
# print("Support Vector Classification  - Best Accuracy:", grid_search_svc.best_estimator_.predict(tfidf_test))

# # Identify the best algorithm
# best_algorithm = max([
#     ("Random Forest", grid_search_rf.best_estimator_.predict(tfidf_test)),
#     ("Gradient Boosting", grid_search_gb.best_estimator_.predict(tfidf_test)),
#     ("Logistic Regression", grid_search_lr.best_estimator_.predict(tfidf_test)),
#     ("Multinomial Naive Bayes", grid_search_nb.best_estimator_.predict(tfidf_test)),
#     ("Support Vector Classification", grid_search_svc.best_estimator_.predict(tfidf_test))
# ], key=lambda x: x[1])

# # Print the best algorithm and its accuracy
# print(f"\nBest Algorithm: {best_algorithm[0]} with Accuracy: {best_algorithm[1]:.4f}")
