<a href="https://colab.research.google.com/github/Abdibaset/ML_CS74/blob/main/MultiClassClassification_Project3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Metrics**

In [231]:
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score
import numpy as np

def evaluationMetrics(y_actual, y_predicted, valid_set=None, model=None):
  """Print confusion matrix, ROC-AUC, accuracy and macro F1 for a classifier.

  Args:
    y_actual: True labels (array-like); flattened to a 1-D vector.
    y_predicted: Predicted labels (array-like); flattened to a 1-D vector.
    valid_set: Features the predictions were made on. Only used to obtain
      class probabilities for the ROC-AUC scores.
    model: Fitted classifier. Its ``predict_proba`` is called on ``valid_set``
      to obtain the scores for ROC-AUC.

  Returns:
    None. Every metric is printed. ROC-AUC is skipped (with a message) when
    ``model`` or ``valid_set`` is missing or the model has no ``predict_proba``.
  """
  # Flatten the labels into 1-D vectors.
  y_actual = np.asarray(y_actual).ravel()
  y_predicted = np.asarray(y_predicted).ravel()

  # Explicit None checks: the truthiness of an estimator object is not a
  # reliable "was a model passed?" test.
  y_predicted_probabilities = None
  if model is not None and valid_set is not None and hasattr(model, "predict_proba"):
    y_predicted_probabilities = model.predict_proba(valid_set)

  ## get metric scores that only need hard label predictions
  confusion_matrix_result = confusion_matrix(y_actual, y_predicted)
  accuracy_result = accuracy_score(y_actual, y_predicted)
  f1_score_result = f1_score(y_actual, y_predicted, average="macro")

  print("Confusion Matrix: ", confusion_matrix_result)

  # ROC-AUC needs per-class probabilities; without them roc_auc_score would
  # raise, so skip it instead of aborting the whole report.
  if y_predicted_probabilities is not None:
    roc_auc_per_class = roc_auc_score(y_actual, y_predicted_probabilities, multi_class="ovr", average=None)
    macro_roc_auc = roc_auc_score(y_actual, y_predicted_probabilities, multi_class="ovr", average="macro")
    print("roc_auc per class: ", roc_auc_per_class)
    print("macro_roc_auc", macro_roc_auc)
  else:
    print("roc_auc: skipped (requires valid_set and a model with predict_proba)")

  print("accuracy: ", accuracy_result)
  print("f1_score: ", f1_score_result)


# **Preprocessing data**

In [191]:
# pandas is needed here, but the notebook only imports it in a later cell;
# import it in this cell so a fresh-kernel "Run All" does not fail on `pd`.
import pandas as pd

# Load the raw CSVs and keep only the columns used below.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data = train_data[['summary', 'reviewText', 'overall', 'verified', 'category']]
test_data = test_data[['summary', 'reviewText', 'id', 'verified', 'category']]

# Combine summary and review body into one text field per row; missing values
# become empty strings so the concatenation never yields NaN.
important_text_features_train = train_data['summary'].fillna('') + ' ' + train_data['reviewText'].fillna('')
important_text_features_test =  test_data['summary'].fillna('')+ ' ' + test_data['reviewText'].fillna('')

# 'overall' is the label to predict; show its class distribution.
target_label = train_data['overall']
train_data['overall'].value_counts()

2    5959
1    5957
3    5862
4    5769
5    5642
Name: overall, dtype: int64

In [192]:
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer


# Fetch the NLTK resources (tokenizer model and WordNet corpus).
for _nltk_resource in ('punkt', 'wordnet'):
  nltk.download(_nltk_resource)

# Shared instances reused by preprocess() for every token.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
  """Normalise a raw text string into space-joined, stemmed and lemmatized tokens.

  Steps: tokenize with NLTK, lower-case, drop punctuation-only tokens, then
  stem each remaining token and lemmatize the stem.

  Args:
    text: The raw string to normalise.

  Returns:
    A single string of the processed tokens joined by spaces.
  """
  punctuation_chars = set(string.punctuation)
  processed = []
  for token in word_tokenize(text):  # tokenize with nltk
    token = token.lower()  # convert to all lower case
    # Drop tokens made up solely of punctuation characters. A plain
    # `token in string.punctuation` is a substring test, which lets
    # multi-character tokens such as "..." or "--" slip through.
    if all(ch in punctuation_chars for ch in token):
      continue
    # stem and then lemmatize data
    processed.append(lemmatizer.lemmatize(stemmer.stem(token)))
  return ' '.join(processed)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [193]:
from sklearn.feature_extraction.text import TfidfVectorizer

# train_test_split is used in this cell, but the notebook only imports it in a
# later cell; import it here so a fresh-kernel "Run All" does not fail.
from sklearn.model_selection import train_test_split

# vectorization of data: TF-IDF over unigrams, bigrams and trigrams with
# sublinear (1 + log) term-frequency scaling
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    sublinear_tf=True
)

## fit the vectorizer on the training text, then reuse the learned vocabulary
## and IDF weights for the test text
# NOTE(review): the raw text is vectorized as-is; the `preprocess` helper is
# not applied in this cell - confirm whether that is intended.
# NOTE(review): the vectorizer is fitted on all training rows before the
# train/validation split, so validation rows contribute to the vocabulary and
# IDF weights. Validation scores may be slightly optimistic.
train_text_features = vectorizer.fit_transform(important_text_features_train)
test_text_features = vectorizer.transform(important_text_features_test)

## split the data while maintaining the distribution
## (80/20 split, stratified on the label, fixed seed for reproducibility)
train_set, validation_set, train_target, validation_target = train_test_split(
    train_text_features, target_label, test_size=0.2, random_state=27, stratify=target_label
)


# **Logistic Regression Model**

In [195]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# Seeded because the grid includes the 'saga' solver, which shuffles the data;
# without a fixed random_state the search and final model are not reproducible.
logreg = LogisticRegression(random_state=27)
params = {
    "multi_class": ["ovr"],
    "C": [2.5, 3.5, 4.0],
    "solver": ['lbfgs', 'saga'],
    "class_weight": ['balanced'],
    "max_iter": [1000, 1500],
    "penalty": ['l2']
}

## cross-validation with gridsearchcv (5 folds, model selection on macro F1)
grid = GridSearchCV(logreg, params, cv=5, scoring="f1_macro")
grid.fit(train_set, train_target)

# GridSearchCV refits the winning configuration on the full training split by
# default (refit=True), so best_estimator_ is already fitted - no second fit.
best_log = grid.best_estimator_
validation_pred = best_log.predict(validation_set) # predict on the validation set

# print report of predictions of best classifier on the validation set
print(classification_report(validation_target, validation_pred))
evaluationMetrics(validation_target, validation_pred, validation_set, best_log)

## predict test and save data
test_predicted = best_log.predict(test_text_features)
logReg_submission = {'pred': test_predicted, 'id':test_data['id']}
logReg_submission_df = pd.DataFrame(logReg_submission)
# NOTE(review): to_csv writes the DataFrame index as an extra first column by
# default; pass index=False if the submission format expects only pred/id.
logReg_submission_df.to_csv('log_reg.csv')

              precision    recall  f1-score   support

           1       0.65      0.72      0.68      1192
           2       0.53      0.49      0.51      1192
           3       0.55      0.53      0.54      1172
           4       0.57      0.56      0.57      1154
           5       0.71      0.74      0.72      1128

    accuracy                           0.61      5838
   macro avg       0.60      0.61      0.60      5838
weighted avg       0.60      0.61      0.60      5838

Confusion Matrix:  [[854 218  61  37  22]
 [280 583 244  61  24]
 [101 205 618 197  51]
 [ 35  65 166 652 236]
 [ 38  29  41 190 830]]
accuracy:  0.605858170606372
f1_score:  0.6045217202485745


# **Multinomial Naive Bayes Model**

In [229]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pandas as pd


Multi_NB = MultinomialNB()
# Search over the smoothing strength and whether class priors are learned.
params = {
    "alpha": [0.1, 0.5, 1.0, 2.0],
    "fit_prior": [True, False]
}

# Cross-validation with grid search (5 folds, model selection on macro F1)
grid = GridSearchCV(Multi_NB, params, cv=5, scoring="f1_macro")
grid.fit(train_set, train_target)

# GridSearchCV refits the winning configuration on the full training split by
# default (refit=True), so best_estimator_ is already fitted - no second fit.
best_multi_nb = grid.best_estimator_
validation_pred = best_multi_nb.predict(validation_set)

## print report on the validation predictions
print("Classification Report for Multinomial Naive Bayes:\n", classification_report(validation_target, validation_pred))
evaluationMetrics(validation_target, validation_pred, validation_set, best_multi_nb)

# Predict on the test set
test_predicted = best_multi_nb.predict(test_text_features)

# Save predictions to a CSV file
mnb_submission = {'pred': test_predicted, 'id': test_data['id']}
mnb_submission_df = pd.DataFrame(mnb_submission)
# NOTE(review): to_csv writes the DataFrame index as an extra first column by
# default; pass index=False if the submission format expects only pred/id.
mnb_submission_df.to_csv('multinomial_nb_submission.csv')


Classification Report for Multinomial Naive Bayes:
               precision    recall  f1-score   support

           1       0.73      0.59      0.65      1192
           2       0.44      0.61      0.51      1192
           3       0.48      0.52      0.50      1172
           4       0.55      0.61      0.58      1154
           5       0.86      0.51      0.64      1128

    accuracy                           0.57      5838
   macro avg       0.61      0.57      0.58      5838
weighted avg       0.61      0.57      0.58      5838



# **Random Forest Model**

In [237]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd

# Fixed seed so the forest (and therefore the reported metrics) is reproducible.
rf_classifier = RandomForestClassifier(random_state=27)
# Single-point grid: GridSearchCV is used here only for its 5-fold
# cross-validated score and automatic refit.
params = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [5],
}

# Cross-validation with grid search (5 folds, model selection on macro F1)
grid = GridSearchCV(rf_classifier, params, cv=5, scoring="f1_macro")
grid.fit(train_set, train_target)

# GridSearchCV refits the winning configuration on the full training split by
# default (refit=True), so best_estimator_ is already fitted - no second fit.
best_rf = grid.best_estimator_

validation_pred = best_rf.predict(validation_set)
print("Classification Report for Random Forest:\n", classification_report(validation_target, validation_pred))
evaluationMetrics(validation_target, validation_pred, validation_set, best_rf)

# Predict on the test set
test_predicted = best_rf.predict(test_text_features)

# Save predictions to a CSV file
rf_submission = {'pred': test_predicted, 'id': test_data['id']}
rf_submission_df = pd.DataFrame(rf_submission)
# NOTE(review): to_csv writes the DataFrame index as an extra first column by
# default; pass index=False if the submission format expects only pred/id.
rf_submission_df.to_csv('random_forest_submission.csv')


Classification Report for Random Forest:
               precision    recall  f1-score   support

           1       0.41      0.72      0.52      1192
           2       0.32      0.41      0.36      1192
           3       0.49      0.41      0.45      1172
           4       0.47      0.20      0.28      1154
           5       0.69      0.43      0.53      1128

    accuracy                           0.44      5838
   macro avg       0.47      0.43      0.43      5838
weighted avg       0.47      0.44      0.43      5838

