In [None]:
from itertools import product

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Read the cleaned dataset and discard its 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv call -- TODO confirm).
df = pd.read_csv('cleaned_data.csv').drop(columns='Unnamed: 0')

In [None]:
# approach 1
# split into train and test
# based on user: the first 80% of each user's rows (in their current order) go
# to train, the remaining rows go to test
users = df["id"].unique()

# Collect the per-user slices and concatenate once at the end. Calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
train_parts = []
test_parts = []

# `user_id` rather than `id`, so the builtin id() is not shadowed for later cells.
for user_id in users:
    user_data = df[df["id"] == user_id]
    n = len(user_data)
    # NOTE: with rounding, users that have only 1 or 2 rows contribute no test
    # rows (round(0.8) == 1, round(1.6) == 2).
    split_index = int(np.round(n*0.8))
    train_parts.append(user_data.iloc[:split_index])
    test_parts.append(user_data.iloc[split_index:])

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# that still carries df's columns when there are no users at all.
train_data = pd.concat(train_parts) if train_parts else df.iloc[0:0]
test_data = pd.concat(test_parts) if test_parts else df.iloc[0:0]

In [None]:
# approach 2
# shuffled 80/20 split over all rows, stratified on the 'Complain' column
train_set, test_set = train_test_split(
    df,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    stratify=df['Complain'],
    random_state=42,
)

In [None]:
# class labels used for classification: the integers 1 through 10
classes = list(range(1, 11))

MODELING

In [None]:
# random forest classification
# 100 trees, fixed seed so repeated runs build the same forest

rfc = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rfc.fit(x_train, y_train)

In [None]:
# Predict with the classifier `rfc` for every row of x_test.
# NOTE(review): x_test (like x_train / y_train / y_test) is not defined in any
# cell visible here -- confirm where the feature/target split is created.
predictions = rfc.predict(x_test)

In [None]:
# Calculate evaluation metrics for the classifier predictions.
# Precision, recall and F1 use the "weighted" average across classes.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average="weighted")
recall = recall_score(y_test, predictions, average="weighted")
f1 = f1_score(y_test, predictions, average="weighted")

# labels=classes pins the order (and number) of the per-class matrices, so
# confusion_mat[i] always belongs to classes[i] even when some class is missing
# from y_test / predictions.
confusion_mat = multilabel_confusion_matrix(y_test, predictions, labels=classes)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

for i, cls in enumerate(classes):
    # The class labels are ints; formatting them avoids the TypeError that
    # str + int concatenation raises.
    print(f'Confusion matrix of class {cls}: ')
    print(confusion_mat[i])

In [None]:
# random forest regression
# 100 trees, fixed seed so repeated runs build the same forest

rfr = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)
rfr.fit(x_train, y_train)

In [None]:
# Predict with the regressor `rfr` for every row of x_test.
# NOTE: this rebinds `predictions`, replacing the classifier's predictions from
# the earlier cell -- re-run the matching predict cell before re-evaluating.
predictions = rfr.predict(x_test)

In [None]:
# Regression error of the current `predictions` against y_test:
# mean squared error and mean absolute error.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)

print('MSE: ', mse)
print('MAE: ', mae)

Hyperparameter search for the random forest models

In [None]:
# Hyperparameter grid searched for both the classifier and the regressor.
n_estimators = [10, 50, 100, 200]
max_features = ['sqrt', 'log2']
max_depth = [None, 5, 10, 20]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]


def _build_forest(task, params):
    """Return an unfitted random forest for `task`, configured with `params`.

    task   -- 'classification' builds a RandomForestClassifier, anything else
              a RandomForestRegressor.
    params -- sequence (n_estimators, max_features, max_depth,
              min_samples_split, min_samples_leaf).
    """
    forest_cls = RandomForestClassifier if task == 'classification' else RandomForestRegressor
    return forest_cls(
        n_estimators=params[0],
        max_features=params[1],
        max_depth=params[2],
        min_samples_split=params[3],
        min_samples_leaf=params[4],
        random_state=42,
    )


# `task` rather than `type`, so the builtin type() is not shadowed for later cells.
for task in ['classification', 'regression']:
    # Start every task from a clean slate; otherwise the regression search is
    # compared against the loss (and can inherit the parameters) left over
    # from the classification search.
    best_loss = np.inf
    best_params = []

    # product() walks the grid in the same order as the equivalent nested
    # loops (the last list varies fastest).
    for params in product(n_estimators, max_features, max_depth, min_samples_split, min_samples_leaf):
        rf = _build_forest(task, params)
        rf.fit(x_train_train, y_train_train)
        predictions = rf.predict(x_train_test)

        if task == 'classification':
            # F1 is a score (higher is better); turn it into a loss so both
            # tasks can share the "smaller is better" comparison below.
            loss = 1.0 - f1_score(y_train_test, predictions, average="weighted")
        else:
            loss = mean_squared_error(y_train_test, predictions)

        if loss < best_loss:
            best_loss = loss
            best_params = list(params)

    print(best_params)

    # Refit the best configuration on x_train / y_train and evaluate THAT
    # model on x_test (not the untuned `rfc` from the earlier cell).
    rf = _build_forest(task, best_params)
    rf.fit(x_train, y_train)
    predictions = rf.predict(x_test)

    if task == 'classification':
        accuracy = accuracy_score(y_test, predictions)
        precision = precision_score(y_test, predictions, average="weighted")
        recall = recall_score(y_test, predictions, average="weighted")
        f1 = f1_score(y_test, predictions, average="weighted")

        # labels=classes keeps confusion_mat[i] aligned with classes[i].
        confusion_mat = multilabel_confusion_matrix(y_test, predictions, labels=classes)

        print('Accuracy:', accuracy)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1-score:', f1)

        for i, cls in enumerate(classes):
            # Class labels are ints; str + int concatenation would raise TypeError.
            print(f'Confusion matrix of class {cls}: ')
            print(confusion_mat[i])
    else:
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)

        print('MSE: ', mse)
        print('MAE: ', mae)