In [1]:
from itertools import product

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Read the cleaned feature table and discard its 'Unnamed: 0' column
# (presumably the index written out by an earlier to_csv call - verify).
df = pd.read_csv('feature_nans_removed.csv').drop(columns='Unnamed: 0')

In [3]:
# Order the records by 'date'; the per-user split further down slices rows by position.
df = df.sort_values("date")

In [4]:
# Replace 'date' with the number of whole seconds elapsed since 1970-01-01.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = (parsed_dates - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")

In [5]:
# Remove every row whose 'activity' value is missing.
missing_activity = df['activity'].isna()
df = df.drop(index=df.index[missing_activity])

In [6]:
# Remove every row whose 'circumplex.valence' value is missing.
missing_valence = df['circumplex.valence'].isna()
df = df.drop(index=df.index[missing_valence])

In [7]:
# Bucket 'mood' into one equal-width interval per class label, then one-hot
# encode the resulting 'mood class' column and append the indicator columns.
classes = ['Very sad', 'Sad', 'Neutral', 'Happy', 'Very happy']
df['mood class'] = pd.cut(df['mood'], bins=len(classes), labels=classes)
one_hot_encoded = pd.get_dummies(df['mood class'])
df = pd.concat([df, one_hot_encoded], axis='columns')
# 'mood' and 'mood class' are kept here on purpose: the regression target needs
# 'mood', and both columns are dropped later when the feature matrices are built.
#df.drop(['mood', 'mood class'], axis=1, inplace=True)

In [8]:
# approach 1
# Split into train and test per user: the first 80% of each user's records
# (in the row order produced by the earlier sort on 'date') go to the training
# set, the remaining 20% go to the test set.
users = df["id"].unique()

# Collect the per-user slices in lists and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all accumulated
# rows on every iteration, and concatenating onto an empty DataFrame is
# deprecated in recent pandas versions.
train_parts = []
test_parts = []

# `user_id` instead of `id`: the loop variable outlives the loop in a
# notebook and would otherwise shadow the builtin id() for every later cell.
for user_id in users:
    user_data = df[df["id"] == user_id]
    # Position of the first test row for this user (80% of the rows, rounded).
    split_index = int(np.round(len(user_data) * 0.8))
    train_parts.append(user_data.iloc[:split_index])
    test_parts.append(user_data.iloc[split_index:])

# pd.concat raises on an empty list, so fall back to an empty frame when
# there are no users at all (the result the original loop produced).
train_data = pd.concat(train_parts) if train_parts else pd.DataFrame()
test_data = pd.concat(test_parts) if test_parts else pd.DataFrame()

In [9]:
# Use pandas get_dummies to one-hot encode the user IDs.
#
# The indicator columns are defined by the training set and the test set is
# aligned to them. Encoding both sets independently yields different column
# sets whenever a user occurs in only one of them (e.g. a user with a single
# record ends up entirely in the training set), and the fitted models reject
# test data whose feature columns differ from the training data.
one_hot_encoded_train = pd.get_dummies(train_data['id'], prefix='user_id')
one_hot_encoded_test = (
    pd.get_dummies(test_data['id'], prefix='user_id')
    # Same columns, same order as the training set: indicators missing from
    # the test set are added as all-zero columns, and indicators for users
    # that never appear in the training set are dropped.
    .reindex(columns=one_hot_encoded_train.columns, fill_value=0)
)

# Append the indicator columns and remove the raw 'id' column they replace.
train_data = pd.concat([train_data, one_hot_encoded_train], axis=1)
train_data = train_data.drop("id", axis=1)

test_data = pd.concat([test_data, one_hot_encoded_test], axis=1)
test_data = test_data.drop("id", axis=1)

In [10]:
# Show the column layout of the training frame after both one-hot encodings.
train_data.columns

Index(['date', 'mood', 'circumplex.arousal', 'circumplex.valence', 'activity',
       'screen', 'appCat.builtin', 'appCat.communication',
       'appCat.entertainment', 'appCat.game', 'appCat.office', 'appCat.other',
       'appCat.social', 'mood class', 'Very sad', 'Sad', 'Neutral', 'Happy',
       'Very happy', 'user_id_AS14.01', 'user_id_AS14.02', 'user_id_AS14.03',
       'user_id_AS14.05', 'user_id_AS14.06', 'user_id_AS14.07',
       'user_id_AS14.08', 'user_id_AS14.09', 'user_id_AS14.12',
       'user_id_AS14.13', 'user_id_AS14.14', 'user_id_AS14.15',
       'user_id_AS14.16', 'user_id_AS14.17', 'user_id_AS14.19',
       'user_id_AS14.20', 'user_id_AS14.23', 'user_id_AS14.24',
       'user_id_AS14.25', 'user_id_AS14.26', 'user_id_AS14.27',
       'user_id_AS14.28', 'user_id_AS14.29', 'user_id_AS14.30',
       'user_id_AS14.31', 'user_id_AS14.32', 'user_id_AS14.33'],
      dtype='object')

In [11]:
# data sets for classification task
# Targets are the one-hot mood-class columns; features are every column except
# those targets, the raw 'mood' value and the 'mood class' label.
non_feature_columns = classes + ['mood', 'mood class']

x_train_classification = train_data.drop(columns=non_feature_columns)
y_train_classification = train_data[classes]

x_test_classification = test_data.drop(columns=non_feature_columns)
y_test_classification = test_data[classes]

In [12]:
# data sets for regression task
# The target is the raw 'mood' value; features are every column except 'mood',
# the 'mood class' label and the one-hot mood-class columns.
non_feature_columns = classes + ['mood', 'mood class']

x_train_regression = train_data.drop(columns=non_feature_columns)
y_train_regression = train_data['mood']

x_test_regression = test_data.drop(columns=non_feature_columns)
y_test_regression = test_data['mood']

In [25]:
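# approach 2 (not used): random shuffled 80/20 split with train_test_split instead of the per-user split above
# NOTE: 'Complain' is not a column of this data set, so the line below needs a different stratify column before it can be enabled
# train_set, test_set = train_test_split(df, test_size=0.2, train_size=0.8, random_state=42, shuffle=True, stratify=df['Complain'])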

MODELING

In [14]:
# Random forest classification: 100 trees with a fixed seed so reruns give the
# same model. The targets are the one-hot mood-class columns.

rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X=x_train_classification, y=y_train_classification)

RandomForestClassifier(random_state=42)

In [15]:
# Predict the mood-class indicator columns for the test rows.
predictions = rfc.predict(X=x_test_classification)

In [16]:
# Calculate evaluation metrics for the classifier predictions.
#
# zero_division=0 is passed explicitly: some classes have no true and/or no
# predicted samples in the test set, which makes their precision/recall/F1
# undefined. The default setting ('warn') also scores those classes as 0 but
# emits an UndefinedMetricWarning for every metric call.
accuracy = accuracy_score(y_test_classification, predictions)
precision = precision_score(y_test_classification, predictions, average="weighted", zero_division=0)
recall = recall_score(y_test_classification, predictions, average="weighted", zero_division=0)
f1 = f1_score(y_test_classification, predictions, average="weighted", zero_division=0)

# One 2x2 matrix per target column, in the column order of
# y_test_classification (i.e. the order of `classes`).
confusion_mat = multilabel_confusion_matrix(y_test_classification, predictions)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

for class_index, class_name in enumerate(classes):
    print('Confusion matrix of class ' + class_name + ": ")
    print(confusion_mat[class_index])

Accuracy: 0.7248908296943232
Precision: 0.7104042174212826
Recall: 0.7248908296943232
F1-score: 0.7155256387922728
Confusion matrix of class Very sad: 
[[229   0]
 [  0   0]]
Confusion matrix of class Sad: 
[[221   0]
 [  8   0]]
Confusion matrix of class Neutral: 
[[136  19]
 [ 29  45]]
Confusion matrix of class Happy: 
[[ 59  31]
 [ 18 121]]
Confusion matrix of class Very happy: 
[[221   0]
 [  8   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(


In [17]:
# Random forest regression: 100 trees with a fixed seed so reruns give the
# same model. The target is the raw 'mood' value.

rfr = RandomForestRegressor(random_state=42, n_estimators=100)
rfr.fit(X=x_train_regression, y=y_train_regression)

RandomForestRegressor(random_state=42)

In [18]:
# Predict the 'mood' value for the test rows.
predictions = rfr.predict(X=x_test_regression)

In [19]:
# Score the regression predictions against the true 'mood' values.
mse = mean_squared_error(y_true=y_test_regression, y_pred=predictions)
mae = mean_absolute_error(y_true=y_test_regression, y_pred=predictions)

for label, value in (('MSE: ', mse), ('MAE: ', mae)):
    print(label, value)

MSE:  0.2041379270984958
MAE:  0.352664483260553


Hyperparameter search for the random forest models

In [None]:
# Exhaustive grid search over random forest hyperparameters, run once for the
# classification task and once for the regression task. For each task the best
# configuration is printed, refitted and evaluated.
#
# NOTE(review): candidates are scored on the test split, so the metrics printed
# for the winning configuration are optimistically biased. Selecting on a
# separate validation split (or with cross-validation on the training data)
# would avoid this.
n_estimators = [10, 50, 100, 200]
max_features = ['sqrt', 'log2']
max_depth = [None, 5, 10, 20]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

# `task` instead of `type`: the loop variable outlives the loop in a notebook
# and would otherwise shadow the builtin type() for every later cell.
for task in ['classification', 'regression']:
    if task == 'classification':
        model_class = RandomForestClassifier
        x_train = x_train_classification
        y_train = y_train_classification
        x_test = x_test_classification
        y_test = y_test_classification
    else:
        model_class = RandomForestRegressor
        x_train = x_train_regression
        y_train = y_train_regression
        x_test = x_test_regression
        y_test = y_test_regression

    # Start every task from scratch, so the regression search is not compared
    # against the best value left over from the classification search.
    best_loss = np.inf
    best_params = []

    # product() iterates in the same order as five nested loops would, with
    # n_estimators outermost and min_samples_leaf innermost.
    for n_estimator, max_feature, max_d, min_samp_s, min_samp_l in product(
            n_estimators, max_features, max_depth, min_samples_split, min_samples_leaf):
        rf = model_class(n_estimators=n_estimator, max_features=max_feature, max_depth=max_d,
                         min_samples_split=min_samp_s, min_samples_leaf=min_samp_l, random_state=42)
        rf.fit(x_train, y_train)
        predictions = rf.predict(x_test)

        if task == 'classification':
            # F1 is a score (higher is better): negate it so the shared
            # "smaller loss wins" comparison below keeps the highest F1.
            # zero_division=0 keeps the default value for undefined scores
            # without emitting a warning for every candidate.
            loss = -f1_score(y_test, predictions, average="weighted", zero_division=0)
        else:
            loss = mean_squared_error(y_test, predictions)

        if loss < best_loss:
            best_loss = loss
            best_params = [n_estimator, max_feature, max_d, min_samp_s, min_samp_l]

    print(best_params)

    # Refit the winning configuration and evaluate that model itself (not the
    # baseline `rfc` fitted earlier in the notebook).
    rf = model_class(n_estimators=best_params[0], max_features=best_params[1], max_depth=best_params[2],
                     min_samples_split=best_params[3], min_samples_leaf=best_params[4], random_state=42)
    rf.fit(x_train, y_train)
    predictions = rf.predict(x_test)

    if task == 'classification':
        accuracy = accuracy_score(y_test, predictions)
        precision = precision_score(y_test, predictions, average="weighted", zero_division=0)
        recall = recall_score(y_test, predictions, average="weighted", zero_division=0)
        f1 = f1_score(y_test, predictions, average="weighted", zero_division=0)

        # One 2x2 matrix per target column, in the order of `classes`.
        confusion_mat = multilabel_confusion_matrix(y_test, predictions)

        print('Accuracy:', accuracy)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1-score:', f1)

        for class_index, class_name in enumerate(classes):
            print('Confusion matrix of class ' + class_name + ": ")
            print(confusion_mat[class_index])
    else:
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)

        print('MSE: ', mse)
        print('MAE: ', mae)