In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read 'df_nontemporal.csv' and replace every missing value with 0 in one
# chained expression, so the frame bound to `df` is never mutated in place.
df = pd.read_csv('df_nontemporal.csv').fillna(0)

In [2]:
# Split data
from sklearn.model_selection import train_test_split

# Features: every column from position 4 onward. Target: the 'mood' column.
X = df.iloc[:, 4:]
t = df['mood']

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, t_train, t_test = train_test_split(
    X,
    t,
    test_size=0.25,
    random_state=42,
)

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Baseline model: a 10-tree random forest with a fixed seed, fitted on the
# training split (`fit` returns the estimator itself, so it can be chained).
rf = RandomForestRegressor(n_estimators=10, random_state=42).fit(X_train, t_train)

# Mean squared error of the baseline on the held-out split.
t_pred = rf.predict(X_test)
MSE = mean_squared_error(t_test, t_pred)
print(MSE)

0.047319841634501


In [None]:
# Hyper-parameter grid for the random forest search
# (1 * 4 * 1 * 2 * 2 * 3 = 48 candidates, each evaluated with 3-fold CV).
param_grid = {
    'bootstrap': [True],
    'max_depth': [40, 60, 80, 100],
    # 1.0 = consider all features at every split. This is what 'auto' meant
    # for a regressor; the 'auto' string is rejected by recent scikit-learn
    # releases, whereas the float form works on old and new versions alike.
    'max_features': [1.0],
    'min_samples_leaf': [1, 4],
    'min_samples_split': [2, 4],
    'n_estimators': [200, 500, 1000]
}

# Instantiate the grid search model.
# - random_state is fixed so the selected parameters are reproducible, in line
#   with the seeded train/test split and the seeded baseline forest.
# - scoring is negative MSE so candidates are ranked by the same metric the
#   notebook reports on the test split (the regressor default would be R^2).
rf_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=10,
)

# GridSearchCV expects a 1-D target, hence the flattened NumPy array.
rf_grid.fit(X_train, t_train.values.ravel())
print(rf_grid.best_params_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   42.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   48.8s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   59.7s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  1.4min


In [None]:
# Score the best random forest found by the grid search on the held-out split.
t_pred = rf_grid.best_estimator_.predict(X_test)
MSE = mean_squared_error(
    y_true=t_test.values.ravel(),
    y_pred=t_pred,
)
print(MSE)

In [32]:
# Feature importances of the baseline forest `rf` (not the grid-searched
# model), one row per training column, sorted from most to least important.
feature_importances = pd.DataFrame(
    {'importance': rf.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)
feature_importances

Unnamed: 0,importance
mood_3dayavg,0.120321
appCat.communication_3dayavg,0.119253
screen_3dayavg,0.102896
circumplex.valence_3dayavg,0.096536
appCat.convenience_3dayavg,0.095772
circumplex.arousal_3dayavg,0.092729
appCat.recreation_3dayavg,0.084624
activity_3dayavg,0.075517
appCat.professional_3dayavg,0.06289
sms_3dayavg,0.062495


In [33]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# Simple fit: AdaBoost over default (unrestricted-depth) decision trees.
# random_state is fixed so the fitted model and the MSE reported for it are
# reproducible, in line with the seeded split and the seeded random forest.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; when upgrading, rename it here together with the
# `base_estimator__` grid keys below and the AdaBoost grid-search cell.
AdaB = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
)
AdaB.fit(X_train, t_train.values.ravel())

# Parameter grid for the AdaBoost grid search run in a later cell
# (4 * 5 = 20 candidates). The "base_estimator__" prefix routes a setting to
# the wrapped decision tree rather than to AdaBoost itself.
# This rebinds the name `param_grid`, which the random forest search also uses.
param_grid = {
    "base_estimator__max_depth": [1, 2, 5, None],
    "n_estimators": [1, 2, 10, 20, 50],
}


In [34]:
# Mean squared error of the baseline AdaBoost model on the held-out split.
t_pred = AdaB.predict(X_test)
MSE = mean_squared_error(
    y_true=t_test.values.ravel(),
    y_pred=t_pred,
)
print(MSE)

0.05145121868688249


In [35]:
# Grid search over the AdaBoost settings in `param_grid`; that name must
# currently hold the AdaBoost grid (keys "base_estimator__max_depth" and
# "n_estimators"), not the random forest grid that shares the same name.
# - random_state is fixed so the selected parameters are reproducible.
# - scoring is negative MSE so candidates are ranked by the same metric the
#   notebook reports on the test split (the regressor default would be R^2).
AdaB_grid = GridSearchCV(
    estimator=AdaBoostRegressor(
        base_estimator=DecisionTreeRegressor(),
        random_state=42,
    ),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=5,
)
AdaB_grid.fit(X_train, t_train.values.ravel())
print(AdaB_grid.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of  60 | elapsed:    0.0s remaining:    0.4s


{'base_estimator__max_depth': 2, 'n_estimators': 50}


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.8s finished


In [36]:
# Score the best AdaBoost model found by the grid search on the held-out split.
t_pred = AdaB_grid.best_estimator_.predict(X_test)
MSE = mean_squared_error(
    y_true=t_test.values.ravel(),
    y_pred=t_pred,
)
print(MSE)

0.04886837699116784
