In [22]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [23]:
from scipy import stats
from statsmodels.tsa.stattools import pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression


In [24]:
import sys
sys.path.append("..")

from src.config import *

## Import clean data

In [25]:
# Read data
data_path = os.path.join(DATA_CLEAN_PATH, "ml-curated-data.csv")
dfCurated = pd.read_csv(data_path)
dfCurated.head()

Unnamed: 0,lag_11,lag_7,lag_6,lag_3,lag_2,lag_1,state,gender,age,wage_increase
0,0.024181,-0.053836,-0.023294,-0.087671,0.059876,0.032627,0.015672,0.01909,0.016816,0.075232
1,0.001615,-0.023294,-0.063004,0.059876,0.032627,0.075232,0.015672,0.01909,0.016816,-0.021322
2,0.002881,-0.063004,0.131306,0.032627,0.075232,-0.021322,0.015672,0.01909,0.016816,-0.023162
3,0.093041,0.131306,-0.087671,0.075232,-0.021322,-0.023162,0.015672,0.01909,0.016816,-0.028393
4,-0.053836,-0.087671,0.059876,-0.021322,-0.023162,-0.028393,0.015672,0.01909,0.016816,0.028896


In [26]:
target_col = "wage_increase"
features = [c for c in dfCurated.columns if c != target_col]

train = dfCurated.sample(frac=0.7)
test = dfCurated.drop(train.index)

In [27]:
train_x = train.drop(target_col, 1)
train_y = train.drop(features, 1)

test_x = test.drop(target_col, 1)
test_y = test.drop(features, 1)

In [28]:
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
regr.fit(train_x, np.ravel(train_y)) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [29]:
estimates = regr.predict(train_x)
error = np.asmatrix(train_y.values - estimates)
sme = (error.T * error / len(error)).tolist()[0][0]
sme

0.00962477555145811

In [30]:
np.sqrt(sme)

0.09810594044938416

In [39]:
def get_random_params():
    return {
        "n_estimators": random.choice((range(10, 100))),
        "criterion": random.choice(["mse", "mae"]),
        "max_depth": random.choice(list(range(1, 10)) + [None]),
        "random_state": random.choice((range(10, 100)))
    }

param = get_random_params()
param


{'criterion': 'mse', 'max_depth': 2, 'n_estimators': 71, 'random_state': 71}

In [40]:
def get_rsme(df, param, target_col, features):
    train = df.sample(frac=0.7)
    test = df.drop(train.index)
    train_x = train.drop(target_col, 1)
    train_y = train.drop(features, 1)
    test_x = test.drop(target_col, 1)
    test_y = test.drop(features, 1)
    model= RandomForestRegressor(**param)
    model.fit(train_x, np.ravel(train_y))
    estimates = model.predict(train_x)
    error = np.asmatrix(train_y.values - estimates)
    sme = (error.T * error / len(error)).tolist()[0][0]
    return np.sqrt(sme)

In [41]:
get_rsme(dfCurated, param, target_col="wage_increase", features=[c for c in dfCurated.columns if c != "wage_increase"])

0.09537525672600237

In [None]:
result = []
for i in range(1000):
    param = get_random_params()
    rsme = get_rsme(dfCurated, param, target_col="wage_increase", features=[c for c in dfCurated.columns if c != "wage_increase"])
    param["rsme"] = rsme
    result.append(param)

In [53]:
result_df = pd.DataFrame(result)
result_df.head()

Unnamed: 0,criterion,max_depth,n_estimators,random_state,rsme
0,mae,9.0,27,48,0.161175
1,mae,3.0,93,17,0.094977
2,mae,3.0,79,15,0.111785
3,mse,,51,63,0.108328
4,mae,6.0,36,35,0.125122


In [None]:
output_path = os.path.join(DATA_CLEAN_PATH, "param_random_forest.csv")
result.to_csv(output_path)