In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
pd.options.display.max_columns = None
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [5]:
df_train = pd.read_csv('soan95train.csv', index_col=None)
df_test = pd.read_csv('soan95test.csv', index_col=None)

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1826 entries, 0 to 1825
Data columns (total 3 columns):
date     1826 non-null object
basin    1826 non-null int64
rain     1826 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 42.9+ KB


In [12]:
X_train = df_train['rain']
y_train = df_train['basin']
X_test = df_test['rain']
y_test = df_test['rain']

In [19]:
regr = RandomForestRegressor(max_depth=30, random_state=0)

In [24]:
param_grid = {"max_depth": [3, None],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["mse"]}

In [25]:
grid_search = GridSearchCV(regr, param_grid=param_grid)

In [26]:
grid_search.fit(np.array(X_train).reshape(-1, 1), y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=30, max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=0,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'bootstrap': [True, False], 'criterion': ['mse'],
                         '

In [27]:
grid_search.best_params_

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [28]:
best_regr = RandomForestRegressor(max_depth=3, bootstrap=True, min_samples_leaf=1, min_samples_split=2, n_estimators=30, random_state=0)

In [29]:
best_regr.fit(np.array(X_train).reshape(-1, 1), y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=30,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [30]:
result = best_regr.predict(np.array(X_test).reshape(-1, 1))

In [31]:
mean_absolute_error(y_test, result)

27.597107511020102

In [32]:
pd.DataFrame(result).to_csv('random_forest.csv', index=False)