In [24]:
import numpy as np
import pandas as pd
import collections
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

import warnings
# NOTE: with no category/module arguments this installs an "ignore" filter for
# every warning, so library notices (deprecations, data-conversion hints, ...)
# raised by any later cell are silenced as well.
warnings.filterwarnings('ignore')

In [29]:
# Split row-wise records into regression inputs and targets
def create_dataset(data):
    """Separate each row of ``data`` into features and a one-element target.

    Args:
        data: An indexable, sized collection of rows (e.g. a list of lists or
            a 2-D array). The last entry of every row is taken as the target;
            everything before it is the feature vector.

    Returns:
        A ``(features, targets)`` tuple of NumPy arrays. ``features`` holds
        ``row[:-1]`` for every row and ``targets`` holds ``[row[-1]]`` for
        every row, so for rectangular input the target array is a column.
    """
    row_ids = range(len(data))
    features = [data[idx][:-1] for idx in row_ids]
    # Each target is wrapped in its own list so the result stays 2-D.
    targets = [[data[idx][-1]] for idx in row_ids]
    return np.array(features), np.array(targets)

# Mean absolute error expressed as a percentage of the mean true value
def mean_absolute_percentage_error(data_true, data_predict):
    """Return the mean absolute error as a percentage of the mean of ``data_true``.

    Despite the name, each absolute error is divided by the *average* of
    ``data_true`` rather than by the matching true value, i.e. the result is
    ``100 * mean(|true - predicted|) / mean(true)``.

    Args:
        data_true: Sized, indexable collection of observed values.
        data_predict: Indexable collection of predictions; entry ``i`` is
            compared with ``data_true[i]``.

    Returns:
        The percentage error. Elements are combined one index at a time, so
        when ``data_true`` is an ``(n, 1)`` array and ``data_predict`` is
        ``(n,)`` the result is a length-1 array rather than a plain float.
    """
    n_samples = len(data_true)
    true_mean = sum(data_true) / n_samples

    # Pair values by position; this deliberately avoids whole-array
    # subtraction, which would broadcast (n, 1) against (n,).
    total = sum(
        np.abs((data_true[idx] - data_predict[idx]) / true_mean)
        for idx in range(n_samples)
    )

    return (total / n_samples) * 100

def random_forest(X, Y, n_splits=3, n_estimators=10, max_depth=10, random_state=0):
    """Cross-validate a random-forest regressor and return its average error.

    The data is split with a shuffled ``KFold``; for every fold a fresh
    ``RandomForestRegressor`` is fitted on the training part and scored on the
    held-out part with ``mean_absolute_percentage_error``.

    Args:
        X: Feature array, indexable by an integer index array along axis 0.
        Y: Target array aligned with ``X`` (a 1-D vector or an ``(n, 1)``
            column both work).
        n_splits: Number of cross-validation folds.
        n_estimators: Number of trees in each forest.
        max_depth: Maximum depth of each tree.
        random_state: Seed used for both the fold shuffling and the forest,
            so repeated calls give the same score.

    Returns:
        The mean of the per-fold scores as computed by ``np.mean``.
    """
    cvscores = []
    # k-fold cross validation
    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train_index, test_index in k_fold.split(X, Y):
        X_train, y_train = X[train_index], Y[train_index]
        X_test, y_test = X[test_index], Y[test_index]

        # scikit-learn expects a 1-D target for single-output regression and
        # emits a DataConversionWarning for an (n, 1) column. Flatten only
        # that case (for fitting only) so multi-output targets are untouched.
        y_shape = np.shape(y_train)
        if len(y_shape) == 2 and y_shape[1] == 1:
            y_fit = np.ravel(y_train)
        else:
            y_fit = y_train

        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
        ).fit(X_train, y_fit)
        rf_prediction = rf_model.predict(X_test)

        # y_test keeps its original shape so the metric sees the same inputs
        # as before.
        score = mean_absolute_percentage_error(y_test, rf_prediction)
        cvscores.append(score)

    return np.mean(cvscores)

def train_model(dataset):
    """Score how well each column can be predicted from all the others.

    Every column of ``dataset`` takes a turn as the regression target while
    the remaining columns serve as features. Features and target are min-max
    scaled to ``[0, 1]`` (fitted on all rows passed in) and then handed to
    ``random_forest``.

    Args:
        dataset: A pandas DataFrame of numeric columns.

    Returns:
        A list with one ``random_forest`` score per column, in the order of
        ``dataset.columns``.
    """
    scores = []

    for target_name in dataset.columns:
        # Boolean mask over the columns: True marks the current target.
        is_target = dataset.columns == target_name
        features = dataset.loc[:, ~is_target].values
        target = dataset.loc[:, is_target].values

        # The same scaler object is refit twice: first on the features, then
        # on the target column.
        scaler = MinMaxScaler(feature_range=(0, 1))
        features = np.array(scaler.fit_transform(features))
        target = np.array(scaler.fit_transform(target))

        scores.append(random_forest(features, target))

    return scores

In [30]:
# Load the cleaned data, keep only the sunny-day rows, and drop the
# columns that are not model inputs (timestamp and the weather label itself).
lower_data = (
    pd.read_csv('lower_data_clean.csv')
    .loc[lambda frame: frame['weather'] == 'sunny']
    .drop(columns=['date_time', 'weather'])
)

# Train the model: one cross-validated score per remaining column
train_model(lower_data)

[6.75301205902971,
 9.998440955253999,
 7.646446445367009,
 6.584643020507183,
 6.0584711897121295,
 6.981178936238133,
 4.87299028012874,
 5.978809765957696,
 8.84871424754048,
 7.6259806943727915]