## Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Load preprocessed data

In [None]:
# Read both preprocessed splits, using each file's 'id' column as the index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('./preprocessed/train.csv', './preprocessed/test.csv')
)

# Row-wise concatenation of the two splits, train rows first.
data = pd.concat([train, test])

## Train X-y split

In [None]:
# Detach the 'target' column from `train` in place: the removed column
# becomes `y_train`, and `train` keeps only the remaining columns.
y_train = train.pop('target')

## Model selection

In [None]:
# import models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# import utilities for model evaluation
from sklearn.metrics import mean_squared_error as mse 
from sklearn.model_selection import KFold

In [None]:
# define error function
def error(Y_real, Y_pred):
    """Return the evaluation error between true and predicted targets.

    Delegates to ``mse`` (imported above as an alias of
    ``sklearn.metrics.mean_squared_error``), so swapping the metric used by
    the whole notebook only requires changing this one function.

    Args:
        Y_real: True target values.
        Y_pred: Predicted target values, in the same order as ``Y_real``.

    Returns:
        Whatever ``mse(Y_real, Y_pred)`` returns.
    """
    score = mse(Y_real, Y_pred)
    return score

In [None]:
# Candidate regressors to compare, each constructed with its default arguments.
models = [
    XGBRegressor(),
    RandomForestRegressor()
]

# K-Fold splitter
kfold = KFold(n_splits=10)
# Take the fold count from the splitter itself so the averaging below cannot
# drift out of sync with the n_splits value above.
n_folds = kfold.get_n_splits()

# Running sum of the per-fold errors, keyed by model class name.
log_dict = {}

for train_ind, test_ind in kfold.split(train, y_train):
    # Positional indices from the splitter select this fold's rows.
    X_train, Y_train = train.iloc[train_ind], y_train.iloc[train_ind]
    X_test, Y_test = train.iloc[test_ind], y_train.iloc[test_ind]

    for model in models:
        name = model.__class__.__name__

        # The same estimator object is fitted again on every fold.
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        err = error(Y_test, Y_pred)

        # Accumulate; the first fold for a given model starts from 0.
        log_dict[name] = log_dict.get(name, 0) + err

# Model evaluation DataFrame: one row per model with its mean error over the
# folds. It is built in a single constructor call because DataFrame.append
# was removed in pandas 2.0.
log_df = pd.DataFrame(
    [
        {'Model': name, 'Error': total / n_folds}
        for name, total in log_dict.items()
    ],
    columns=['Model', 'Error'],
)

log_df

## Prediction with the best model

In [None]:
# Fit a fresh XGBRegressor on the full training set; fit() returns the
# estimator itself, so construction and training can be chained.
best = XGBRegressor().fit(train, y_train)

# Predict targets for the test rows with the fitted model.
predictions = best.predict(test)

In [None]:
# Line plot of the fitted model's importance score against each feature column.
feature_names = train.columns
importances = best.feature_importances_
plt.plot(feature_names, importances)

## Export results

In [None]:
# Start from the sample submission (indexed by 'Id') and set its 'target'
# column to the model predictions.
# NOTE(review): predictions are assigned by position, so this assumes the
# sample submission rows are in the same order as `test` - confirm.
submission_df = pd.read_csv(
    './data/sample_submission.csv', index_col='Id'
).assign(target=predictions)

# Write the completed submission, including the index column, to disk.
submission_df.to_csv('./result/result.csv')