In [1]:
from codes import open_file
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
d_train = open_file("../Output/d_train.csv")

In [3]:
print(d_train.shape)
d_train.head()

(39329, 9)


Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,y
0,0,-0.672434,0.374798,0.383579,-0.071006,0.38169,-0.330279,-0.020314,6134
1,1,-0.493746,-0.318373,0.506668,-0.187858,-0.253279,0.433971,0.33222,532
2,2,-0.831077,-0.3855,-0.207628,0.140686,0.022722,0.16113,0.267057,1103
3,3,-0.499659,-0.335137,-0.147608,-0.05277,0.283661,-0.245454,0.687561,600
4,4,-0.574673,0.238643,0.432647,-0.060695,-0.605385,-0.085444,-0.219411,4997


In [4]:
d_test = open_file("../Output/d_test.csv")

In [5]:
print(d_test.shape)
d_test.head()

(13449, 8)


Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6
0,0,-0.573731,-0.34722,-0.398643,-0.215752,0.324196,-0.266921,-0.410432
1,1,-0.477602,-0.26615,0.06842,-0.055877,0.269917,0.419759,0.666487
2,2,-0.571817,0.525832,-0.328951,0.081276,-0.267636,-0.458297,-0.006762
3,3,-0.724941,-0.430222,0.017053,-0.08484,0.409823,-0.33742,-0.008575
4,4,-0.419889,0.355977,0.378748,0.277392,-0.413581,-0.33663,0.438416


# Train

In [6]:
X = d_train.drop(columns=['y'])
y = d_train['y']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

# Models

In [8]:
models = {
    "RandomForest": RandomForestRegressor(n_estimators=1000, warm_start=False, min_samples_leaf=2, min_samples_split=4),
    "LinearRegression": LinearRegression(),
    "KNeighbors": KNeighborsRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls'),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "SVR": SVR(C=1.0, epsilon=0.2)
}

# Check Models

In [9]:
for name,model  in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)

Training RandomForest...
Training LinearRegression...
Training KNeighbors...
Training GradientBoostingRegressor...
Training DecisionTreeRegressor...
Training SVR...


In [26]:
for name, model in models.items():
    y_pred_t = model.predict(X_test)
    RMSE = mean_squared_error(y_test, y_pred_t)
    print(f'{name} RMSE:', round(RMSE/100000, 3))

RandomForest RMSE: 161.424
LinearRegression RMSE: 155.21
KNeighbors RMSE: 187.085
GradientBoostingRegressor RMSE: 155.2
DecisionTreeRegressor RMSE: 337.243
SVR RMSE: 178.615


In [11]:
for name, model in models.items():
    print(f'{name}:')
    print(cross_val_score(model, X_train, y_train))

RandomForest:
[-0.04055939 -0.03843596 -0.03698147 -0.04393291 -0.03971352]
LinearRegression:
[-5.23167283e-05 -9.08841023e-05 -1.51155390e-03 -7.51651376e-04
 -5.69145746e-05]
KNeighbors:
[-0.18773663 -0.19734182 -0.19022498 -0.17826057 -0.18893673]
GradientBoostingRegressor:
[-0.00093386 -0.00024981 -0.0012076  -0.00249329  0.00055878]
DecisionTreeRegressor:
[-1.09256412 -1.06453432 -1.16169031 -0.96975549 -1.15146309]
SVR:
[-0.13827458 -0.14808838 -0.15409647 -0.14484365 -0.15263972]


# Select the best models

The best models are the ones with the lowest RMSE:
- GradientBoostingRegressor
- LinearRegression
- RandomForest

# Convert the top models in CSV

In [None]:
result = pd.DataFrame({"id":range(len(y_pred)),"price":y_pred})

In [None]:
print(result.shape)
result.head()

In [None]:
result.to_csv("../Output/submission_1.csv", index=False)