In [1]:
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the three CSV inputs: the training rows (which carry the target), the
# rows to score for the submission, and the sample submission file.
train, predict, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/diamonds_train.csv',
        'data/diamonds_test.csv',
        'data/sample_submission.csv',
    )
)

In [3]:
# Column the model is trained to predict.
TARGET = 'price'

# Input columns, grouped by how they are preprocessed.
NUM_FEATURES = ['carat', 'depth', 'table', 'x', 'y', 'z']
CAT_FEATURES = ['cut', 'color', 'clarity']

# Full model input: numeric columns first, then the categorical ones.
FEATURES = [*NUM_FEATURES, *CAT_FEATURES]

In [4]:
# Numeric columns: fill missing values with the column median, then
# standardise. The step names ('imputer', 'scaler') are part of the parameter
# paths used by the hyper-parameter search, so they must not be renamed.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [5]:
# Categorical columns: replace missing values with the literal 'missing'
# category, then one-hot encode. handle_unknown='ignore' means a category not
# seen during fit is encoded as all zeros instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [6]:
# Send each column group through its own sub-pipeline. Columns that appear in
# neither group are dropped (ColumnTransformer's default remainder='drop').
column_routes = [
    ('num', numeric_transformer, NUM_FEATURES),
    ('cat', categorical_transformer, CAT_FEATURES),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [7]:
# Sanity check: fit the preprocessor on the whole training frame and inspect
# the resulting matrix (numeric columns first, then the one-hot columns).
train_preprocessed = preprocessor.fit_transform(train)
pd.DataFrame(data=train_preprocessed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.140380,0.661559,-0.199745,1.218927,1.140014,1.280887,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
40451,2.570338,-3.249854,1.143433,2.295019,2.195276,1.711271,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
40452,0.446430,0.661559,-0.647472,0.569714,0.599302,0.678351,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
40453,-0.983529,0.102785,-1.408606,-1.137805,-1.101325,-1.114913,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out part of the labelled data (train_test_split's default test share)
# for a quick evaluation. The fixed random_state makes the split, and every
# number computed from it, identical on each Restart & Run All.
diamonds_train, diamonds_test = train_test_split(train, random_state=42)

In [10]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# End-to-end model: preprocessing followed by a random forest. The step names
# ('preprocessor', 'regressor') are the prefixes used to address parameters
# during hyper-parameter search, so they must stay as they are.
# random_state pins the forest's bootstrap/feature sampling so that fits and
# cross-validation scores are reproducible between runs.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

In [11]:
# Fit the whole pipeline (preprocessing + regressor) on the training part of
# the split; the trailing semicolon hides the estimator repr.
model.fit(
    X=diamonds_train[FEATURES],
    y=diamonds_train[TARGET],
);

In [12]:
from sklearn.metrics import mean_squared_error

In [13]:
# Predictions for the hold-out rows and for the rows the model was fitted on.
# NOTE: despite their names, y_test / y_train hold *predicted* prices, not the
# true labels (names kept for compatibility with any later cells).
y_test = model.predict(diamonds_test[FEATURES])
y_train = model.predict(diamonds_train[FEATURES])

# Score both sets with RMSE, the same metric used for cross-validation and the
# hyper-parameter search. Taking the square root of the MSE by hand avoids the
# `squared=` argument, which newer scikit-learn releases no longer accept.
# A train error far below the test error is a sign of overfitting.
pd.Series(
    {
        'train': mean_squared_error(diamonds_train[TARGET], y_train) ** 0.5,
        'test': mean_squared_error(diamonds_test[TARGET], y_test) ** 0.5,
    },
    name='rmse',
)

In [14]:
from sklearn.model_selection import cross_val_score

In [15]:
# 5-fold cross-validation of the full pipeline on all labelled rows, using all
# available cores. The scorer is the *negated* RMSE, so values are <= 0 and
# closer to zero is better.
scores = cross_val_score(
    estimator=model,
    X=train[FEATURES],
    y=train[TARGET],
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [16]:
from sklearn.model_selection import RandomizedSearchCV

In [17]:
# Search space, addressed as <pipeline step>__<sub-step>__<parameter>.
# 2 x 6 x 4 = 48 possible combinations in total.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
}

# Randomized search: evaluates n_iter=32 of the 48 combinations with 5-fold CV
# (160 fits), scoring by negated RMSE. random_state fixes *which* candidates
# are sampled, so the selected model is reproducible between runs. The variable
# keeps its historical name `grid_search` because later cells refer to it.
grid_search = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=32,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=10,
    random_state=42,
)

# With the default refit=True the best candidate is refitted on all rows, so
# `grid_search` can be used directly for prediction afterwards.
grid_search.fit(train[FEATURES], train[TARGET]);

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [18]:
# Predict prices for the submission rows with the estimator selected (and
# refitted) by the search.
submission_features = predict[FEATURES]
y_pred = grid_search.predict(submission_features)

In [19]:
# Assemble the submission table: the id of each row to score next to its
# predicted price.
submission = pd.DataFrame(
    data={
        'id': predict['id'],
        'price': y_pred,
    }
)

In [20]:
# Clamp predicted prices into [0, 20000]. The clipped column is assigned back
# explicitly: calling clip(inplace=True) on the `submission.price` accessor is
# a chained in-place operation, which emits a warning on pandas 2.x and leaves
# `submission` unchanged under Copy-on-Write (the default from pandas 3).
# Plain assignment is also idempotent when the cell is re-run.
submission['price'] = submission['price'].clip(lower=0, upper=20000)

In [21]:
# Display the final submission table (pandas abbreviates long frames when
# rendering, so only the first and last rows are shown).
submission

Unnamed: 0,id,price
0,0,2974.380004
1,1,5369.444203
2,2,9104.016907
3,3,4104.870986
4,4,1705.985291
...,...,...
13480,13480,1890.513447
13481,13481,2458.620514
13482,13482,3038.628816
13483,13483,2184.869711


In [22]:
# Write the submission file. The output directory is created first so the cell
# also works on a fresh checkout where 'submissions/' does not exist yet
# (to_csv does not create missing parent directories).
submission_path = Path('submissions/submission_random_forest.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)