## Imports

In [1]:
import pandas as pd
import numpy as np


#visualization
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBaclend.figure_format = 'svg'


#preprocessing tools

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer # fills in missing values (used below with 'mean' / 'constant' strategies)
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder # OneHotEncoder is the pipeline counterpart of pd.get_dummies


## Loading data


In [2]:
# `diamonds` is the table the model is trained and validated on;
# `diamonds_predict` holds the rows that are scored for the submission.
diamonds = pd.read_csv(filepath_or_buffer='./data/diamonds_train.csv.zip')
diamonds_predict = pd.read_csv(filepath_or_buffer='./data/diamonds_predict.csv')

In [3]:
# First rows, transposed so that every column is visible as its own line.
diamonds.head().transpose()

Unnamed: 0,0,1,2,3,4
carat,1.21,0.32,0.71,0.41,1.02
cut,Premium,Very Good,Fair,Good,Ideal
color,J,H,G,D,G
clarity,VS2,VS2,VS1,SI1,SI1
depth,62.4,63,65.5,63.8,60.5
table,58,57,55,56,59
price,4268,505,2686,738,4882
x,6.83,4.35,5.62,4.68,6.55
y,6.79,4.38,5.53,4.72,6.51
z,4.25,2.75,3.65,3,3.95


## Pipeline 

In [4]:
# Column roles used by every later cell.
TARGET = 'price'  # the value to predict

CAT_FEATS = ['cut', 'color', 'clarity']                 # text-valued columns
NUM_FEATS = ['carat', 'depth', 'table', 'x', 'y', 'z']  # numeric columns
FEATS = [*NUM_FEATS, *CAT_FEATS]                        # numeric first, then categorical


In [5]:
# Transformer for numerical values.
#
# How to read this: a Pipeline receives a list of (name, transformer) pairs and
# applies them in order. The names ('imputer', 'scaler') are labels that allow a
# step's settings to be addressed later, e.g. `imputer__strategy`.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # replace missing values with the column mean
        ('scaler', StandardScaler()),                 # rescale to zero mean and unit variance
    ]
)


In [6]:
# Transformer for categorical values: fill gaps with an explicit 'missing'
# label, then one-hot encode every category into its own 0/1 column.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        # handle_unknown='ignore': a category that was not seen while fitting is
        # encoded as all zeros instead of raising an error.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [7]:
# Join the two transformers: each entry is (label, transformer, columns), so the
# numeric pipeline only sees NUM_FEATS and the categorical one only CAT_FEATS.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUM_FEATS),
        ('cat', categorical_transformer, CAT_FEATS),
    ]
)

## Inspecting the full preprocessor

In [9]:
# Display the assembled ColumnTransformer to check its steps and column lists.
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['carat', 'depth', 'table', 'x', 'y', 'z']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['cut', 'color', 'clarity'])])

In [32]:
# Fit the preprocessor on the full table and look at the first transformed rows
# (scaled numeric columns followed by the one-hot columns).
transformed = preprocessor.fit_transform(diamonds)
pd.DataFrame(data=transformed).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out part of the rows for evaluation (default split sizes).
# A fixed random_state makes the shuffle, and therefore every score computed
# from this split, reproducible after Restart & Run All.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)

In [13]:
# (rows, columns) of the training part, then of the held-out part.
print(diamonds_train.shape, diamonds_test.shape, sep='\n')

(30341, 10)
(10114, 10)


## Random forest model

In [14]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Full model: preprocessing followed by a random forest regressor.
# random_state is fixed so that repeated runs grow the same forest and the
# train/test/CV scores below can be compared between runs.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
)

In [15]:
# Fit preprocessing + regressor on the training part only.
# The trailing ';' hides the fitted pipeline's repr.
model.fit(X=diamonds_train[FEATS], y=diamonds_train[TARGET]);

## Checking model performance on test and train data

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
# NOTE: despite their names, y_train / y_test hold the model's *predictions*
# for the training and held-out rows, not the true prices.
y_train = model.predict(diamonds_train[FEATS])
y_test = model.predict(diamonds_test[FEATS])

In [18]:
# Root mean squared error on held-out vs. training rows.
# The root is taken explicitly instead of passing `squared=False`, which is
# deprecated and removed in recent scikit-learn releases.
test_rmse = mean_squared_error(y_true=diamonds_test[TARGET], y_pred=y_test) ** 0.5
train_rmse = mean_squared_error(y_true=diamonds_train[TARGET], y_pred=y_train) ** 0.5
print(f"Test error: {test_rmse}")
print(f"Train error: {train_rmse}")

Test error: 568.3649054789709
Train error: 210.09002798259442


## Cross validation

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
# 10-fold cross-validation of the whole pipeline on the full training table.
# The scorer returns *negated* RMSE, so the values in `scores` are negative.
scores = cross_val_score(
    estimator=model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

In [21]:
# Flip the sign back to get the average RMSE across the folds.
(-scores).mean()

554.3958316564633

## Randomized hyperparameter search

In [22]:
from sklearn.model_selection import RandomizedSearchCV # tries n_iter randomly sampled hyperparameter combinations, cross-validates each one and keeps the best

In [23]:
# Candidate settings. Keys follow the `<step>__<sub-step>__<parameter>` naming of
# the nested pipeline, so the search can tune preprocessing and regressor together.
# 2 * 6 * 4 = 48 combinations exist; n_iter=32 of them are sampled at random.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
}

grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 cv=5,
                                 verbose=10,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 n_iter=32,
                                 random_state=42)  # fixed seed: the same 32 candidates are drawn on every run

# Trailing ';' hides the very long repr of the fitted search object.
grid_search.fit(diamonds[FEATS], diamonds[TARGET]);

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   54.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   59.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 12.2min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer()),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
                               

In [24]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'regressor__n_estimators': 512,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'median'}

In [25]:
# Best mean CV score; it is negative because the scorer is 'neg_root_mean_squared_error'.
grid_search.best_score_

-557.596961726336

## Submission

In [26]:
# Score the submission rows with the search object, using the same feature columns as in training.
y_pred = grid_search.predict(X=diamonds_predict[FEATS])

In [27]:
# Two-column submission table: the row id and its predicted price.
submission_df = pd.DataFrame(
    data={
        'id': diamonds_predict['id'],
        'price': y_pred,
    }
)

In [28]:
# Quick look at the first rows of the submission table.
submission_df.head()

Unnamed: 0,id,price
0,0,2972.4737
1,1,5381.954537
2,2,9266.221156
3,3,4147.112523
4,4,1700.652627


In [29]:
# Summary statistics as a sanity check on the range of predicted prices.
submission_df.describe()

Unnamed: 0,id,price
count,13485.0,13485.0
mean,6742.0,3953.452337
std,3892.928525,3948.143399
min,0.0,373.299122
25%,3371.0,945.611874
50%,6742.0,2464.318539
75%,10113.0,5329.381762
max,13484.0,18151.829053


In [30]:
# Limit predictions to the range [370, 19000].
# NOTE(review): the bounds look like the expected min/max diamond price; confirm.
# The result is assigned back to the column: calling clip(inplace=True) on
# `submission_df.price` is a chained in-place update, which does not modify the
# DataFrame under pandas Copy-on-Write.
submission_df['price'] = submission_df['price'].clip(lower=370, upper=19000)

In [31]:
import os

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (needed on a fresh checkout).
os.makedirs('./submissions', exist_ok=True)
# NOTE(review): the file name still says "ridge" although the fitted regressor
# is a RandomForestRegressor; consider renaming the file.
submission_df.to_csv('./submissions/submission_pipeline_ridge_mean.csv', index=False)