In [1]:
import numpy as np
import pandas as pd

#SCI-kit learn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Loading data

In [2]:
to_predict = pd.read_csv('''/home/david/Documents/learning_repositories/dataptmad-0420-classes/week_14/ml_guided_lesson/data/diamonds_predict.csv''')
to_predict

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19
...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45


In [3]:
train = pd.read_csv('''/home/david/Documents//learning_repositories/dataptmad-0420-classes/week_14/ml_guided_lesson/data/diamonds_train.csv''')
train

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.00
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...
40450,1.34,Ideal,G,VS1,62.7,57.0,10070,7.10,7.04,4.43
40451,2.02,Good,F,SI2,57.1,60.0,12615,8.31,8.25,4.73
40452,1.01,Ideal,H,SI1,62.7,56.0,5457,6.37,6.42,4.01
40453,0.33,Ideal,J,VS1,61.9,54.3,456,4.45,4.47,2.76


## Feature engineering

In [4]:
target = 'price'

cat_features = ['cut', 'color', 'clarity']
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z']

In [5]:
for category in cat_features:
    train[f'{category}_mean_price'] = train.groupby(f'{category}').price.transform('mean')
    to_predict[f'{category}_mean_price'] = train.groupby(f'{category}').price.transform('mean')

train['volume'] = train['x'] * train['y'] * train['z']
to_predict['volume'] = to_predict['x'] * to_predict['y'] * to_predict['z']


In [6]:
train

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_mean_price,color_mean_price,clarity_mean_price,volume
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,4617.322612,5346.234112,3913.590182,197.096725
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,3994.444420,4476.469014,3913.590182,52.395750
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,4333.271980,4023.214902,3796.813551,113.436890
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.00,3880.611794,3134.943157,3999.856908,66.268800
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,3436.112577,4023.214902,3999.856908,168.429975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.34,Ideal,G,VS1,62.7,57.0,10070,7.10,7.04,4.43,3436.112577,4023.214902,3796.813551,221.429120
40451,2.02,Good,F,SI2,57.1,60.0,12615,8.31,8.25,4.73,3880.611794,3677.355720,5101.044307,324.276975
40452,1.01,Ideal,H,SI1,62.7,56.0,5457,6.37,6.42,4.01,3436.112577,4476.469014,3999.856908,163.990554
40453,0.33,Ideal,J,VS1,61.9,54.3,456,4.45,4.47,2.76,3436.112577,5346.234112,3796.813551,54.900540


## Data pre-processing

In [8]:
train.select_dtypes(include=np.number).columns

Index(['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'cut_mean_price',
       'color_mean_price', 'clarity_mean_price', 'volume'],
      dtype='object')

In [9]:
extended_num_features = ['carat', 'depth', 'table', 'x', 'y', 'z', 'cut_mean_price',
                'color_mean_price', 'clarity_mean_price']

In [10]:
# Train set as num features
cat_df = pd.get_dummies(train[cat_features])
num_df = train[extended_num_features]
X_train_df = pd.concat([num_df, cat_df], axis=1) # train_df

# To predict set as num features
cat_df = pd.get_dummies(to_predict[cat_features])
num_df = to_predict[extended_num_features]
X_predict = pd.concat([num_df, cat_df], axis=1) # to_predict_df

y = train[target].values

In [11]:
X_train_df

Unnamed: 0,carat,depth,table,x,y,z,cut_mean_price,color_mean_price,clarity_mean_price,cut_Fair,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,1.21,62.4,58.0,6.83,6.79,4.25,4617.322612,5346.234112,3913.590182,0,...,0,1,0,0,0,0,0,1,0,0
1,0.32,63.0,57.0,4.35,4.38,2.75,3994.444420,4476.469014,3913.590182,0,...,0,0,0,0,0,0,0,1,0,0
2,0.71,65.5,55.0,5.62,5.53,3.65,4333.271980,4023.214902,3796.813551,1,...,0,0,0,0,0,0,1,0,0,0
3,0.41,63.8,56.0,4.68,4.72,3.00,3880.611794,3134.943157,3999.856908,0,...,0,0,0,0,1,0,0,0,0,0
4,1.02,60.5,59.0,6.55,6.51,3.95,3436.112577,4023.214902,3999.856908,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.34,62.7,57.0,7.10,7.04,4.43,3436.112577,4023.214902,3796.813551,0,...,0,0,0,0,0,0,1,0,0,0
40451,2.02,57.1,60.0,8.31,8.25,4.73,3880.611794,3677.355720,5101.044307,0,...,0,0,0,0,0,1,0,0,0,0
40452,1.01,62.7,56.0,6.37,6.42,4.01,3436.112577,4476.469014,3999.856908,0,...,0,0,0,0,1,0,0,0,0,0
40453,0.33,61.9,54.3,4.45,4.47,2.76,3436.112577,5346.234112,3796.813551,0,...,0,1,0,0,0,0,1,0,0,0


In [12]:
X_predict

Unnamed: 0,carat,depth,table,x,y,z,cut_mean_price,color_mean_price,clarity_mean_price,cut_Fair,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.79,62.7,60.0,5.82,5.89,3.67,4617.322612,5346.234112,3913.590182,0,...,0,0,0,0,1,0,0,0,0,0
1,1.20,61.0,57.0,6.81,6.89,4.18,3994.444420,4476.469014,3913.590182,0,...,0,1,0,0,0,0,1,0,0,0
2,1.57,62.2,61.0,7.38,7.32,4.57,4333.271980,4023.214902,3796.813551,0,...,0,0,0,0,1,0,0,0,0,0
3,0.90,63.8,54.0,6.09,6.13,3.90,3880.611794,3134.943157,3999.856908,0,...,0,0,0,0,1,0,0,0,0,0
4,0.50,62.9,58.0,5.05,5.09,3.19,3436.112577,4023.214902,3999.856908,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.57,61.9,56.0,5.35,5.32,3.30,4617.322612,3134.943157,5101.044307,0,...,0,0,0,0,1,0,0,0,0,0
13481,0.71,62.2,55.0,5.71,5.73,3.56,4617.322612,4023.214902,3999.856908,0,...,1,0,0,0,0,0,0,1,0,0
13482,0.70,61.6,55.0,5.75,5.71,3.53,3880.611794,4476.469014,3796.813551,0,...,0,0,0,0,0,0,1,0,0,0
13483,0.70,58.8,57.0,5.85,5.89,3.45,3436.112577,5090.868800,3240.978942,0,...,0,0,0,0,0,1,0,0,0,0


## Train-test split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_train_df, y)
X_train

Unnamed: 0,carat,depth,table,x,y,z,cut_mean_price,color_mean_price,clarity_mean_price,cut_Fair,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
20148,1.16,61.8,58.0,6.72,6.78,4.17,3994.444420,3134.943157,3999.856908,0,...,0,0,0,0,1,0,0,0,0,0
31153,0.23,62.8,54.0,3.92,3.94,2.47,3994.444420,3677.355720,2559.498558,0,...,0,0,0,0,0,0,0,0,1,0
40366,0.59,62.5,55.0,5.40,5.42,3.38,3436.112577,3134.943157,3999.856908,0,...,0,0,0,0,1,0,0,0,0,0
17989,0.77,62.5,56.0,5.87,5.91,3.68,3436.112577,3088.342526,3999.856908,0,...,0,0,0,0,1,0,0,0,0,0
30086,1.02,62.5,57.0,6.44,6.37,4.00,3436.112577,4023.214902,2797.897513,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34510,0.41,62.2,55.0,4.79,4.76,2.97,3436.112577,5346.234112,3796.813551,0,...,0,1,0,0,0,0,1,0,0,0
31541,0.50,61.9,62.0,5.03,5.09,3.13,3994.444420,3134.943157,3999.856908,0,...,0,0,0,0,1,0,0,0,0,0
17590,0.74,61.6,59.0,5.82,5.77,3.57,4617.322612,5346.234112,3999.856908,0,...,0,1,0,0,1,0,0,0,0,0
965,1.00,63.2,59.0,6.30,6.35,4.00,3880.611794,3677.355720,3913.590182,0,...,0,0,0,0,0,0,0,1,0,0


## Scaler

In [14]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model

In [15]:
model = LinearRegression()

In [16]:
model.fit(X_train_scaled, y_train)

LinearRegression()

## Validation & Metrics

In [17]:
y_test_pred = model.predict(X_test_scaled).clip(300, 18000)

In [18]:
rmse = mean_squared_error(y_true=y_test, y_pred=y_test_pred, squared=False)
rmse

1045.7439255965567

In [19]:
pd.DataFrame({'pred': y_test_pred, 'real': y_test})

Unnamed: 0,pred,real
0,3714.848396,3888
1,559.345018,945
2,3469.625118,3358
3,9572.105776,10362
4,5581.670231,5231
...,...,...
10109,1717.076023,2268
10110,10548.988654,10534
10111,1852.075963,1417
10112,1463.106118,1436


## Last training

In [20]:
model.fit(scaler.transform(X_train_df), y)

LinearRegression()

In [21]:
X_train_df.shape

(40455, 29)

## Submission

In [22]:
X_predict_scaled = scaler.transform(X_predict)
X_predict_scaled.shape

(13485, 29)

In [23]:
y_predict = model.predict(X_predict_scaled)

In [24]:
submission = pd.DataFrame({'id': to_predict.id,
                          'price': y_predict},
                         )
submission

Unnamed: 0,id,price
0,0,2574.810538
1,1,6670.611846
2,2,10696.102396
3,3,5023.210939
4,4,1669.441127
...,...,...
13480,13480,417.060454
13481,13481,2984.528055
13482,13482,3329.850323
13483,13483,3437.009288


In [25]:
a

NameError: name 'a' is not defined

In [None]:
submission.to_csv('./submissions/LRscaled.csv', index=False)