In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import os
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import pickle

In [2]:
# Locate the CSV files relative to the notebook's working directory.
data_dir = os.path.join('..', 'data')
train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')

# Read both files, then report (rows, columns) for each on its own line.
train, test = pd.read_csv(train_path), pd.read_csv(test_path)
print(train.shape, test.shape, sep='\n')

(6500, 20)
(3500, 19)


In [3]:
# Preview the first rows of the training frame to check column names and value formats.
train.head()

Unnamed: 0,Customer Id,Artist Name,Artist Reputation,Height,Width,Weight,Material,Price Of Sculpture,Base Shipping Price,International,Express Shipment,Installation Included,Transport,Fragile,Customer Information,Remote Location,Scheduled Date,Delivery Date,Customer Location,Cost
0,fffe3900350033003300,Billy Jenkins,0.26,17.0,6.0,4128.0,Brass,13.91,16.27,Yes,Yes,No,Airways,No,Working Class,No,06/07/15,06/03/15,"New Michelle, OH 50777",-283.29
1,fffe3800330031003900,Jean Bryant,0.28,3.0,3.0,61.0,Brass,6.83,15.0,No,No,No,Roadways,No,Working Class,No,03/06/17,03/05/17,"New Michaelport, WY 12072",-159.96
2,fffe3600370035003100,Laura Miller,0.07,8.0,5.0,237.0,Clay,4.96,21.18,No,No,No,Roadways,Yes,Working Class,Yes,03/09/15,03/08/15,"Bowmanshire, WA 19241",-154.29
3,fffe350031003300,Robert Chaires,0.12,9.0,,,Aluminium,5.81,16.31,No,No,No,,No,Wealthy,Yes,05/24/15,05/20/15,"East Robyn, KY 86375",-161.16
4,fffe3900320038003400,Rosalyn Krol,0.15,17.0,6.0,324.0,Aluminium,3.18,11.94,Yes,Yes,Yes,Airways,No,Working Class,No,12/18/16,12/14/16,"Aprilside, PA 52793",-159.23


In [4]:
def manipulation(df):
    """Add a 'State' column derived from 'Customer Location', in place.

    Each location string is split on single spaces and the second-to-last
    token is stored as the state, e.g. "New Michelle, OH 50777" -> "OH".
    Strings with fewer than two tokens yield a missing value.

    The frame passed in is modified directly; nothing is returned.
    """
    location_tokens = df['Customer Location'].str.split(" ")
    df['State'] = location_tokens.str.get(-2)
# Adds the 'State' column to `train` in place.
manipulation(train)

# Keep only the model's feature columns. Dropping everything else (rather
# than selecting by list) preserves the column order found in `train`.
feature_columns = ['Price Of Sculpture', 'Artist Reputation',
                   'Base Shipping Price', 'Weight', 'Width', 'Height']
unused_columns = train.columns[~train.columns.isin(feature_columns)]
df = train.drop(columns=unused_columns)
   

def summary(df):
    """Print the shape of ``df`` and return a per-column overview table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order, default integer index)
        with the columns:

        * ``'Feature Name'`` - the column label,
        * ``'dtypes'``       - the column dtype,
        * ``'missing'``      - number of null entries,
        * ``'Uniques'``      - number of distinct non-null values.
    """
    print(f"Dataset Shape: {df.shape}")
    # Build the table in one step instead of assigning new columns into a
    # sliced frame: that avoids SettingWithCopyWarning and does not depend on
    # reset_index() naming its column 'index' (it does not when df.columns
    # carries a name).
    return pd.DataFrame({
        'Feature Name': list(df.columns),
        'dtypes': df.dtypes.to_numpy(),
        'missing': df.isnull().sum().to_numpy(),
        'Uniques': df.nunique().to_numpy(),
    })

# Show dtype, missing-value count and distinct-value count for each selected feature.
summary(df)

Dataset Shape: (6500, 6)


Unnamed: 0,Feature Name,dtypes,missing,Uniques
0,Artist Reputation,float64,750,101
1,Height,float64,375,65
2,Width,float64,584,40
3,Weight,float64,587,4410
4,Price Of Sculpture,float64,0,3424
5,Base Shipping Price,float64,0,3732


In [5]:
# Feature matrix: the selected columns. Note this is the same object as `df`, not a copy.
X = df
# Target: the magnitude of 'Cost' (the sign is discarded via abs()).
y = train['Cost'].abs()

In [6]:
# Fill missing feature values with each column's median.
# NOTE(review): the imputer is fit on every row of X before the train/test
# split performed further down, so held-out rows contribute to the medians.
imp_median = SimpleImputer(strategy='median', missing_values=np.nan)
imputed_values = imp_median.fit_transform(X)
# fit_transform returns a plain array; re-attach the original column labels.
X = pd.DataFrame(imputed_values, columns=X.columns)
summary(X)

Dataset Shape: (6500, 6)


Unnamed: 0,Feature Name,dtypes,missing,Uniques
0,Artist Reputation,float64,0,101
1,Height,float64,0,65
2,Width,float64,0,40
3,Weight,float64,0,4410
4,Price Of Sculpture,float64,0,3424
5,Base Shipping Price,float64,0,3732


## Modeling

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Seed the forest (same seed as the train/test split) so that the fitted
# model and the reported MAE are reproducible across kernel restarts; without
# a random_state every run bootstraps differently and prints a different score.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = rf.predict(X_test)
print('MAE Score: ', metrics.mean_absolute_error(y_test, y_pred))

MAE Score:  16934.74145102564


In [10]:
# Persist the fitted forest with pickle.
# NOTE(review): only `rf` is written; the fitted imputer is not part of the
# file, so callers must handle missing values before predicting.
model_file = 'model.pkl'
with open(model_file, mode='wb') as model_out:
    pickle.dump(rf, model_out)