In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Build the CSV locations relative to the current working directory.
data_dir = os.path.join('..', 'data')
train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')

# Load both splits, then report (rows, columns) for each as a quick sanity check.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
for frame in (train, test):
    print(frame.shape)

(6500, 20)
(3500, 19)


In [3]:
# Preview the first rows of the training frame (column names, value formats, NaNs).
train.head()

Unnamed: 0,Customer Id,Artist Name,Artist Reputation,Height,Width,Weight,Material,Price Of Sculpture,Base Shipping Price,International,Express Shipment,Installation Included,Transport,Fragile,Customer Information,Remote Location,Scheduled Date,Delivery Date,Customer Location,Cost
0,fffe3900350033003300,Billy Jenkins,0.26,17.0,6.0,4128.0,Brass,13.91,16.27,Yes,Yes,No,Airways,No,Working Class,No,06/07/15,06/03/15,"New Michelle, OH 50777",-283.29
1,fffe3800330031003900,Jean Bryant,0.28,3.0,3.0,61.0,Brass,6.83,15.0,No,No,No,Roadways,No,Working Class,No,03/06/17,03/05/17,"New Michaelport, WY 12072",-159.96
2,fffe3600370035003100,Laura Miller,0.07,8.0,5.0,237.0,Clay,4.96,21.18,No,No,No,Roadways,Yes,Working Class,Yes,03/09/15,03/08/15,"Bowmanshire, WA 19241",-154.29
3,fffe350031003300,Robert Chaires,0.12,9.0,,,Aluminium,5.81,16.31,No,No,No,,No,Wealthy,Yes,05/24/15,05/20/15,"East Robyn, KY 86375",-161.16
4,fffe3900320038003400,Rosalyn Krol,0.15,17.0,6.0,324.0,Aluminium,3.18,11.94,Yes,Yes,Yes,Airways,No,Working Class,No,12/18/16,12/14/16,"Aprilside, PA 52793",-159.23


In [4]:
def manipulation(df):
    """Add a 'State' column parsed from 'Customer Location'.

    The location text is split on single spaces and the second-to-last token
    is kept, e.g. "New Michelle, OH 50777" -> "OH".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Customer Location' string column. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object. Returning it lets callers write either
        `manipulation(train)` or `train = manipulation(train)`; without the
        return the second form would rebind `train` to None.
    """
    df['State'] = df['Customer Location'].str.split(" ").str[-2]
    return df
# Called for its side effect: adds the 'State' column to `train` in place.
manipulation(train)

# Reduce the frame to the candidate features inspected in the summary table;
# every other column of `train` is dropped from this copy.
selected_features = {'Price Of Sculpture', 'State', 'Artist Reputation',
                     'Base Shipping Price', 'Weight'}
unwanted_columns = [name for name in train.columns if name not in selected_features]
df = train.drop(columns=unwanted_columns)
   

def summary(df):
    """Print the shape of `df` and return a per-column overview table.

    The returned DataFrame has one row per column of `df` with:
    'Feature Name' (column label), 'dtypes' (column dtype),
    'missing' (number of null entries) and 'Uniques' (number of distinct
    non-null values).
    """
    print(f"Dataset Shape: {df.shape}")
    # Column labels become a regular 'index' column after reset_index().
    dtype_table = df.dtypes.to_frame(name='dtypes').reset_index()
    report = pd.DataFrame({
        'Feature Name': dtype_table['index'],
        'dtypes': dtype_table['dtypes'],
        'missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })
    return report

# Show dtype, missing-value count and distinct-value count for each selected feature.
summary(df)

Dataset Shape: (6500, 5)


Unnamed: 0,Feature Name,dtypes,missing,Uniques
0,Artist Reputation,float64,750,101
1,Weight,float64,587,4410
2,Price Of Sculpture,float64,0,3424
3,Base Shipping Price,float64,0,3732
4,State,object,0,54


In [5]:
# Target is the magnitude of 'Cost'; the previewed rows contain negative values.
# NOTE(review): assumes the sign carries no information — confirm with the data source.
y = train['Cost'].abs()
# Everything except the target is kept as a feature at this stage.
X = train.drop(columns=['Cost'])

In [6]:
# NOTE(review): the imputers and scaler below are fit on every row of X, before
# the train/test split performed in the next cell, so test rows influence the
# fitted statistics. X also still holds identifier-like text columns, which
# get_dummies expands into one indicator per distinct value.

# --- Numerical columns: median imputation, then min-max scaling ---
num_df = X.select_dtypes(exclude=object)
numeric_columns = num_df.columns
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
num_df = pd.DataFrame(imp_median.fit_transform(num_df), columns=numeric_columns)
scaled_values = MinMaxScaler().fit_transform(num_df)
num_scaled = pd.DataFrame(scaled_values, columns=numeric_columns)
print(num_scaled.shape)


# --- Categorical columns: most-frequent imputation, then one-hot encoding ---
cat_df = X.select_dtypes(include=object)
categorical_columns = cat_df.columns
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cat_new = pd.DataFrame(imp_mode.fit_transform(cat_df), columns=categorical_columns)
cat_df = pd.get_dummies(cat_new)
print(cat_df.shape)

(6500, 6)
(6500, 22849)


## Modeling

In [7]:
# Put the scaled numeric block and the one-hot block side by side.
X = pd.concat([num_scaled, cat_df], axis="columns")
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [8]:
# Baseline model on the manually preprocessed features.
# random_state is fixed (same seed as the split) so the MAE printed below is
# reproducible after a kernel restart; an unseeded forest gives a different
# score on every run.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('MAE Score: ', metrics.mean_absolute_error(y_test, y_pred))

MAE Score:  9160.599193435895


## Pipeline

In [15]:
# Re-read the raw CSV; this replaces any `train` frame left over from earlier cells.
train = pd.read_csv(train_path)
# Called for its in-place effect (adds 'State'). Not rebinding `train` to the
# return value keeps this cell working whether or not `manipulation` returns
# the frame.
manipulation(train)

# Keep only the modeling features plus the target.
keep_cols = ['Price Of Sculpture', 'State', 'Artist Reputation',
             'Base Shipping Price', 'Weight', 'Cost']
train = train.drop(columns=[col for col in train.columns if col not in keep_cols])

# Target is the magnitude of 'Cost'; the remaining columns are the features.
y = train['Cost'].abs()
X = train.drop(['Cost'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Column labels routed to each preprocessing branch.
# Note: despite their names these are column Index objects, not DataFrames.
num_df = X_train.select_dtypes(include=np.number).columns
cat_df = X_train.select_dtypes(exclude=np.number).columns

# Numerical preprocessing: median imputation, then min-max scaling.
num_pre = Pipeline(steps=[
    ("Num Imputer", SimpleImputer(missing_values=np.nan, strategy='median')),
    ("Scaler", MinMaxScaler()),
])

# Categorical preprocessing: most-frequent imputation, then one-hot encoding.
# handle_unknown='ignore' lets prediction proceed on categories not seen in fit.
cat_pre = Pipeline(steps=[
    ("Cat Imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own columns and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pre, num_df),
    ('cat', cat_pre, cat_df),
])

# Full modeling pipeline. Because preprocessing lives inside the pipeline, it is
# fit on X_train only. The forest is seeded so refits are reproducible.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('RandomForest', RandomForestRegressor(random_state=42)),
])

pipe.fit(X_train, y_train)

Artist Reputation :  102
Height :  66
Width :  41
Weight :  4411
Material :  8
Price Of Sculpture :  3424
Base Shipping Price :  3732
International :  2
Express Shipment :  2
Installation Included :  2
Transport :  4
Fragile :  2
Customer Information :  2
Remote Location :  3
Cost :  6356
State :  54
Days Interval :  9


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('Num '
                                                                   'Imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('Scaler',
                                                                   MinMaxScaler())]),
                                                  Index(['Artist Reputation', 'Weight', 'Price Of Sculpture',
       'Base Shipping Price'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('Cat '
                                                                   'Imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
          

In [16]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import pickle

# Read the training data from `train_path` again; this replaces any `train`
# frame left over from earlier cells.
train = pd.read_csv(train_path)

# Feature-engineering helper applied to the raw frame
def manipulation(df):
    """Derive a 'State' column from 'Customer Location' and return the frame.

    Each location string is split on single spaces and the second-to-last
    token is stored as the state, e.g. "New Michelle, OH 50777" -> "OH".
    `df` is modified in place and the same object is returned.
    """
    location_tokens = df['Customer Location'].str.split(" ")
    df['State'] = location_tokens.str.get(-2)
    return df

# Add the derived 'State' column.
train = manipulation(train)

# Drop every column that is neither a modeling feature nor the target.
keep_cols = ['Price Of Sculpture', 'State', 'Artist Reputation',
             'Base Shipping Price', 'Weight', 'Cost']
train = train.drop(columns=[col for col in train.columns if col not in keep_cols])

# Separate the target variable (magnitude of 'Cost') and the features.
y = train['Cost'].abs()
X = train.drop(['Cost'], axis=1)

# Split the data into training and testing sets (70/30, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Column labels for each preprocessing branch.
# Note: despite their names these are column Index objects, not DataFrames.
num_df = X_train.select_dtypes(include=np.number).columns
cat_df = X_train.select_dtypes(exclude=np.number).columns

# Numerical preprocessing: median imputation, then min-max scaling.
num_pre = Pipeline(steps=[
    ("Num Imputer", SimpleImputer(missing_values=np.nan, strategy='median')),
    ("Scaler", MinMaxScaler()),
])

# Categorical preprocessing: most-frequent imputation, then one-hot encoding.
# handle_unknown='ignore' lets prediction proceed on categories not seen in fit.
cat_pre = Pipeline(steps=[
    ("Cat Imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Combine the preprocessing steps for both numerical and categorical columns.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pre, num_df),
    ('cat', cat_pre, cat_df),
])

# Modeling pipeline. The forest is seeded so that retraining produces the same
# model and the pickled artifact is reproducible.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('RandomForest', RandomForestRegressor(random_state=42)),
])

# Fit preprocessing and model on the training split only.
pipe.fit(X_train, y_train)

# Persist the fitted pipeline (preprocessing + model) to the working directory.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)