In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, mean_squared_log_error

## Load data into DataFrame

In [2]:
# Locations of the two CSV files, relative to the notebook's working directory.
TRAIN_CSV, TEST_CSV = 'dataset/train.csv', 'dataset/test.csv'

# Read each file into its own DataFrame.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

In [3]:
# Features: every column from the third one up to, but not including, the last.
feature_slice = slice(2, -1)
X = df_train.iloc[:, feature_slice]

# Target: the last column of the training frame.
y = df_train.iloc[:, -1]

In [4]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffled split identical on every run, so the cells below give the same
# results after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

## Review the data to see what needs changing

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Number of distinct non-missing values in the Transport column.
X_train['Transport'].nunique(dropna=True)

In [None]:
# Number of distinct non-missing values in the Customer Information column.
X_train['Customer Information'].nunique(dropna=True)

In [None]:
# Number of distinct non-missing values in the Customer Location column.
X_train['Customer Location'].nunique(dropna=True)

## Prepare the data

- Convert categorical data into numerical values
- Standardize all numerical columns so that they are centered on their mean
- Replace some columns with only their important values (e.g. keep just the two-letter code from `Customer Location`)

In [None]:
# Print the number of rows followed by the number of columns.
print(*X_train.shape)

In [18]:
# Show every field of the first training row as a Series.
X_train.iloc[0]

Artist Reputation                 0.47
Height                            42.0
Width                             33.0
Weight                      29282087.0
Material                         Stone
Price Of Sculpture           105746.47
Base Shipping Price              50.94
International                       No
Express Shipment                    No
Installation Included               No
Transport                     Roadways
Fragile                             No
Customer Information     Working Class
Remote Location                     No
Customer Location                   AP
Name: 1314, dtype: object

In [5]:
def cust_location(df):
    """Reduce the last column of ``df`` to a two-letter uppercase code.

    Each text value in the last column is replaced by the last run of two
    consecutive uppercase ASCII letters it contains (for example
    ``"New Michelle, OH 50777"`` becomes ``"OH"``). A value that already is
    such a code is returned unchanged, so the function can safely be applied
    more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column holds the text to reduce.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the last column rewritten. The input frame is
        left untouched, which avoids writing into a slice of another frame.

    Notes
    -----
    - Text without any two-letter uppercase code becomes NaN instead of
      raising ``AttributeError``.
    - Non-text entries (such as NaN) are passed through unchanged instead of
      raising ``TypeError``.
    - A frame without columns is returned as a plain copy.
    """
    # The greedy leading ".*" makes the group capture the LAST two-letter
    # uppercase run in the string.
    pattern = re.compile(r".*([A-Z]{2}).*$")

    def _extract_code(value):
        # Extract the code from one cell; see Notes for the fallback rules.
        if not isinstance(value, str):
            return value
        found = pattern.match(value)
        return found.group(1) if found else float("nan")

    result = df.copy()
    if result.shape[1] == 0:
        return result

    codes = result.iloc[:, -1].map(_extract_code)
    # Assign positionally as a plain array so neither index alignment nor
    # duplicate column names can affect the write.
    result.iloc[:, -1] = codes.to_numpy()
    return result

## Build the model

In [6]:
# Remove the two date columns from the training features.
date_columns = ['Scheduled Date', 'Delivery Date']
X_train = X_train.drop(columns=date_columns)

In [7]:
# Rewrite the last column of the training features via cust_location.
X_train = cust_location(df=X_train)

In [8]:
# Preview the first five rows after the cleaning steps above.
X_train.head(n=5)

Unnamed: 0,Artist Reputation,Height,Width,Weight,Material,Price Of Sculpture,Base Shipping Price,International,Express Shipment,Installation Included,Transport,Fragile,Customer Information,Remote Location,Customer Location
1314,0.47,42.0,33.0,29282087.0,Stone,105746.47,50.94,No,No,No,Roadways,No,Working Class,No,AP
1254,,26.0,11.0,539.0,,3.19,24.84,No,Yes,No,Roadways,No,Wealthy,No,NM
4484,0.73,,3.0,35.0,,3.33,16.1,No,No,Yes,Roadways,Yes,Working Class,No,IL
4911,0.94,17.0,6.0,217.0,Aluminium,4.98,24.17,Yes,No,Yes,Waterways,No,Working Class,No,SD
4673,0.1,10.0,4.0,146.0,Aluminium,6.67,22.69,Yes,No,No,Airways,No,Working Class,No,CO


In [36]:
# Quick look at how label-encoding treats the Material column.
# The encoder is created in this cell so that it does not depend on a
# variable defined further down the notebook (Restart & Run All safe).
# LabelEncoder is designed for target labels; it is used here for
# inspection only.
encoder = LabelEncoder()
encoder.fit_transform(X_train['Material'])

array([5, 7, 7, ..., 6, 3, 6])

In [37]:
# Labels learned by the encoder; a label's position in this array is its
# integer code. The recorded output shows NaN listed as a class of its own.
encoder.classes_

array(['Aluminium', 'Brass', 'Bronze', 'Clay', 'Marble', 'Stone', 'Wood',
       nan], dtype=object)

In [27]:
# Column positions in X_train once the two date columns are dropped
# (order as shown by X_train.head()):
#   0 Artist Reputation, 1 Height, 2 Width, 3 Weight, 4 Material,
#   5 Price Of Sculpture, 6 Base Shipping Price, 7 International,
#   8 Express Shipment, 9 Installation Included, 10 Transport, 11 Fragile,
#   12 Customer Information, 13 Remote Location, 14 Customer Location
numeric_cols = [0, 1, 2, 3, 5, 6]
categorical_cols = [4, 7, 8, 9, 10, 11, 12, 13, 14]

imputer = SimpleImputer(strategy="most_frequent")
scaler = StandardScaler()
# OneHotEncoder is the feature-side encoder. LabelEncoder only accepts a
# single target vector, so it cannot be used as a ColumnTransformer step.
# handle_unknown="ignore" keeps predict() from failing on categories that
# were not present in the training split.
onehot = OneHotEncoder(handle_unknown="ignore")
poly = PolynomialFeatures(degree=2)
lin_reg = LinearRegression()

# The imputer runs first and outputs a plain array, so columns are selected
# by position here rather than by name.
col_transform = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        # Every numeric column is standardized, including Artist Reputation.
        ('scaler1', scaler, numeric_cols),
        # Every text column is encoded; a string left in the matrix would
        # make the regressor fail.
        ('encoder', onehot, categorical_cols),
    ]
)

# NOTE(review): the degree-2 expansion is applied to the one-hot columns as
# well, which grows the feature count quickly - confirm this is intended.
pipe = Pipeline([
    ('imputer', imputer),
    ('transformer', col_transform),
    ('polyfeatures', poly),
    ('regressor', lin_reg)
])

In [None]:
# Fit every pipeline step on the training split.
pipe.fit(X=X_train, y=y_train)

In [None]:
# X_test still has the raw columns from the split. Apply the same cleaning
# that X_train received (drop the date columns, reduce the last column via
# cust_location) so the column layout matches what the pipeline was fitted on.
# errors='ignore' keeps this cell re-runnable if the date columns are
# already gone.
X_test_prepared = cust_location(
    X_test.drop(columns=['Scheduled Date', 'Delivery Date'], errors='ignore')
)
pipe.predict(X_test_prepared)