<u>Importing required modules</u>

In [16]:
#Data preprocessing
import numpy as np
import pandas as pd
import re
from imblearn.over_sampling import SMOTE 

#Models, preprocessing and evaluation metrics
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.svm import SVC,SVR
from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier,VotingClassifier
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.model_selection import train_test_split,StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder,StandardScaler,RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,mean_squared_error

#Data visualisation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Handling warnings
import warnings
warnings.filterwarnings("ignore")

import os
import pickle

<u>Loading data</u>

Constraints:
- Your dataset must be clean, and the target must be in the last column
- Your dataset must be placed in the `Samples` folder

In [4]:
# Ask for a file name and load that file from the Samples folder into `data`.
# Only .csv and .xlsx files are supported; anything else is reported, not loaded.
req_data = input("Enter desired dataset : ").strip()
if req_data not in os.listdir('Samples/'):
    print("No such dataset")
else:
    # splitext yields the *final* extension, so names such as "sales.v2.csv"
    # are recognised; lower() also accepts "DATA.CSV". A name without any
    # extension gives "" instead of raising.
    extension = os.path.splitext(req_data)[1].lower()
    dataset_path = os.path.join('Samples', req_data)
    if extension == ".csv":
        data = pd.read_csv(dataset_path)
    elif extension == ".xlsx":
        data = pd.read_excel(dataset_path)
    else:
        # Make the failure visible here rather than as a NameError on `data` later.
        print(f"Unsupported file type: {extension or '(no extension)'}")

Enter desired dataset : SampleSuperstore.csv


In [5]:
# Preview the first three rows of the loaded dataset.
data.head(3)

Unnamed: 0,Ship Mode,Segment,Country,City,State,Postal Code,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit
0,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Bookcases,261.96,2,0.0,41.9136
1,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Chairs,731.94,3,0.0,219.582
2,Second Class,Corporate,United States,Los Angeles,California,90036,West,Office Supplies,Labels,14.62,2,0.0,6.8714


<u>Features and Labels</u>

In [6]:
# Every column except the last is a feature; the last column is the target.
# X is copied because later cells overwrite its columns in place: without the
# copy those writes act on a slice of `data` (pandas' SettingWithCopy case).
X = data.iloc[:,:-1].copy() #Features
y = data.iloc[:,-1] #Labels

In [7]:
# Shapes of the feature frame and the target series.
X.shape,y.shape

((9994, 12), (9994,))

<u>Data Transformation</u>

- Categorical to numerical

In [8]:
# Replace every object-dtype feature column with integer codes.
# The same encoder instance is refit for each column in turn.
le = LabelEncoder()
categorical_columns = X.select_dtypes("O").columns
for column_name in categorical_columns:
    encoded_values = le.fit_transform(X[column_name])
    X[column_name] = encoded_values

In [9]:
# Inspect the first two rows of X after label encoding.
X.head(2)

Unnamed: 0,Ship Mode,Segment,Country,City,State,Postal Code,Region,Category,Sub-Category,Sales,Quantity,Discount
0,2,0,0,194,15,42420,2,0,4,261.96,2,0.0
1,2,0,0,194,15,42420,2,0,5,731.94,3,0.0


 - Scaling continuous data

In [10]:
# Scale every feature column: RobustScaler first, then StandardScaler.
pipeline = Pipeline([
    ("robust scaler",RobustScaler()),
    ("std_scaler",StandardScaler())
])
# Both scalers work column by column, so a single fit on the whole frame gives
# the same values as refitting per column, and it leaves `pipeline` holding
# the parameters of *all* columns so it can be reused to transform new data.
# NOTE(review): the scalers are fitted on the full dataset before the k-fold
# split below, so test folds influence the scaling statistics.
X = pd.DataFrame(pipeline.fit_transform(X),columns=X.columns,index=X.index)

In [11]:
# Inspect the first two rows of X after scaling.
X.head(2)

Unnamed: 0,Ship Mode,Segment,Country,City,State,Postal Code,Region,Category,Sub-Category,Sales,Quantity,Discount
0,-0.213513,-0.864161,0.0,-0.617727,-0.473638,-0.398302,0.369413,-1.544978,-0.710815,0.051511,-0.804303,-0.756643
1,-0.213513,-0.864161,0.0,-0.617727,-0.473638,-0.398302,0.369413,-1.544978,-0.512842,0.805634,-0.354865,-0.756643


<u>Model Selection</u>

In [12]:
# Choose the candidate models from the target's dtype.
# Any numeric, non-boolean target is treated as regression; everything else
# (object, category, bool, ...) as classification. The dtype helpers are used
# because comparing y.dtype with Python's float/int only matches the platform's
# default widths, which left `estimators` undefined for e.g. int32 or float32.
is_regression_target = (pd.api.types.is_numeric_dtype(y)
                        and not pd.api.types.is_bool_dtype(y))
if not is_regression_target:
    estimators = {"KNearestNeighbor":KNeighborsClassifier(),
                 "SVM":SVC(),"DecisionTreeClassifier":DecisionTreeClassifier(),
                 "AdaBoostClassifier":AdaBoostClassifier(),
                 "GradientBoostingClassifier":GradientBoostingClassifier(),
                 "LogisticRegression":LogisticRegression()}
else:
    estimators = {"KNearestNeighbor":KNeighborsRegressor(),
                 "SVR":SVR(),"DecisionTreeRegressor":DecisionTreeRegressor(),
                 "AdaBoostRegressor":AdaBoostRegressor(),
                 "GradientBoostingRegressor":GradientBoostingRegressor(),
                 "LinearRegression":LinearRegression(),
                 "LassoRegression":Lasso(),"RidgeRegression":Ridge()}

In [15]:
# Show the candidate estimators selected for this target.
estimators

{'KNearestNeighbor': KNeighborsRegressor(),
 'SVR': SVR(),
 'DecisionTreeRegressor': DecisionTreeRegressor(),
 'AdaBoostRegressor': AdaBoostRegressor(),
 'GradientBoostingRegressor': GradientBoostingRegressor(),
 'LinearRegression': LinearRegression(),
 'LassoRegression': Lasso(),
 'RidgeRegression': Ridge()}

<u>Training and Testing the model</u>

In [18]:
# Build the 10-fold splitter that matches the task type.
# Numeric, non-boolean targets get a plain KFold (regression); every other
# dtype gets a StratifiedKFold (classification). The dtype helpers replace the
# comparison against Python's float/int, which misses widths such as int32 or
# float32 and would leave neither splitter defined.
is_regression_target = (pd.api.types.is_numeric_dtype(y)
                        and not pd.api.types.is_bool_dtype(y))
if not is_regression_target:
    str_kfold = StratifiedKFold(n_splits=10)
else:
    kfold = KFold(n_splits=10)

In [40]:
# Cross-validate every candidate estimator and summarise the results in `df`
# (one row per model).
# Classification: mean fold accuracy from estimator.score.
# Regression: square root of the mean fold MSE, i.e. an RMSE.
# Numeric, non-boolean targets are treated as regression, every other dtype as
# classification; the dtype helpers avoid the width-dependent comparison of
# y.dtype against Python's float/int.
is_regression_target = (pd.api.types.is_numeric_dtype(y)
                        and not pd.api.types.is_bool_dtype(y))
# Display names come from the estimator classes. Slicing "()" off str(model)
# breaks as soon as a model has non-default parameters ("SVR(C=10)" -> "SVR(C=1").
model_names = [type(candidate).__name__ for candidate in estimators.values()]
if not is_regression_target:
    model_train_scores,model_test_scores = [],[]
    for estimator in estimators.values():
        train_scores,test_scores = [],[]
        for train_index,test_index in str_kfold.split(X,y):
            X_train,X_test = X.iloc[train_index,:],X.iloc[test_index,:]
            y_train,y_test = y.iloc[train_index],y.iloc[test_index]
            estimator.fit(X_train,y_train)
            train_scores.append(estimator.score(X_train,y_train))
            test_scores.append(estimator.score(X_test,y_test))
        model_train_scores.append(np.mean(train_scores))
        model_test_scores.append(np.mean(test_scores))
    df = pd.DataFrame({"Model":model_names,
                       "Train Score":model_train_scores,
                       "Test Score":model_test_scores})
else:
    model_train_errors,model_test_errors = [],[]
    for estimator in estimators.values():
        train_errors,test_errors = [],[]
        for train_index,test_index in kfold.split(X,y):
            X_train,X_test = X.iloc[train_index,:],X.iloc[test_index,:]
            y_train,y_test = y.iloc[train_index],y.iloc[test_index]
            estimator.fit(X_train,y_train)
            train_pred = estimator.predict(X_train)
            test_pred = estimator.predict(X_test)
            train_errors.append(mean_squared_error(y_train,train_pred))
            test_errors.append(mean_squared_error(y_test,test_pred))
        model_train_errors.append(np.mean(train_errors))
        model_test_errors.append(np.mean(test_errors))
    # The square root is taken after averaging the fold MSEs.
    df = pd.DataFrame({"Model":model_names,
                       "Train Errors":np.sqrt(np.array(model_train_errors)),
                       "Test Errors":np.sqrt(np.array(model_test_errors))})

In [41]:
# Display the per-model cross-validation summary.
df

Unnamed: 0,Model,Train Errors,Test Errors
0,KNeighborsRegressor,142.668557,179.734198
1,SVR,229.70533,229.854147
2,DecisionTreeRegressor,0.599631,157.916586
3,AdaBoostRegressor,183.934289,210.216682
4,GradientBoostingRegressor,45.62567,119.314138
5,LinearRegression,197.795776,204.897067
6,Lasso,197.8283,204.878082
7,Ridge,197.795777,204.896002
