In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [77]:
%pwd

'f:\\AI-ML\\myprojects\\supply_chain_management'

In [78]:
# import os

# os.chdir('../')

In [79]:
# Load the warehouse dataset. The path is relative to the kernel's working
# directory and uses forward slashes so it resolves on Windows, Linux and macOS
# (a backslash is only treated as a directory separator on Windows).
data = pd.read_csv("artifacts/data/data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Location_type,WH_capacity_size,zone,WH_regional_zone,num_refill_req_l3m,transport_issue_l1y,Competitor_in_mkt,retail_shop_num,wh_owner_type,...,flood_proof,electric_supply,dist_from_hub,workers_num,storage_issue_reported_l3m,temp_reg_mach,approved_wh_govt_certificate,wh_breakdown_l3m,govt_check_l3m,product_wg_ton
0,0,Urban,Small,West,Zone 6,3,1,2,4651,Rented,...,1,1,91,29.0,13,0,A,5,15,17115
1,1,Rural,Large,North,Zone 5,0,0,4,6217,Company Owned,...,0,1,210,31.0,4,0,A,3,17,5074
2,2,Rural,Mid,South,Zone 2,1,0,4,4306,Company Owned,...,0,0,161,37.0,17,0,A,6,22,23137
3,3,Rural,Mid,North,Zone 3,7,4,2,6000,Rented,...,0,0,103,21.0,17,1,A+,3,27,22115
4,4,Rural,Large,North,Zone 5,3,1,2,4740,Company Owned,...,0,1,112,25.0,18,0,C,6,24,24071


In [80]:
# Remove the 'Unnamed: 0' column (presumably a leftover index from an earlier CSV export).
# Reassigning instead of inplace=True, together with errors='ignore', keeps the cell
# idempotent: re-running it does not raise KeyError once the column is already gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [81]:
# List the column labels currently present in the frame.
data.columns

Index(['Location_type', 'WH_capacity_size', 'zone', 'WH_regional_zone',
       'num_refill_req_l3m', 'transport_issue_l1y', 'Competitor_in_mkt',
       'retail_shop_num', 'wh_owner_type', 'distributor_num', 'flood_impacted',
       'flood_proof', 'electric_supply', 'dist_from_hub', 'workers_num',
       'storage_issue_reported_l3m', 'temp_reg_mach',
       'approved_wh_govt_certificate', 'wh_breakdown_l3m', 'govt_check_l3m',
       'product_wg_ton'],
      dtype='object')

In [82]:
# Number of missing values in each column.
data.isnull().sum()

Location_type                   0
WH_capacity_size                0
zone                            0
WH_regional_zone                0
num_refill_req_l3m              0
transport_issue_l1y             0
Competitor_in_mkt               0
retail_shop_num                 0
wh_owner_type                   0
distributor_num                 0
flood_impacted                  0
flood_proof                     0
electric_supply                 0
dist_from_hub                   0
workers_num                     0
storage_issue_reported_l3m      0
temp_reg_mach                   0
approved_wh_govt_certificate    0
wh_breakdown_l3m                0
govt_check_l3m                  0
product_wg_ton                  0
dtype: int64

In [83]:
# Count IQR-rule outliers for every numeric column of `data` and print the
# features that contain any. A value counts as an outlier when it lies more
# than 1.5 * IQR below Q1 or above Q3.
print("Feature"+" "*23+"outlier count"+" "*5+"outlier percentage" )
print("-"*70)
outlier_containing_features=[]
# Denominator for the percentage column: the actual row count rather than a
# hard-coded 25000, so the figures stay correct if the dataset size changes.
n_rows= len(data)
for feature in data.select_dtypes(['int','float']):
    Q1= data[feature].quantile(0.25)
    Q3= data[feature].quantile(0.75)
    IQR= Q3 - Q1
    outlier_count= (data[feature]>(Q3+1.5*IQR)).sum() + (data[feature]<(Q1-1.5*IQR)).sum()
    if(outlier_count>0):
        outlier_containing_features.append(feature)
        # Manual padding keeps the three columns aligned under the header printed above.
        print(feature," "*(32-len(feature)), outlier_count," "*(20-len(str(outlier_count))),
          f'{((outlier_count/n_rows)*100):.2f} %')

Feature                       outlier count     outlier percentage
----------------------------------------------------------------------
transport_issue_l1y               2943                  11.77 %
Competitor_in_mkt                 96                    0.38 %
retail_shop_num                   948                   3.79 %
flood_impacted                    2454                  9.82 %
flood_proof                       1366                  5.46 %
workers_num                       607                   2.43 %


In [84]:
# Cap extreme values: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is set to the
# nearest of those two bounds. The bounds are computed from the untouched `data`;
# the capped columns are written to the copy `data_treated`.

data_treated= data.copy()
for feature in ['transport_issue_l1y','Competitor_in_mkt','retail_shop_num','workers_num']:
    Q1= data[feature].quantile(0.25)
    Q3= data[feature].quantile(0.75)
    IQR= Q3 - Q1
    lower_limit= Q1-1.5*IQR
    upper_limit= Q3+1.5*IQR
    # Series.clip returns a new column and upcasts when a bound is fractional.
    # Writing a float bound into an integer column through .loc is pandas'
    # deprecated "incompatible dtype" assignment, which the global
    # warnings.filterwarnings('ignore') at the top of the notebook was hiding.
    data_treated[feature]= data_treated[feature].clip(lower=lower_limit, upper=upper_limit)

In [85]:
# Re-run the IQR outlier count on `data_treated` to verify the capping step:
# capped features should no longer be listed.
print("Feature"+" "*23+"outlier count"+" "*5+"outlier percentage" )
print("-"*70)
outlier_containing_features=[]
# Denominator for the percentage column: the actual row count rather than a
# hard-coded 25000, so the figures stay correct if the dataset size changes.
n_rows= len(data_treated)
for feature in data_treated.select_dtypes(['int','float']):
    Q1= data_treated[feature].quantile(0.25)
    Q3= data_treated[feature].quantile(0.75)
    IQR= Q3 - Q1
    outlier_count= (data_treated[feature]>(Q3+1.5*IQR)).sum() + (data_treated[feature]<(Q1-1.5*IQR)).sum()
    if(outlier_count>0):
        outlier_containing_features.append(feature)
        # Manual padding keeps the three columns aligned under the header printed above.
        print(feature," "*(32-len(feature)), outlier_count," "*(20-len(str(outlier_count))),
          f'{((outlier_count/n_rows)*100):.2f} %')

Feature                       outlier count     outlier percentage
----------------------------------------------------------------------
flood_impacted                    2454                  9.82 %
flood_proof                       1366                  5.46 %


In [86]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

In [87]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column groups; each group is routed to its own transformer below.
nominal_features= ['Location_type','zone','WH_regional_zone','wh_owner_type']
# NOTE(review): OrdinalEncoder() without `categories=` ranks levels in sorted
# (lexicographic) order, so e.g. 'A' is encoded below 'A+'. Confirm that this is
# the intended ranking, or pass explicit category lists.
ordinal_features= ['WH_capacity_size','approved_wh_govt_certificate']
columns= ['num_refill_req_l3m', 'transport_issue_l1y', 'Competitor_in_mkt',
       'retail_shop_num', 'distributor_num', 'flood_impacted',
       'flood_proof', 'electric_supply', 'dist_from_hub', 'workers_num',
       'storage_issue_reported_l3m', 'temp_reg_mach',
        'wh_breakdown_l3m', 'govt_check_l3m',
        ]

# One-hot encode nominal columns, integer-encode ordinal columns, standardise
# the numeric columns; any column not listed above is passed through unchanged.
preprocessor= ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that was not present at fit time is
        # encoded as an all-zero row at transform time instead of raising, so the
        # pickled preprocessor does not crash on unexpected input.
        ('OneHotEncoding', OneHotEncoder(handle_unknown='ignore'), nominal_features),
        ('LabelEncoding', OrdinalEncoder(), ordinal_features),
        ('StandardScaler', StandardScaler(), columns),
    ], remainder='passthrough'
)

In [88]:
# nominal_features= ['Location_type','zone','WH_regional_zone','wh_owner_type']
# ordinal_features= ['WH_capacity_size','approved_wh_govt_certificate']
# columns= ['num_refill_req_l3m', 'transport_issue_l1y', 'Competitor_in_mkt',
#        'retail_shop_num', 'distributor_num', 'flood_impacted',
#        'flood_proof', 'electric_supply', 'dist_from_hub', 'workers_num',
#        'storage_issue_reported_l3m', 'temp_reg_mach',
#         'wh_breakdown_l3m', 'govt_check_l3m',
#         ]

# one_hot= OneHotEncoder()
# for feature in nominal_features:
#     data[feature]= one_hot.fit_transform()


In [89]:
# Fit the encoders and scaler on every row of the feature table (target column removed).
# NOTE(review): this runs before the train/test split performed further down, so the
# fitted statistics and categories also include rows that later end up in the test set.
preprocessor.fit(data_treated.drop('product_wg_ton', axis=1))

In [90]:
import pickle


# Persist the fitted preprocessor; it is loaded back in the prediction section.
# Forward slash instead of backslash: '\p' is an invalid escape sequence in a
# normal string literal (Python warns about it), and a backslash is only a
# directory separator on Windows.
with open('artifacts/preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [91]:
from sklearn.model_selection import train_test_split

# Separate features and target by column name rather than by position, so the
# split stays correct if the column order changes, and matches the name-based
# drop used when fitting the preprocessor.
X= data_treated.drop(columns='product_wg_ton')
y= data_treated['product_wg_ton']

In [92]:
# Hold out 20% of the rows as a test set; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=1)

In [93]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score

In [94]:
def evaluate_model(actual, predicted):
    """Score regression predictions against the true values.

    Args:
        actual: Ground-truth target values.
        predicted: Model predictions aligned with ``actual``.

    Returns:
        A 4-tuple ``(r2, mae, mse, rmse)``: R^2 score, mean absolute error,
        mean squared error, and the square root of the mean squared error.
    """
    squared_error = mean_squared_error(actual, predicted)
    return (
        r2_score(actual, predicted),
        mean_absolute_error(actual, predicted),
        squared_error,
        np.sqrt(squared_error),  # RMSE is derived from the MSE computed above
    )

In [95]:
# Candidate regressors, keyed by the display name printed during evaluation.
models= {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(max_iter=1000),
    'Ridge':Ridge(),
    # random_state pins the forest's internal randomness so its reported metrics
    # are repeatable across runs (same seed value as the train/test split).
    'RandomForestRegressor':RandomForestRegressor(n_estimators=100, random_state=1)
}

In [96]:
# Apply the already-fitted preprocessor to both splits (transform only, no refit).
# Lower-case x_* hold the transformed matrices; upper-case X_* remain the raw frames.
x_train= preprocessor.transform(X_train)
x_test= preprocessor.transform(X_test)

In [97]:
# Fit every candidate on the training split and report its metrics on the
# held-out test split.
for model_name, model in models.items():
    # Printed once, before fitting, so slower models show which one is running.
    print(model_name)
    model.fit(x_train, y_train)
    y_pred_test= model.predict(x_test)
    r2, mae, mse, rmse= evaluate_model(y_test, y_pred_test)
    # The metrics come from y_test / x_test, so the heading says "Test".
    print('Model Test Performance')
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 score: {r2*100:.2f}")
    print("=" *30)

LinearRegression
LinearRegression
Model Training Performance
MSE: 2913006.24
RMSE: 1706.75
MAE: 1263.13
R2 score: 97.82
Lasso
Lasso
Model Training Performance
MSE: 2912479.72
RMSE: 1706.60
MAE: 1262.74
R2 score: 97.82
Ridge
Ridge
Model Training Performance
MSE: 2913884.93
RMSE: 1707.01
MAE: 1263.34
R2 score: 97.82
RandomForestRegressor
RandomForestRegressor
Model Training Performance
MSE: 874989.98
RMSE: 935.41
MAE: 700.11
R2 score: 99.34


In [98]:
# Train a standalone linear regression (the estimator pickled below) and
# report its R^2 on the test split as a percentage.
Lin_Reg = LinearRegression().fit(x_train, y_train)  # fit() returns the estimator itself
y_pred_test = Lin_Reg.predict(x_test)
print(f"R2 of Linear regression : {(r2_score(y_test, y_pred_test))*100:.2f} %")

R2 of Linear regression : 97.82 %


In [99]:
import pickle

# Persist the trained linear model; it is loaded back in the prediction section.
# Forward slash instead of backslash: '\m' is an invalid escape sequence in a
# normal string literal (Python warns about it), and a backslash is only a
# directory separator on Windows.
with open('artifacts/model.pkl', 'wb') as model_file:
    pickle.dump(Lin_Reg, model_file)

# Prediction

In [100]:
# Reload the fitted preprocessor from disk, as inference code would.
# Forward slash instead of backslash: '\p' is an invalid escape sequence and a
# backslash is only a directory separator on Windows.
# NOTE: pickle.load can execute arbitrary code; only load files this project produced.
with open('artifacts/preprocessor.pkl','rb') as preprocessor_file:
    preprocessor= pickle.load(preprocessor_file)

In [101]:
# Wrap the second training row in a one-row DataFrame and show its index label.
pd.DataFrame([X_train.iloc[1,:]]).index

Index([6085], dtype='int64')

In [102]:
# Reload the trained model from disk, as inference code would.
# Forward slash instead of backslash: '\m' is an invalid escape sequence and a
# backslash is only a directory separator on Windows.
# NOTE: pickle.load can execute arbitrary code; only load files this project produced.
with open('artifacts/model.pkl','rb') as model_file:
    model= pickle.load(model_file)

In [103]:
# Raw (untransformed) feature values of the third training row.
pd.DataFrame([X_train.iloc[2,:]]).values

array([['Rural', 'Small', 'South', 'Zone 6', 3, 2.0, 3, 4733.0, 'Rented',
        41, 0, 0, 0, 175, 27.0, 20, 0, 'A', 4, 2]], dtype=object)

In [104]:
# Full list of feature column names for a hand-built input row.
# NOTE: this rebinds `columns`, which earlier in the notebook held only the
# numeric feature names passed to the StandardScaler.
columns= ['Location_type', 'WH_capacity_size', 'zone', 'WH_regional_zone',
       'num_refill_req_l3m', 'transport_issue_l1y', 'Competitor_in_mkt',
       'retail_shop_num', 'wh_owner_type', 'distributor_num', 'flood_impacted',
       'flood_proof', 'electric_supply', 'dist_from_hub', 'workers_num',
       'storage_issue_reported_l3m', 'temp_reg_mach',
       'approved_wh_govt_certificate', 'wh_breakdown_l3m', 'govt_check_l3m']

# A single sample written out by hand, one value per name in `columns`.
values= [['Rural', 'Small', 'South', 'Zone 6', 3, 2.0, 3, 4733.0, 'Rented',
        41, 0, 0, 0, 175, 27.0, 20, 0, 'A', 4, 2]]

# Build a one-row frame, run it through the reloaded preprocessor, then
# predict with the reloaded model.
input_data= pd.DataFrame(values, columns= columns)
pred_data= preprocessor.transform(input_data)
model.predict(pred_data)

array([24976.125])

In [105]:
# Actual target value of the third training row.
y_train.iloc[2]

25148

In [106]:
def build_json(n, data):
    """Print row ``n`` of ``data`` as a JSON object, one key per line.

    Every value is written as a JSON string (``str(value)``). Keys and values
    are escaped with ``json.dumps`` and the entries are comma-separated, so
    the printed text is valid JSON: there is no trailing comma before the
    closing brace, and embedded quotes or backslashes are escaped.

    Args:
        n: Positional (``iloc``) index of the row to print.
        data: DataFrame to read the row from.

    Returns:
        None. The JSON text is written to stdout.
    """
    import json  # local import: this helper is the only user in the notebook

    entries = [
        f"{json.dumps(str(key), ensure_ascii=False)}:{json.dumps(str(value), ensure_ascii=False)}"
        for key, value in data.iloc[n, :].items()
    ]
    print("{")
    if entries:
        # Join with separators instead of appending a comma to every line.
        print(",\n".join(entries))
    print("}")
    

In [107]:
# Print the features of the test row at position 1000.
build_json(1000, X_test)

{
"Location_type":"Rural",
"WH_capacity_size":"Small",
"zone":"North",
"WH_regional_zone":"Zone 6",
"num_refill_req_l3m":"5",
"transport_issue_l1y":"1.0",
"Competitor_in_mkt":"3",
"retail_shop_num":"4105.0",
"wh_owner_type":"Company Owned",
"distributor_num":"66",
"flood_impacted":"0",
"flood_proof":"0",
"electric_supply":"0",
"dist_from_hub":"104",
"workers_num":"24.0",
"storage_issue_reported_l3m":"8",
"temp_reg_mach":"0",
"approved_wh_govt_certificate":"B",
"wh_breakdown_l3m":"5",
"govt_check_l3m":"9",
}


In [108]:
# Actual target value of the test row at position 1000.
y_test.iloc[1000]

10059

In [6]:
# Feature names for which an HTML form field is generated.
cols = [
    'Location_type', 'WH_capacity_size', 'zone', 'WH_regional_zone',
    'num_refill_req_l3m', 'transport_issue_l1y', 'Competitor_in_mkt',
    'retail_shop_num', 'wh_owner_type', 'distributor_num', 'flood_impacted',
    'flood_proof', 'electric_supply', 'dist_from_hub', 'workers_num',
    'storage_issue_reported_l3m', 'temp_reg_mach',
    'approved_wh_govt_certificate', 'wh_breakdown_l3m', 'govt_check_l3m',
]

# Emit one <input> tag per feature, using the column name as both the field
# name and the placeholder text; one tag per output line.
print("\n".join(
    f'<input name="{field_name}" placeholder="{field_name}">'
    for field_name in cols
))


<input name="Location_type" placeholder="Location_type">
<input name="WH_capacity_size" placeholder="WH_capacity_size">
<input name="zone" placeholder="zone">
<input name="WH_regional_zone" placeholder="WH_regional_zone">
<input name="num_refill_req_l3m" placeholder="num_refill_req_l3m">
<input name="transport_issue_l1y" placeholder="transport_issue_l1y">
<input name="Competitor_in_mkt" placeholder="Competitor_in_mkt">
<input name="retail_shop_num" placeholder="retail_shop_num">
<input name="wh_owner_type" placeholder="wh_owner_type">
<input name="distributor_num" placeholder="distributor_num">
<input name="flood_impacted" placeholder="flood_impacted">
<input name="flood_proof" placeholder="flood_proof">
<input name="electric_supply" placeholder="electric_supply">
<input name="dist_from_hub" placeholder="dist_from_hub">
<input name="workers_num" placeholder="workers_num">
<input name="storage_issue_reported_l3m" placeholder="storage_issue_reported_l3m">
<input name="temp_reg_mach" plac

In [None]:
[Location_type,WH_capacity_size,zone,WH_regional_zone,num_refill_req_l3m,transport_issue_l1y,Competitor_in_mkt,retail_shop_num,wh_owner_type,distributor_num,flood_impacted,flood_proof,electric_supply,dist_from_hub,workers_num,storage_issue_reported_l3m,temp_reg_mach,approved_wh_govt_certificate,wh_breakdown_l3m,govt_check_l3m,