In [121]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder , OrdinalEncoder , StandardScaler
from sklearn.compose import ColumnTransformer
import joblib

In [45]:
data = pd.read_csv("bigmart.csv")

In [46]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [124]:
data["Item_Type"].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [131]:
data["Outlet_Type"].value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

In [47]:
data.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [95]:
category_col =data.select_dtypes(exclude=np.number).drop(columns=["Item_Identifier","Item_Fat_Content","Outlet_Identifier","Outlet_Location_Type","Outlet_Size"]).columns

In [96]:
category_col

Index(['Item_Type', 'Outlet_Type'], dtype='object')

In [99]:
numeric_col = data.select_dtypes(include=np.number).drop(columns=["Item_Weight","Item_Visibility","Outlet_Establishment_Year","Item_Outlet_Sales"]).columns

In [100]:
numeric_col

Index(['Item_MRP'], dtype='object')

In [114]:
category_pipe =  Pipeline(steps=[('encode',OrdinalEncoder())])

In [115]:
numeric_pipe =  Pipeline(steps=[("impute",SimpleImputer()),('scale',StandardScaler())])

In [116]:
preprocessing_pipe = ColumnTransformer([
               ('cat',category_pipe,category_col),
               ('num',numeric_pipe,numeric_col) 
])

In [117]:
pipe = Pipeline([('preprocess',preprocessing_pipe),('LinearRegression',LinearRegression())])

In [118]:
X = data[["Item_Type","Outlet_Type","Item_MRP"]]

In [119]:
y = data["Item_Outlet_Sales"]

In [120]:
pipe.fit(X,y)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('encode',
                                                                   OrdinalEncoder())]),
                                                  Index(['Item_Type', 'Outlet_Type'], dtype='object')),
                                                 ('num',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                   StandardScaler())]),
                                                  Index(['Item_MRP'], dtype='object'))])),
                ('LinearRegression', LinearRegression())])

In [126]:
test_data = pd.DataFrame(["Dairy","Supermarket Type1","300"]).T

In [127]:
test_data.columns = X.columns

In [128]:
test_data

Unnamed: 0,Item_Type,Outlet_Type,Item_MRP
0,Dairy,Supermarket Type1,300


In [129]:
pipe.predict(test_data)

array([4488.17473674])

In [None]:
#Model Dump

In [130]:
joblib.dump(pipe,"pipe.pkl")

['pipe.pkl']