In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder , OrdinalEncoder , StandardScaler
from sklearn.compose import ColumnTransformer
import joblib

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.head()

Unnamed: 0,Item_ID,Item_W,Item_Type,Item_MRP,Outlet_ID,Outlet_Year,Outlet_Size,Outlet_Location_Type,Sales
0,FDU32,21.027499,Baking Goods,197.352319,OUT046,2004,Small,Tier 2,2689.457781
1,NCT54,21.102371,Meat,148.250214,OUT035,1987,Small,Tier 1,3437.350375
2,FDW08,20.882263,Hard Drinks,205.46501,OUT035,1999,Small,Tier 3,3129.967268
3,FDJ22,21.050435,Starchy Foods,253.417583,OUT046,1996,Small,Tier 1,1306.514376
4,FDF47,21.247876,Baking Goods,240.871039,OUT035,1988,Small,Tier 3,1739.769829


In [4]:
category_col =data.select_dtypes(exclude=np.number).drop(columns=["Item_ID","Outlet_ID"]).columns

In [5]:
category_col

Index(['Item_Type', 'Outlet_Size', 'Outlet_Location_Type'], dtype='object')

In [6]:
numeric_col = data.select_dtypes(include=np.number).drop(columns=["Item_W","Sales"]).columns

In [7]:
numeric_col

Index(['Item_MRP', 'Outlet_Year'], dtype='object')

In [8]:
category_pipe =  Pipeline(steps=[('encode',OrdinalEncoder())])

In [9]:
numeric_pipe =  Pipeline(steps=[("impute",SimpleImputer()),('scale',StandardScaler())])

In [10]:
preprocessing_pipe = ColumnTransformer([
               ('cat',category_pipe,category_col),
               ('num',numeric_pipe,numeric_col) 
])

In [11]:
pipe = Pipeline([('preprocess',preprocessing_pipe),('GradientBoostingRegressor',GradientBoostingRegressor())])

In [12]:
X = data[["Item_Type","Item_MRP"]]

In [13]:
y = data["Sales"]

In [14]:
pipe.fit(X,y)

ValueError: A given column is not a column of the dataframe

In [72]:
pipe.predict(train_X)

array([1975.33334704, 1883.37411512, 1946.69283428, ..., 1884.86247259,
       1563.64080617, 1822.15140387])

In [54]:
X.head()

Unnamed: 0,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,...,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Item_MRP,Outlet_Year
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0.792848,0.608131
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,-0.022995,-1.925332
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0.927642,-0.137005
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,1.724386,-0.584087
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1.515922,-1.776305


In [21]:
joblib.dump(pipe,"pipe.pkl")

['pipe.pkl']