In [14]:
import pandas as pd

# Load the BigMart sales table; the path is relative to the working directory.
df = pd.read_csv('data/bigmart.csv')

In [15]:
# Preview the first rows to sanity-check the parsed columns and values.
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [16]:
# (rows, columns) of the raw table, before any rows are dropped.
df.shape

(8523, 12)

In [17]:
# Column dtypes: `object` columns hold strings and need encoding before modelling.
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [18]:
# Number of distinct values in each string-typed column (cardinality check
# ahead of encoding them as integers).
df[['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size',
    'Outlet_Location_Type', 'Outlet_Type']].nunique()

Item_Identifier         1559
Item_Fat_Content           5
Item_Type                 16
Outlet_Identifier         10
Outlet_Size                3
Outlet_Location_Type       3
Outlet_Type                4
dtype: int64

In [19]:
# Keep only rows with no missing value in any column (the preview above shows
# at least one row with an empty Outlet_Size). Rebinding instead of mutating
# in place keeps the cell chain-friendly; the resulting frame is the same.
df = df.dropna()

In [23]:
from sklearn.preprocessing import LabelEncoder

# Replace every string-valued column with integer codes. A fresh LabelEncoder
# is fitted per column, so codes are independent from one column to the next.
for column in (
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
):
    df[column] = LabelEncoder().fit_transform(df[column])

In [24]:
from sklearn.model_selection import cross_val_score, ShuffleSplit

# Target: the sales figure to predict.
y = df['Item_Outlet_Sales']
# Features: every remaining column.
X = df.drop(columns=['Item_Outlet_Sales'])

In [39]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor

# Fixed seed so the scores printed by this cell are reproducible on a fresh kernel.
SEED = 42

# Candidate regressors; the stochastic ones are seeded for repeatable fits.
models = [
    LinearRegression(),
    RandomForestRegressor(random_state=SEED),
    DecisionTreeRegressor(min_samples_leaf=100, random_state=SEED),
    LGBMRegressor(random_state=SEED)
]

# One seeded splitter shared by all models: each model is scored on the same
# 10 random 80/20 train/test splits, so differences in R^2 come from the
# models rather than from the luck of the split.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=SEED)

for model in models:
    scores = cross_val_score(model, X, y, scoring='r2', cv=cv)
    # Mean and standard deviation of R^2 over the 10 splits.
    print("Model %s score: %0.2f (+/- %0.2f)" % (model.__class__.__name__, scores.mean(), scores.std()))

Model LinearRegression score: 0.46 (+/- 0.02)
Model RandomForestRegressor score: 0.37 (+/- 0.02)
Model DecisionTreeRegressor score: 0.44 (+/- 0.02)
Model LGBMRegressor score: 0.41 (+/- 0.02)


In [43]:
# Fixed seed so the scores printed by this cell are reproducible on a fresh kernel.
SEED = 42

# Candidate regressors; the stochastic ones are seeded for repeatable fits.
models = [
    LinearRegression(),
    RandomForestRegressor(random_state=SEED),
    DecisionTreeRegressor(min_samples_leaf=100, random_state=SEED),
    LGBMRegressor(random_state=SEED)
]

# One seeded splitter shared by all models: each model is scored on the same
# 10 random 80/20 train/test splits, so differences in error come from the
# models rather than from the luck of the split.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=SEED)

for model in models:
    # 'neg_mean_absolute_error' yields the negated MAE, so values closer to
    # zero are better.
    scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv)
    print("Model %s score: %0.2f (+/- %0.2f)" % (model.__class__.__name__, scores.mean(), scores.std()))

Model LinearRegression score: -802.75 (+/- 25.48)
Model RandomForestRegressor score: -865.50 (+/- 18.81)
Model DecisionTreeRegressor score: -836.24 (+/- 17.26)
Model LGBMRegressor score: -845.96 (+/- 24.12)


In [40]:
from sklearn.metrics import make_scorer
import numpy as np

def rmse(predictions, targets):
    """Return the root-mean-squared error between two equally long sequences.

    Both inputs are converted to float NumPy arrays first, so that:

    * plain Python lists/tuples are accepted, and
    * two pandas Series are compared position by position instead of being
      aligned on their index labels (label alignment would produce NaN
      whenever the indexes differ).

    The metric is symmetric, so the order of the two arguments does not
    affect the result.

    Parameters
    ----------
    predictions, targets : array-like of numbers
        Values to compare element-wise.

    Returns
    -------
    float
        sqrt(mean((predictions - targets) ** 2)).
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(np.mean((predictions - targets) ** 2))

# Wrap rmse as a scikit-learn scorer. greater_is_better=False marks it as a
# loss, so the scorer reports the negated RMSE (higher, i.e. closer to zero,
# is better).
rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [44]:
# Fixed seed so the scores printed by this cell are reproducible on a fresh kernel.
SEED = 42

# Candidate regressors; the stochastic ones are seeded for repeatable fits.
models = [
    LinearRegression(),
    RandomForestRegressor(random_state=SEED),
    DecisionTreeRegressor(min_samples_leaf=100, random_state=SEED),
    LGBMRegressor(random_state=SEED)
]

# One seeded splitter shared by all models: each model is scored on the same
# 10 random 80/20 train/test splits, so differences in error come from the
# models rather than from the luck of the split.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=SEED)

for model in models:
    # rmse_scorer reports the negated RMSE, so values closer to zero are better.
    scores = cross_val_score(model, X, y, scoring=rmse_scorer, cv=cv)
    print("Model %s score: %0.2f (+/- %0.2f)" % (model.__class__.__name__, scores.mean(), scores.std()))

Model LinearRegression score: -1085.88 (+/- 46.39)
Model RandomForestRegressor score: -1211.28 (+/- 22.97)
Model DecisionTreeRegressor score: -1110.29 (+/- 46.60)
Model LGBMRegressor score: -1154.70 (+/- 40.48)
