### Import Data

In [1]:
import pandas as pd
df = pd.read_csv('big-mart-sales.csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [2]:
df.shape

(8523, 12)

### Feature Engineering

#### Fill NaN Values

In [3]:
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [4]:
df.drop(['Item_Weight', 'Outlet_Size'], axis=1, inplace=True)
df.isnull().sum()

Item_Identifier              0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

#### Convert Categorical Variables to Numerical Variables

In [5]:
cat_cols = [col for col in df.columns if df[col].dtype=='O']
cat_cols

['Item_Identifier',
 'Item_Fat_Content',
 'Item_Type',
 'Outlet_Identifier',
 'Outlet_Location_Type',
 'Outlet_Type']

In [6]:
for col in cat_cols:
    print(col)
    print(df[col].unique())
    print()

Item_Identifier
['FDA15' 'DRC01' 'FDN15' ... 'NCF55' 'NCW30' 'NCW05']

Item_Fat_Content
['Low Fat' 'Regular' 'low fat' 'LF' 'reg']

Item_Type
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']

Outlet_Identifier
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']

Outlet_Location_Type
['Tier 1' 'Tier 3' 'Tier 2']

Outlet_Type
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']



In column `Item_Fat_Content`, `Low Fat`, `low fat`, and `LF` have the same meaning, same goes for `Regular` and `reg`.

In [7]:
df['Item_Fat_Content'] = df['Item_Fat_Content'].apply(lambda x: x.lower()[0])
df.head()

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,l,0.016047,Dairy,249.8092,OUT049,1999,Tier 1,Supermarket Type1,3735.138
1,DRC01,r,0.019278,Soft Drinks,48.2692,OUT018,2009,Tier 3,Supermarket Type2,443.4228
2,FDN15,l,0.01676,Meat,141.618,OUT049,1999,Tier 1,Supermarket Type1,2097.27
3,FDX07,r,0.0,Fruits and Vegetables,182.095,OUT010,1998,Tier 3,Grocery Store,732.38
4,NCD19,l,0.0,Household,53.8614,OUT013,1987,Tier 3,Supermarket Type1,994.7052


In [8]:
print(df['Item_Fat_Content'].unique())

['l' 'r']


In [9]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()

df[cat_cols] = encoder.fit_transform(df[cat_cols])
    
df.head()

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156.0,0.0,0.016047,4.0,249.8092,9.0,1999,0.0,1.0,3735.138
1,8.0,1.0,0.019278,14.0,48.2692,3.0,2009,2.0,2.0,443.4228
2,662.0,0.0,0.01676,10.0,141.618,9.0,1999,0.0,1.0,2097.27
3,1121.0,1.0,0.0,6.0,182.095,0.0,1998,2.0,0.0,732.38
4,1297.0,0.0,0.0,9.0,53.8614,1.0,1987,2.0,1.0,994.7052



### Get Dependent and Independent Variables

In [10]:
X = df.drop('Item_Outlet_Sales', axis=1)
y = df['Item_Outlet_Sales']

#### Scale Data

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

### Train-Test Split

In [12]:
from sklearn.model_selection import train_test_split

seed = 2022
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

print('Training Data:', X_train.shape)
print('Test Data:', X_test.shape)

Training Data: (6392, 9)
Test Data: (2131, 9)


### Model Building

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

linear = LinearRegression()
tree = DecisionTreeRegressor()
forest = RandomForestRegressor()
svm = SVR()
knn = KNeighborsRegressor()
grad = GradientBoostingRegressor()


### Model Training and Evaluation

In [14]:
models = [linear, tree, forest, svm, knn, grad]
model_names = [
    'Linear Regression',
    'Decision Tree',
    'Random Forest',
    'Support Vector Machines',
    'K Nearest Neighbors',
    'Gradient Boosting'
]

In [15]:
from sklearn.metrics import mean_absolute_error, r2_score

for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_pred, y_test)
    r2 = r2_score(y_pred, y_test)
    
    print(model_name)
    print('========================')
    print('Mean Absolute Error: %.4f'%(mae))
    print('R Squared: %.2f%%'%(r2*100))
    print()

Linear Regression
Mean Absolute Error: 898.3784
R Squared: -0.28%

Decision Tree
Mean Absolute Error: 1085.0547
R Squared: 15.71%

Random Forest
Mean Absolute Error: 800.4028
R Squared: 28.39%

Support Vector Machines
Mean Absolute Error: 1225.2807
R Squared: -21641.04%

K Nearest Neighbors
Mean Absolute Error: 815.2561
R Squared: 22.97%

Gradient Boosting
Mean Absolute Error: 758.7640
R Squared: 27.28%



In [16]:
print('Mean Item Outlet Sale Price: %.4f'%y.mean())

Mean Item Outlet Sale Price: 2181.2889
