In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train.txt')
test = pd.read_csv('test.txt')

train.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [4]:
# Split columns into categoriacal and numerical
train.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [10]:
numerical = ["Item_Weight", "Item_Visibility", "Outlet_Establishment_Year", "Item_Outlet_Sales"]
categorical = [x for x in train.columns if x not in numerical]

print(numerical)
print(categorical)

['Item_Weight', 'Item_Visibility', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']
['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [12]:
train[categorical].isnull().sum()

Item_Identifier            0
Item_Fat_Content           0
Item_Type                  0
Item_MRP                   0
Outlet_Identifier          0
Outlet_Size             2410
Outlet_Location_Type       0
Outlet_Type                0
dtype: int64

In [24]:
# To handle null values, we associate outlet type with outlet size 
# and by using K-means clustering we will fill the required values
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

fill_na_train = train[["Outlet_Size", "Outlet_Type"]][train["Outlet_Size"].notnull()]
fill_na_test = train[["Outlet_Size", "Outlet_Type"]][train["Outlet_Size"].isnull()]

print(fill_na_train.head())
print(fill_na_test.head())

  Outlet_Size        Outlet_Type
0      Medium  Supermarket Type1
1      Medium  Supermarket Type2
2      Medium  Supermarket Type1
4        High  Supermarket Type1
5      Medium  Supermarket Type2
   Outlet_Size        Outlet_Type
3          NaN      Grocery Store
8          NaN  Supermarket Type1
9          NaN  Supermarket Type1
25         NaN  Supermarket Type1
28         NaN      Grocery Store


In [33]:
encoder = LabelEncoder()
fill_na_train['Outlet_Size'] = encoder.fit_transform(fill_na_train['Outlet_Size'])
fill_na_test['Outlet_Size'] = encoder.fit_transform(fill_na_test['Outlet_Size'])
fill_na_train['Outlet_Type'] = encoder.fit_transform(fill_na_train['Outlet_Type'])
fill_na_test['Outlet_Type'] = encoder.fit_transform(fill_na_test['Outlet_Type'])

fill_na_train.describe()

Unnamed: 0,Outlet_Size,Outlet_Type
count,6113.0,6113.0
mean,1.238181,1.37134
std,0.697463,0.843928
min,0.0,0.0
25%,1.0,1.0
50%,1.0,1.0
75%,2.0,2.0
max,2.0,3.0


In [41]:
X = fill_na_train.iloc[:, :].values
y_test = fill_na_test['Outlet_Type'].values
y_test

array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

In [37]:
kmeans = KMeans(n_clusters=2, max_iter=5000, algorithm='auto')
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=5000,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)