# KNN Regression

##  Problem Statement

### Predict Item Sales Price

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df=pd.read_csv("Data/Train.csv")

In [3]:
df.shape

(8523, 12)

In [4]:
df.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [5]:
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

1. Item_Outlet_Sales is dependent varaible. Others are independent variables
2. No need to change any data type
3. Total Number of independent variables: 11
4. Categorical independent Variables: 7 variables -> Item_Identifier, Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
5.Continuous independent varaibles: 4 variables - > Item_Weight,
6. Model: Supervised Learning; As Target/Dependent varaible is continuous in nature, regression model is better suitable model.


| Column | Description|
|---|--|
|Item_Identifier | Unique Prodcut ID|
|Item_Weight  | Weight Of the product|
|Item_Fat_Content | Fat content in thhe product|
|Item_Visibility | %of total display are of the product|
|Item_Type | Category of product|
|Item_MRP  |Maximum retail price of the product|
|Outlet_Identifier| Unique Store ID|
|Outlet_Establishment_Year | Store Estabilishment Year|
|Outlet_Size| size of the store|
|Outlet_Location_Type  | Type of the city where store is located|
|Outlet_Type| Type of the outlet (super market or grocery store)|
|Item_Outlet_Sales|Sales of the product in the particular store|

In [6]:
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Hypothesis

1. Items sales increases with decrease in FAT content
2. MRP of the product affects te sales price for the particular product category
3. Size and city of the product affect the sales price of the item
4. Outlet type affect the sales of the product. 

# Missing Values

In [7]:
df.describe(include='all')

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
count,8523,7060.0,8523,8523.0,8523,8523.0,8523,8523.0,6113,8523,8523,8523.0
unique,1559,,5,,16,,10,,3,3,4,
top,FDW13,,Low Fat,,Fruits and Vegetables,,OUT027,,Medium,Tier 3,Supermarket Type1,
freq,10,,5089,,1232,,935,,2793,3350,5577,
mean,,12.857645,,0.066132,,140.992782,,1997.831867,,,,2181.288914
std,,4.643456,,0.051598,,62.275067,,8.37176,,,,1706.499616
min,,4.555,,0.0,,31.29,,1985.0,,,,33.29
25%,,8.77375,,0.026989,,93.8265,,1987.0,,,,834.2474
50%,,12.6,,0.053931,,143.0128,,1999.0,,,,1794.331
75%,,16.85,,0.094585,,185.6437,,2004.0,,,,3101.2964


Missing Values in : Item_Weight, Outlet_Size

In [8]:
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

## separate features and target

In [9]:
#separate features and target
x=df.drop(['Item_Identifier','Item_Outlet_Sales'],axis=1)
y=df['Item_Outlet_Sales']

In [10]:
x.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


In [11]:
x.dtypes

Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
dtype: object

In [12]:
x.isnull().sum()

Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [13]:
x['Item_Weight'].describe()

count    7060.000000
mean       12.857645
std         4.643456
min         4.555000
25%         8.773750
50%        12.600000
75%        16.850000
max        21.350000
Name: Item_Weight, dtype: float64

In [14]:
mean_val=x['Item_Weight'].mean()
x['Item_Weight'].fillna(value=mean_val,inplace=True)

In [15]:
x['Item_Weight'].head(10)

0     9.300000
1     5.920000
2    17.500000
3    19.200000
4     8.930000
5    10.395000
6    13.650000
7    12.857645
8    16.200000
9    19.200000
Name: Item_Weight, dtype: float64

In [16]:
x['Outlet_Size'].mode()

0    Medium
dtype: object

In [17]:
x['Outlet_Size'].isnull().sum()

2410

In [18]:
mode_val=x['Outlet_Size'].mode()[0]
mode_val

'Medium'

In [19]:
x['Outlet_Size'].fillna(value=mode_val,inplace=True)

In [20]:
x['Outlet_Size'].isnull().sum()

0

In [21]:
x.isnull().sum()

Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

## Handling Categorical Varaibles

In [22]:
x.dtypes

Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
dtype: object

In [23]:
features=x.columns
features

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [24]:
x=pd.get_dummies(features)

In [25]:
x.head()

Unnamed: 0,Item_Fat_Content,Item_MRP,Item_Type,Item_Visibility,Item_Weight,Outlet_Establishment_Year,Outlet_Identifier,Outlet_Location_Type,Outlet_Size,Outlet_Type
0,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0


In [26]:
x.shape,y.shape

((10, 10), (8523,))

##  Scaling the data (Using MinMax Scaler)

In [27]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
x_scaled=scaler.fit_transform(x)
x=pd.DataFrame(x_scaled)

## split train and test dataset

In [28]:
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y=train_test_split(x,y,random_state=56)

ValueError: Found input variables with inconsistent numbers of samples: [10, 8523]

In [29]:
train_x.shape,train_y.shape,test_x.shape,test_y.shape

NameError: name 'train_x' is not defined

## Implementing KNN Regressor

In [30]:
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.metrics import mean_squared_error as mse

In [31]:
reg=KNN(n_neighbors=5)
reg.fit(train_x,train_y)
test_predict=reg.predict(test_x)
mse_val=mse(test_predict,test_y)
print('MSE =', mse_val)

NameError: name 'train_x' is not defined

## Elbow for the classfier

In [32]:
def Elbow(K):
    test_mse=[]
    for i in K:
        reg_i=KNN(n_neighbors=i)
        reg_i.fit(train_x,train_y)
        test_predict_i=reg_i.predict(test_x)
        mse_val_i=mse(test_predict_i,test_y)
        test_mse.append(mse_val_i)
    return test_mse

In [33]:
k=range(1,40)

In [34]:
test_k=Elbow(k)

NameError: name 'train_x' is not defined

In [35]:
plt.plot(k,test_k)

NameError: name 'test_k' is not defined

In [36]:
reg=KNN(n_neighbors=9)
reg.fit(train_x,train_y)
test_predict=reg.predict(test_x)
mse_val=mse(test_predict,test_y)
print('MSE =', mse_val)

NameError: name 'train_x' is not defined