In [52]:
import pandas as pd 
import numpy as np 
import matplotlib as plt
from sklearn.model_selection import train_test_split

In [34]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Data exploration

In [35]:
df_train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,Y
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,8.23
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,6.09
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,7.65
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,6.6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,6.9


In [36]:
df_train["X1"].value_counts()

X1
FDP28    8
DRE49    8
NCF42    8
NCQ43    8
FDW24    8
        ..
FDN50    1
FDY43    1
FDM57    1
FDZ50    1
FDX13    1
Name: count, Length: 1553, dtype: int64

In [14]:
df_train["X9"].value_counts()

X9
Medium    1935
Small     1682
High       672
Name: count, dtype: int64

In [29]:
frequency = df_train['X9'].value_counts(normalize=True)
print(frequency)

print("\nThe size if the Data frame is: " , df_train.shape)

X9
Medium    0.451154
Small     0.392166
High      0.156680
Name: proportion, dtype: float64

The size if the Data frame is:  (6000, 12)


# Data cleaning

### We need to handle columns With missing values

In [43]:
df_train = df_train.drop_duplicates()
df_train

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,Y
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,8.23
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,6.09
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,7.65
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,6.60
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,6.90
...,...,...,...,...,...,...,...,...,...,...,...,...
5995,FDB32,20.600,Low Fat,0.023586,Fruits and Vegetables,94.7778,OUT017,2007,,Tier 2,Supermarket Type1,7.18
5996,FDJ16,9.195,Low Fat,0.115064,Frozen Foods,58.6246,OUT049,1999,Medium,Tier 1,Supermarket Type1,6.77
5997,FDJ32,10.695,Low Fat,0.057910,Fruits and Vegetables,60.4536,OUT045,2002,,Tier 2,Supermarket Type1,5.21
5998,FDO12,15.750,Low Fat,0.054920,Baking Goods,195.8452,OUT035,2004,Small,Tier 2,Supermarket Type1,8.50


No raw removed Which mean no duplicates in the dataframe

### Now we fill the missing values in the dataframe 

In [44]:
df_train["X2"].isnull().value_counts()

X2
False    4994
True     1006
Name: count, dtype: int64

In [45]:
df_train["X9"].isnull().value_counts()

X9
False    4289
True     1711
Name: count, dtype: int64

In [46]:
df_train["X2"].fillna(df_train["X2"].median() , inplace=True )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["X2"].fillna(df_train["X2"].median() , inplace=True )


In [47]:
df_train["X2"].isnull().value_counts()

X2
False    6000
Name: count, dtype: int64

In [48]:
df_train["X9"].fillna( "missing" , inplace=True )

# df_filled = df_train.fillna({'X9': 'Unknown', 'X2': df_train['X2'].median()}) #short cut for the filling Code

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["X9"].fillna( "missing" , inplace=True )


In [49]:
df_train["X9"].isnull().value_counts()

X9
False    6000
Name: count, dtype: int64

In [50]:
df_train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,Y
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,8.23
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,6.09
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,7.65
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,missing,Tier 3,Grocery Store,6.6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,6.9


## Data Splitting

In [57]:
x = df_train.drop("Y" , axis = "columns")
y = df_train["Y"]

In [58]:
x_train,x_val,y_train,y_val = train_test_split(x,y , random_state=42 , test_size= 0.2)