In [1]:
import seaborn as sns

In [2]:
# Fetch seaborn's "tips" example dataset as a pandas DataFrame.
tips_df = sns.load_dataset(name='tips')

In [3]:
# Preview the first five rows of the raw frame.
tips_df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# There are some clear challenges with this dataset.

In [6]:
## Detecting NaN values
## Undetected NaN values can be very problematic for training and may even cause the training process to fail.

In [5]:
# Count the missing values in each column (isna is the same check as isnull).
tips_df.isna().sum(axis=0)

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [None]:
# Scan for NaN values along each row.

In [8]:
# Row-wise check: True for every row that holds at least one NaN.
tips_df.isna().any(axis='columns')

0      False
1      False
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242    False
243    False
Length: 244, dtype: bool

In [9]:
# Column-wise check: True for every column that holds at least one NaN.
tips_df.isna().any(axis='index')

total_bill    False
tip           False
sex           False
smoker        False
day           False
time          False
size          False
dtype: bool

In [10]:
# In this dataset there are 4 categorical variables: sex, smoker, day, and time.
# This is problematic because the mathematical models underpinning our ML system only understand numeric inputs.

### Label encoding and one-hot encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# Encoder that maps each distinct label of a column to an integer code.
label_encoding = LabelEncoder()

In [14]:
# Label-encode the four nominal columns, selected by *name* rather than by
# position so the cell keeps working if the column order ever changes.
CATEGORICAL_COLS = ['sex', 'smoker', 'day', 'time']

# One fitted encoder per column: a single shared LabelEncoder is refit on every
# column and only remembers the classes of the last one, so the label <-> code
# mapping of the other columns could not be inspected or inverted afterwards.
label_encoders = {col: LabelEncoder().fit(tips_df[col]) for col in CATEGORICAL_COLS}

# Replace each column as a whole (instead of writing through .iloc) so it takes
# the integer dtype of its codes; positional assignment may try to write the
# integers into the existing categorical columns in place.
for col, encoder in label_encoders.items():
    tips_df[col] = encoder.transform(tips_df[col])

In [15]:
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


In [16]:
# Problem with LabelEncoder:
# Encoding categorical variables that are nominal (the values of the variable can't be ordered; for example,
# gender, days of the week, color, and so on) rather than ordinal
# (the values of the variable can be ordered; for example, rank, size, and so on) creates another complication:
# the integer codes imply an ordering and magnitude (e.g. 0 < 1 < 2) that a model may treat as meaningful.

In [17]:
# Thus we choose one-hot encoding for such cases.

# Please note that we apply one-hot encoding only after performing label encoding.

In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [19]:
# The OneHotEncoder instance is passed as an argument to the ColumnTransformer object, which tells our program what kind of
# transformation we want and which columns to apply it to.

In [22]:
# One-hot encode the four categorical columns, selected by name rather than by
# position so the transformer cannot silently pick the wrong columns if the
# frame's column order changes. remainder='passthrough' keeps the remaining
# columns unchanged and places them after the encoded ones.
oh_encoding = ColumnTransformer(
    [('OneHotEncoding', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])],
    remainder='passthrough',
)

In [24]:
# Fit the transformer on the frame and apply it in one step. The result shown
# below is a plain NumPy array (column names are lost): the one-hot indicator
# columns come first, followed by the passthrough columns.
tips_df_ohe = oh_encoding.fit_transform(tips_df)
tips_df_ohe

array([[ 1.  ,  0.  ,  1.  , ..., 16.99,  1.01,  2.  ],
       [ 0.  ,  1.  ,  1.  , ..., 10.34,  1.66,  3.  ],
       [ 0.  ,  1.  ,  1.  , ..., 21.01,  3.5 ,  3.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 22.67,  2.  ,  2.  ],
       [ 0.  ,  1.  ,  1.  , ..., 17.82,  1.75,  2.  ],
       [ 1.  ,  0.  ,  1.  , ..., 18.78,  3.  ,  2.  ]])

In [25]:
# 13 columns after encoding: 2 (sex) + 2 (smoker) + 4 (day) + 2 (time)
# indicator columns, plus the 3 passthrough columns.
tips_df_ohe.shape

(244, 13)

In [27]:
# For comparison: the frame before one-hot encoding has 7 columns.
tips_df.shape

(244, 7)

#### Dummy variable trap

In [28]:
# One issue remains to be resolved: the dummy variable trap.

In [30]:
# One-hot encoding can cause collinearity in our data (high correlation between variables), because the value of
# one dummy column can always be predicted from the others: for `day`, the fourth column is determined by the
# other three (if the day is not Friday, Saturday, or Sunday, then it has to be Thursday).

### Data Standardization

In [31]:
# Standardization equalizes the range of values across all the columns.

In [32]:
# Min-max scaling: after the transformation, the range of each column becomes [0, 1].

In [33]:
from sklearn.preprocessing import MinMaxScaler

In [34]:
# Rescale every column of the one-hot encoded array to the [0, 1] range.
minmax = MinMaxScaler()
# Keep the min-max result under its own name: a later cell assigns the z-score
# result to `tips_df_std`, which would silently overwrite this array and make
# it impossible to compare the two scalings.
tips_df_minmax = minmax.fit_transform(tips_df_ohe)
tips_df_minmax

array([[1.        , 0.        , 1.        , ..., 0.29157939, 0.00111111,
        0.2       ],
       [0.        , 1.        , 1.        , ..., 0.1522832 , 0.07333333,
        0.4       ],
       [0.        , 1.        , 1.        , ..., 0.3757855 , 0.27777778,
        0.4       ],
       ...,
       [0.        , 1.        , 0.        , ..., 0.41055718, 0.11111111,
        0.2       ],
       [0.        , 1.        , 1.        , ..., 0.30896523, 0.08333333,
        0.2       ],
       [1.        , 0.        , 1.        , ..., 0.32907415, 0.22222222,
        0.2       ]])

In [35]:
# The z-score measures how many standard deviations a value lies from the mean of its group.
# After the transformation, most values are expected to fall in the range [-3, 3].


In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
# Scaler that standardizes each column to zero mean and unit variance (z-scores).
std = StandardScaler()

In [38]:
# Standardize every column of the one-hot encoded array. Note that this also
# rescales the 0/1 indicator columns, which is why they no longer show as
# 0 and 1 in the output below.
tips_df_std = std.fit_transform(tips_df_ohe)
tips_df_std

array([[ 1.34335316e+00, -1.34335316e+00,  7.84789169e-01, ...,
        -3.14711305e-01, -1.43994695e+00, -6.00192629e-01],
       [-7.44405889e-01,  7.44405889e-01,  7.84789169e-01, ...,
        -1.06323531e+00, -9.69205340e-01,  4.53382921e-01],
       [-7.44405889e-01,  7.44405889e-01,  7.84789169e-01, ...,
         1.37779900e-01,  3.63355539e-01,  4.53382921e-01],
       ...,
       [-7.44405889e-01,  7.44405889e-01, -1.27422758e+00, ...,
         3.24629502e-01, -7.22971264e-01, -6.00192629e-01],
       [-7.44405889e-01,  7.44405889e-01,  7.84789169e-01, ...,
        -2.21286504e-01, -9.04025732e-01, -6.00192629e-01],
       [ 1.34335316e+00, -1.34335316e+00,  7.84789169e-01, ...,
        -1.13228903e-01,  1.24660453e-03, -6.00192629e-01]])