## One hot encoding and Feature Scaling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
#To suppress values from exponentials to human readable format
np.set_printoptions(suppress=True)
#To show all the columns
pd.set_option('display.max_columns',None)

In [3]:
# For scaling the daataset using ( MinMaxScaler,StandardScaler,RobustScaler,LabelEncoder)
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler,LabelEncoder

### Loading the dataset

In [4]:
data = sns.load_dataset('titanic')
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [6]:
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


### One hot encoding

##### Converting the categorical column to machine readable format as 0 and 1.  This method will change the variables on a column to different columns . 

##### for example , in sex column first column shows 1 for male and 0 for female and the second column shows 1 for female and 0 for male 

##### drop_first=True means droping 1st column from the new columns. since 0 for female on the first column means that this data gives a record of a male

In [7]:
data['SEX'] = pd.get_dummies(data=data.sex,drop_first=True)

In [8]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,SEX
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,1
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False,0
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True,0


### Converting the Entire dataframe 

##### here we are converting the entire column rather than converting a single column to identify whats happening after coverting the entire column using one hot encoding method

In [9]:
data_No_Dummies = pd.get_dummies(data=data)

In [10]:
data_No_Dummies

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,SEX,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third,who_child,who_man,who_woman,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
1,1,1,38.0,1,0,71.2833,False,False,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,53.1000,False,False,0,1,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1
6,0,1,54.0,0,0,51.8625,True,True,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0
10,1,3,4.0,1,1,16.7000,False,False,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
11,1,1,58.0,0,0,26.5500,False,True,0,1,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,47.0,1,1,52.5542,False,False,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
872,0,1,33.0,0,0,5.0000,True,True,1,0,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0
879,1,1,56.0,0,1,83.1583,False,False,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1
887,1,1,19.0,0,0,30.0000,False,True,0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1


In [11]:
data_No_Dummies.shape

(181, 32)

##### We can find that at the previous titanic dataset there are only 15 columns now it is 32 columns

In [12]:
data_with_Dummies = pd.get_dummies(data=data,drop_first=True)

In [13]:
data_with_Dummies.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,SEX,sex_male,embarked_Q,embarked_S,class_Second,class_Third,who_man,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Queenstown,embark_town_Southampton,alive_yes
1,1,1,38.0,1,0,71.2833,False,False,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1
3,1,1,35.0,1,0,53.1,False,False,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1
6,0,1,54.0,0,0,51.8625,True,True,1,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0
10,1,3,4.0,1,1,16.7,False,False,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,1
11,1,1,58.0,0,0,26.55,False,True,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1


##### Here we could find that we removed 1 newly added column from every previous columns who made dummies according to their variables (now columns in 25 , reduced 7 columns can also find that 7 columns participated in this field)

In [14]:
data_with_Dummies.shape

(181, 25)

### Label encoder

In [15]:
Here we are encoding a colum

SyntaxError: invalid syntax (486635987.py, line 1)

In [None]:
label = LabelEncoder()

In [None]:
data

In [None]:
data['EMBARKED'] = label.fit_transform(data.embarked)

In [None]:
data

In [None]:
data.embarked.unique()

##### Here we can find that the values 0,1,2 assigned according to the alphabetic order

In [None]:
data.EMBARKED.unique()

### Data Scaling or Data Normalising

### Mimax Scaler

In [None]:
minmax = MinMaxScaler()