In [2]:
import pandas as pd

df = pd.read_csv("flights.csv")
df

Unnamed: 0,year,month,passengers
0,1949,January,112.0
1,1949,February,118.0
2,1949,March,132.0
3,1949,April,129.0
4,1949,May,121.0
...,...,...,...
139,1960,August,606.0
140,1960,September,508.0
141,1960,October,461.0
142,1960,November,390.0


In [3]:
#Dropping the null entries (rows)
df_dropped_rows = df.dropna()
print("After dropping null entries (rows),\n", df_dropped_rows)

After dropping null entries (rows),
      year      month  passengers
0    1949    January       112.0
1    1949   February       118.0
2    1949      March       132.0
3    1949      April       129.0
4    1949        May       121.0
..    ...        ...         ...
139  1960     August       606.0
140  1960  September       508.0
141  1960    October       461.0
142  1960   November       390.0
143  1960   December       432.0

[136 rows x 3 columns]


In [4]:
#Dropping the null entires (col)
df_dropped_cols = df.dropna(axis=1)

print("After dropping null entries (cols),\n", df_dropped_cols)

After dropping null entries (cols),
      year
0    1949
1    1949
2    1949
3    1949
4    1949
..    ...
139  1960
140  1960
141  1960
142  1960
143  1960

[144 rows x 1 columns]


In [8]:
#Filling with Simple Impute - numerical values
from sklearn.impute import SimpleImputer, KNNImputer
imputer_mean= SimpleImputer(strategy='mean')
df_fill_si= df.copy()

df_fill_si[['passengers']] = imputer_mean.fit_transform(df_fill_si[['passengers']])

imputer_mode=SimpleImputer(strategy='most_frequent')
df_fill_si[['month']] = imputer_mode.fit_transform(df_fill_si[['month']])

df[['month']]=df_fill_si[['month']]

df_fill_si.dropna()

Unnamed: 0,year,month,passengers
0,1949,January,112.0
1,1949,February,118.0
2,1949,March,132.0
3,1949,April,129.0
4,1949,May,121.0
...,...,...,...
139,1960,August,606.0
140,1960,September,508.0
141,1960,October,461.0
142,1960,November,390.0


In [9]:
#Filling with KNN Impute
df_fill_knn = df.copy()
knn_imputer = KNNImputer(n_neighbors=4)

df_fill_knn[['passengers']] =knn_imputer.fit_transform(df_fill_knn[['passengers']])

df_fill_knn.dropna()

Unnamed: 0,year,month,passengers
0,1949,January,112.0
1,1949,February,118.0
2,1949,March,132.0
3,1949,April,129.0
4,1949,May,121.0
...,...,...,...
139,1960,August,606.0
140,1960,September,508.0
141,1960,October,461.0
142,1960,November,390.0


In [10]:
#Fill with FFill/BFill
df_ffill = df.fillna(method='ffill')
df_bfill = df.fillna(method='bfill')
df_ffill

  df_ffill = df.fillna(method='ffill')
  df_bfill = df.fillna(method='bfill')


Unnamed: 0,year,month,passengers
0,1949,January,112.0
1,1949,February,118.0
2,1949,March,132.0
3,1949,April,129.0
4,1949,May,121.0
...,...,...,...
139,1960,August,606.0
140,1960,September,508.0
141,1960,October,461.0
142,1960,November,390.0


In [11]:
df_bfill

Unnamed: 0,year,month,passengers
0,1949,January,112.0
1,1949,February,118.0
2,1949,March,132.0
3,1949,April,129.0
4,1949,May,121.0
...,...,...,...
139,1960,August,606.0
140,1960,September,508.0
141,1960,October,461.0
142,1960,November,390.0


In [12]:
#Reading tips dataset for feature scaling, encoding and binning
df_tips = pd.read_csv("tips.csv")
df_tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [13]:
#Using Normalization
from sklearn.preprocessing import MinMaxScaler
minmaxScaler = MinMaxScaler()
scaled_data_mm = df_tips.copy()
scaled_data_mm[['total_bill','tip']] = minmaxScaler.fit_transform(df_tips[['total_bill','tip']])
scaled_data_mm

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,0.291579,0.001111,Female,No,Sun,Dinner,2
1,0.152283,0.073333,Male,No,Sun,Dinner,3
2,0.375786,0.277778,Male,No,Sun,Dinner,3
3,0.431713,0.256667,Male,No,Sun,Dinner,2
4,0.450775,0.290000,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,0.543779,0.546667,Male,No,Sat,Dinner,3
240,0.505027,0.111111,Female,Yes,Sat,Dinner,2
241,0.410557,0.111111,Male,Yes,Sat,Dinner,2
242,0.308965,0.083333,Male,No,Sat,Dinner,2


In [14]:
#Using STandardization
from sklearn.preprocessing import StandardScaler
StdScaler = StandardScaler()

scaled_data_std = df_tips.copy()
scaled_data_std[['total_bill','tip']] = StdScaler.fit_transform(df_tips[['total_bill','tip']])

scaled_data_std

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,-0.314711,-1.439947,Female,No,Sun,Dinner,2
1,-1.063235,-0.969205,Male,No,Sun,Dinner,3
2,0.137780,0.363356,Male,No,Sun,Dinner,3
3,0.438315,0.225754,Male,No,Sun,Dinner,2
4,0.540745,0.443020,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,1.040511,2.115963,Male,No,Sat,Dinner,3
240,0.832275,-0.722971,Female,Yes,Sat,Dinner,2
241,0.324630,-0.722971,Male,Yes,Sat,Dinner,2
242,-0.221287,-0.904026,Male,No,Sat,Dinner,2


In [15]:
#Usin L2 Normalization
from sklearn.preprocessing import Normalizer

l2Norm = Normalizer(norm='l2')
scaled_data_l2 = df_tips.copy()
scaled_data_l2[['total_bill','tip']] = l2Norm.fit_transform(df_tips[['total_bill','tip']])

scaled_data_l2

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,0.998238,0.059342,Female,No,Sun,Dinner,2
1,0.987357,0.158512,Male,No,Sun,Dinner,3
2,0.986407,0.164323,Male,No,Sun,Dinner,3
3,0.990372,0.138435,Male,No,Sun,Dinner,2
4,0.989395,0.145251,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,0.979834,0.199815,Male,No,Sat,Dinner,3
240,0.997304,0.073385,Female,Yes,Sat,Dinner,2
241,0.996131,0.087881,Male,Yes,Sat,Dinner,2
242,0.995213,0.097734,Male,No,Sat,Dinner,2


In [16]:
from sklearn.preprocessing import LabelEncoder
lb_encoder = LabelEncoder()

lb_encoded_df = df_tips.copy()
lb_encoded_df['sex'] = lb_encoder.fit_transform(df_tips['sex'])
lb_encoded_df['day'] = lb_encoder.fit_transform(df_tips['day'])
lb_encoded_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,No,2,Dinner,2
1,10.34,1.66,1,No,2,Dinner,3
2,21.01,3.5,1,No,2,Dinner,3
3,23.68,3.31,1,No,2,Dinner,2
4,24.59,3.61,0,No,2,Dinner,4


In [17]:
from sklearn.preprocessing import OrdinalEncoder
od_encoder = OrdinalEncoder(categories=[['Lunch','Dinner']])
od_encoded_df = df_tips.copy()
od_encoded_df[['time']] = od_encoder.fit_transform(df_tips[['time']])

od_encoded_df


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,1.0,2
1,10.34,1.66,Male,No,Sun,1.0,3
2,21.01,3.50,Male,No,Sun,1.0,3
3,23.68,3.31,Male,No,Sun,1.0,2
4,24.59,3.61,Female,No,Sun,1.0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,1.0,3
240,27.18,2.00,Female,Yes,Sat,1.0,2
241,22.67,2.00,Male,Yes,Sat,1.0,2
242,17.82,1.75,Male,No,Sat,1.0,2


In [18]:
#one hot encoding
pd.get_dummies(df['month'])

Unnamed: 0,April,August,December,February,January,July,June,March,May,November,October,September
0,False,False,False,False,True,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
139,False,True,False,False,False,False,False,False,False,False,False,False
140,False,False,False,False,False,False,False,False,False,False,False,True
141,False,False,False,False,False,False,False,False,False,False,True,False
142,False,False,False,False,False,False,False,False,False,True,False,False


In [19]:
#Equal width binning
df['relative_period'] = pd.cut(df['year'],bins=4,labels=['Long Long Ago', 'Long ago','Somewhat recent','Very Recent']) 

In [20]:
df

Unnamed: 0,year,month,passengers,relative_period
0,1949,January,112.0,Long Long Ago
1,1949,February,118.0,Long Long Ago
2,1949,March,132.0,Long Long Ago
3,1949,April,129.0,Long Long Ago
4,1949,May,121.0,Long Long Ago
...,...,...,...,...
139,1960,August,606.0,Very Recent
140,1960,September,508.0,Very Recent
141,1960,October,461.0,Very Recent
142,1960,November,390.0,Very Recent
