In [1]:
# sklearn is a package which contains methods for data pre processing

In [2]:
import pandas as pd 

In [3]:
import numpy as np

In [4]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder , OneHotEncoder

In [5]:
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler , StandardScaler

In [6]:
# Handle Missing Values
from sklearn.impute import SimpleImputer

In [7]:
# Train Test Split
from sklearn.model_selection import train_test_split

In [8]:
data = {
    'gender' : ['m','f','m','m','m','f','m','m','m','f'],
    'country' : ['Ind','Aus','China','Aus','Ind','Aus','China','USA','Ind','China'],
    'age' : [20,25,26,None,56,65,19,47,24,20],
    'salary' : [57000,45000,56400,32000,78000,64000,None,40000,34000,45000]
}

In [9]:
df = pd.DataFrame(data)

In [10]:
df

Unnamed: 0,gender,country,age,salary
0,m,Ind,20.0,57000.0
1,f,Aus,25.0,45000.0
2,m,China,26.0,56400.0
3,m,Aus,,32000.0
4,m,Ind,56.0,78000.0
5,f,Aus,65.0,64000.0
6,m,China,19.0,
7,m,USA,47.0,40000.0
8,m,Ind,24.0,34000.0
9,f,China,20.0,45000.0


Handling missing values

In [11]:
# na - not available
# nan - not a number
pd.isna(df).sum()            #to check if there is None in our data or not

gender     0
country    0
age        1
salary     1
dtype: int64

In [12]:
df.fillna(0)

Unnamed: 0,gender,country,age,salary
0,m,Ind,20.0,57000.0
1,f,Aus,25.0,45000.0
2,m,China,26.0,56400.0
3,m,Aus,0.0,32000.0
4,m,Ind,56.0,78000.0
5,f,Aus,65.0,64000.0
6,m,China,19.0,0.0
7,m,USA,47.0,40000.0
8,m,Ind,24.0,34000.0
9,f,China,20.0,45000.0


In [13]:
np.nan

nan

In [14]:
imputer = SimpleImputer(missing_values=np.nan ,strategy = 'mean')    #call the imputer function/method itself

In [15]:
imputer_fit = imputer.fit(df.iloc[:,2:3])    #2 to 3 index          #fit the method 

In [16]:
imputer_fit.transform(df.iloc[:,2:3])               #apply it to our data frame

array([[20.        ],
       [25.        ],
       [26.        ],
       [33.55555556],
       [56.        ],
       [65.        ],
       [19.        ],
       [47.        ],
       [24.        ],
       [20.        ]])

In [17]:
df.iloc[:,2:3] = imputer.fit_transform(df.iloc[:,2:3])

In [18]:
df.iloc[:,3:] = imputer.fit_transform(df.iloc[:,3:])

In [19]:
df

Unnamed: 0,gender,country,age,salary
0,m,Ind,20.0,57000.0
1,f,Aus,25.0,45000.0
2,m,China,26.0,56400.0
3,m,Aus,33.555556,32000.0
4,m,Ind,56.0,78000.0
5,f,Aus,65.0,64000.0
6,m,China,19.0,50155.555556
7,m,USA,47.0,40000.0
8,m,Ind,24.0,34000.0
9,f,China,20.0,45000.0


In [20]:
df['age'].values.reshape(-1,1)

array([[20.        ],
       [25.        ],
       [26.        ],
       [33.55555556],
       [56.        ],
       [65.        ],
       [19.        ],
       [47.        ],
       [24.        ],
       [20.        ]])

In [21]:
# df.dropna()       #drops the na data

Label Encoding

In [22]:
df.head()

Unnamed: 0,gender,country,age,salary
0,m,Ind,20.0,57000.0
1,f,Aus,25.0,45000.0
2,m,China,26.0,56400.0
3,m,Aus,33.555556,32000.0
4,m,Ind,56.0,78000.0


In [23]:
label = LabelEncoder()          #calling the method
df['gender'] = label.fit_transform(df['gender'])             #applying the method

In [24]:
df.head()              # m =1 and f = 0

Unnamed: 0,gender,country,age,salary
0,1,Ind,20.0,57000.0
1,0,Aus,25.0,45000.0
2,1,China,26.0,56400.0
3,1,Aus,33.555556,32000.0
4,1,Ind,56.0,78000.0


In [25]:
country = label.fit_transform(df['country'])

In [26]:
country
# Ind - 2
# Aus - 0
# China -1 
# USA - 3 
#by default it assigns values alphabetically 

array([2, 0, 1, 0, 2, 0, 1, 3, 2, 1])

In [27]:
onehot = OneHotEncoder()
country = onehot.fit_transform(country.reshape(-1,1))     #if you want to make a sparse matrix 

In [28]:
country

<10x4 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [29]:
country.toarray()                #sparse matrix of 10 x 4 
                                # 1 means value and 0 means no value

array([[0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.]])

In [30]:
country.shape

(10, 4)

In [31]:
country.ndim

2

Feature scaling

In [32]:
# MinmaxScaler - Normalization
# StandardScaler - Standardization

In [33]:
df

Unnamed: 0,gender,country,age,salary
0,1,Ind,20.0,57000.0
1,0,Aus,25.0,45000.0
2,1,China,26.0,56400.0
3,1,Aus,33.555556,32000.0
4,1,Ind,56.0,78000.0
5,0,Aus,65.0,64000.0
6,1,China,19.0,50155.555556
7,1,USA,47.0,40000.0
8,1,Ind,24.0,34000.0
9,0,China,20.0,45000.0


In [34]:
df['age'].std()      #variation in our data

16.580518093440674

In [35]:
# for item in df['age']:
#     print(item)

In [36]:
for i in range(len(df)):
    df['age'].iloc[i] = df['age'].iloc[i] + 4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [37]:
df

Unnamed: 0,gender,country,age,salary
0,1,Ind,24.0,57000.0
1,0,Aus,29.0,45000.0
2,1,China,30.0,56400.0
3,1,Aus,37.555556,32000.0
4,1,Ind,60.0,78000.0
5,0,Aus,69.0,64000.0
6,1,China,23.0,50155.555556
7,1,USA,51.0,40000.0
8,1,Ind,28.0,34000.0
9,0,China,24.0,45000.0


In [38]:
df['age'].std() 

16.580518093440674

In [39]:
minmax = MinMaxScaler()
M = minmax.fit_transform(df.iloc[:,2:3])           #Scaled down our data

In [40]:
M      #data comes out to be between 0 and 1 both included

array([[0.02173913],
       [0.13043478],
       [0.15217391],
       [0.31642512],
       [0.80434783],
       [1.        ],
       [0.        ],
       [0.60869565],
       [0.10869565],
       [0.02173913]])

In [41]:
sc =  StandardScaler()
S = sc.fit_transform(df.iloc[:,2:3])

In [42]:
S

array([[-0.86178309],
       [-0.54391228],
       [-0.48033812],
       [ 0.        ],
       [ 1.42688676],
       [ 1.99905422],
       [-0.92535726],
       [ 0.8547193 ],
       [-0.60748644],
       [-0.86178309]])

Train Test Split

In [43]:
df

Unnamed: 0,gender,country,age,salary
0,1,Ind,24.0,57000.0
1,0,Aus,29.0,45000.0
2,1,China,30.0,56400.0
3,1,Aus,37.555556,32000.0
4,1,Ind,60.0,78000.0
5,0,Aus,69.0,64000.0
6,1,China,23.0,50155.555556
7,1,USA,51.0,40000.0
8,1,Ind,28.0,34000.0
9,0,China,24.0,45000.0


In [44]:
X = df[['gender','country','age']]
y = df['salary']

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [49]:
x_train.shape

(7, 3)

In [50]:
x_test.shape

(3, 3)

In [54]:
y_train.shape

(7,)

In [55]:
y_test.shape

(3,)

In [56]:
x_train

Unnamed: 0,gender,country,age
8,1,Ind,28.0
3,1,Aus,37.555556
6,1,China,23.0
0,1,Ind,24.0
2,1,China,30.0
9,0,China,24.0
1,0,Aus,29.0
