<a href="https://colab.research.google.com/github/DevanshV03/Machine-Learning/blob/main/Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Libraries


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Importing the dataset


In [4]:
# Read the CSV file into a DataFrame.
dataset = pd.read_csv("Data.csv")

# Independent variables: every row, every column except the last one
# (the upper bound of the slice ":-1" is excluded).
features_frame = dataset.iloc[:, :-1]
# Dependent variable: every row, last column only.
target_column = dataset.iloc[:, -1]

# .values extracts the underlying array from the selected columns.
X = features_frame.values
Y = target_column.values

# Show the feature matrix and the target vector, separated by a divider line.
print(X, "-------------------------------------------------", Y, sep="\n")

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
-------------------------------------------------
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Taking Care of missing data

In [5]:
# SimpleImputer from scikit-learn fills in missing values.
from sklearn.impute import SimpleImputer

# Entries equal to NaN are treated as missing and replaced by the mean of
# their column.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Columns 1 and 2 hold the numeric values (the slice 1:3 excludes index 3).
# fit_transform learns the column means and fills the gaps in a single call;
# the result is written back into the same columns of X.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Encoding categorical data


In [13]:
# One-hot encode the categorical feature in column 0 and keep the remaining
# columns unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# transformers: list of (name, transformer, columns) -> apply OneHotEncoder to
#   column 0, producing one 0/1 column per category.
# remainder="passthrough": columns that are not encoded are kept as they are
#   and appended after the encoded columns.
# sparse_threshold=0: always return a dense array. With the default threshold
#   the result may be a SciPy sparse matrix when the one-hot block is large
#   and mostly zeros, and np.array() on a sparse matrix does not produce a
#   usable 2-D array.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
X_transformed = ct.fit_transform(X)

# NOTE: X is overwritten here, so re-running only this cell would try to
# encode column 0 of the already-encoded matrix. Re-run from the cell that
# loads the dataset instead.
X = np.array(X_transformed)
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [15]:
# Encode the dependent variable (Y) as integer class codes using LabelEncoder.
from sklearn.preprocessing import LabelEncoder
# LabelEncoder is intended for target labels: it maps each distinct class to an
# integer from 0 to n_classes-1. It works for any number of classes, not only two.
le = LabelEncoder()
# fit_transform learns the classes present in Y and returns their integer codes.
Y = le.fit_transform(Y)
print(Y)

[0 1 0 0 1 1 0 1 0 1]


Splitting the dataset into training and test sets

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. A fixed random_state makes the
# shuffle reproducible; with random_state=None every run produces a different
# train/test split, which is useful for checking how stable a model is across
# splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [17]:
# Feature rows assigned to the training set by train_test_split.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [18]:
# Feature rows held out as the test set by train_test_split.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [19]:
# Encoded target values that belong to the training rows.
print(Y_train)

[0 1 0 0 1 1 0 1]


In [20]:
# Encoded target values that belong to the test rows.
print(Y_test)

[0 1]


Applying feature scaling on the independent variables (features)

In [21]:
# StandardScaler is the scikit-learn class that applies standardization
# (each column is rescaled to zero mean and unit variance).
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Only the columns from index 3 onward are scaled; columns 0-2 are the
# one-hot encoded columns and are left untouched.
numeric_cols = slice(3, None)

# Learn the scaling statistics from the training rows and scale them.
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols])
# For the test rows only transform is used, so they are scaled with the
# statistics learned from the training set rather than a new scale derived
# from the test set.
X_test[:, numeric_cols] = sc.transform(X_test[:, numeric_cols])

In [25]:
# Training features after columns 3 onward were standardized.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [24]:
# Test features after columns 3 onward were scaled with the training-set scaler.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
