# CHAPTER-1 DATA PREPROCESSING

## Loading The Data

In [19]:
import numpy as np
import pandas as pd
dataset = pd.read_csv('testing.csv')
dataset.replace(r'^\s*$', np.nan, regex=True, inplace=True)
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,82000.0,No
9,France,37.0,67000.0,Yes


In [20]:
X=dataset.iloc[:, :-1].values
y=dataset.iloc[:, -1].values

In [21]:
X

array([['France', '44', '72000'],
       [' Spain', '27', '48000'],
       ['Germany', '30', '54000'],
       [' Spain', '38', '61000'],
       ['Germany', '40', nan],
       [' France', '35', '58000'],
       [' Spain', nan, '52000'],
       [' France', '48', '79000'],
       [' Germany', '50', '82000'],
       [' France', '37', '67000']], dtype=object)

In [5]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Missing Data

#### We import one library "sklearn" which contains all the tools of machine learning

In [22]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan , strategy ='mean')

In [23]:
imputer.fit(X[:, 1:3])

In [24]:
X[:, 1:3]=imputer.transform(X[: ,1:3])

In [25]:
X

array([['France', 44.0, 72000.0],
       [' Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       [' Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63666.666666666664],
       [' France', 35.0, 58000.0],
       [' Spain', 38.77777777777778, 52000.0],
       [' France', 48.0, 79000.0],
       [' Germany', 50.0, 82000.0],
       [' France', 37.0, 67000.0]], dtype=object)

# Encoding Categorical Data

## independent Variables


In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [29]:
ct = ColumnTransformer(transformers =[( 'encoder' , OneHotEncoder() , [0] )],remainder= 'passthrough')

In [36]:
X = np.array(ct.fit_transform(X))

In [35]:
X

array([[0.0, 0.0, 0.0, 1.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 27.0, 48000.0],
       [0.0, 0.0, 0.0, 0.0, 1.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 38.0, 61000.0],
       [0.0, 0.0, 0.0, 0.0, 1.0, 40.0, 63666.666666666664],
       [1.0, 0.0, 0.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 50.0, 82000.0],
       [1.0, 0.0, 0.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

**Here we can observe that the Alphabetic Word values are converted to Numeric values( France , Spain , Germany )**

# Converting Dependent Variable

In [40]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y=le.fit_transform(y)

In [41]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

**Usually the model always gives more weithage to higher value let say row 1 in X have 72000 thus will have a higher weithage 
but in a good model the weithage should be distributed equally**

### Thus for doing this we use the concept of FEATURE SCALING

## DATASET SPLIT

In [45]:
from sklearn.model_selection import train_test_split

In [48]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

**Here the train values are used to train and test for test for X & y respectively where as test_size reffers to the amount
of testing data to be used i.e 0.2 = 20% random_state shufffles the data randomly**

In [49]:
X_train

array([[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 44.0, 72000.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 37.0, 67000.0],
       [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 40.0, 63666.666666666664],
       [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 38.77777777777778, 52000.0]],
      dtype=object)

In [50]:
X_test

array([[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 50.0, 82000.0],
       [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 27.0, 48000.0]], dtype=object)

In [52]:
y_train

array([1, 0, 1, 0, 1, 1, 0, 0])

In [53]:
y_test

array([0, 1])

# Feature Scaling

In [54]:
from sklearn.preprocessing import StandardScaler

In [55]:
sc = StandardScaler()

In [57]:
X_train[:, 3:]=sc.fit_transform(X_train[: ,3:])

In [58]:
X_test[:, 3:]=sc.fit_transform(X_test[: ,3:])

In [59]:
X_train

array([[0.0, 1.0, 0.0, -0.5773502691896258, -0.37796447300922725,
        -0.5773502691896258, -0.7529426005471074, -0.6244571138007717],
       [1.0, 0.0, 0.0, -0.5773502691896258, 2.645751311064591,
        -0.5773502691896258, 1.008453807952985, 1.0147428099262543],
       [0.0, 1.0, 0.0, -0.5773502691896258, -0.37796447300922725,
        -0.5773502691896258, 1.7912966561752484, 1.8343427717897671],
       [1.0, 0.0, 0.0, -0.5773502691896258, -0.37796447300922725,
        1.7320508075688774, -1.7314961608249366, -1.0927999491513505],
       [0.0, 1.0, 0.0, -0.5773502691896258, -0.37796447300922725,
        -0.5773502691896258, -0.3615211764359758, 0.4293142657380307],
       [1.0, 0.0, 0.0, -0.5773502691896258, -0.37796447300922725,
        1.7320508075688774, 0.22561095973072173, 0.03902856961254803],
       [1.0, 0.0, 0.0, 1.7320508075688774, -0.37796447300922725,
        -0.5773502691896258, -0.16581046438040992, -0.2731999872878376],
       [1.0, 0.0, 0.0, 1.7320508075688774, -0

In [60]:
X_test

array([[1.0, 0.0, 1.0, -1.0, 0.0, 0.0, 1.0, 1.0],
       [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, -1.0, -1.0]], dtype=object)

**Thus the models are equally Scaled**