In [25]:
# check if directory is correct or not
import os
print(os.listdir())

['.ipynb_checkpoints', 'CleanupData.csv', 'data preprocessing.ipynb']


In [3]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# plotting is visible
%matplotlib inline

In [7]:
# import dataset using pandas
data = pd.read_csv('CleanupData.csv')

In [8]:
data

Unnamed: 0,State,Age,Pocket Money,Course Purchased
0,Delhi,34.0,7200.0,No
1,Mumbai,17.0,4800.0,Yes
2,Banglore,20.0,5400.0,No
3,Mumbai,28.0,6100.0,No
4,Banglore,30.0,,Yes
5,Delhi,25.0,5800.0,Yes
6,Mumbai,,5200.0,No
7,Delhi,38.0,7900.0,Yes
8,Banglore,40.0,8300.0,No
9,Delhi,27.0,6700.0,Yes


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   State             10 non-null     object 
 1   Age               9 non-null      float64
 2   Pocket Money      9 non-null      float64
 3   Course Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [9]:
# get independent and dependent matrix out of dataset
X = data.iloc[: , :-1].values

In [10]:
# matrix of independent variable
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, nan],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', nan, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [11]:
# matrix of dependent variable
y = data.iloc[: , 3].values

In [12]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [13]:
# how to handle missing values

In [18]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [19]:
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, 6377.777777777777],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', 28.77777777777778, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [20]:
# handle categorical data in our data
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
X

array([[1, 34.0, 7200.0],
       [2, 17.0, 4800.0],
       [0, 20.0, 5400.0],
       [2, 28.0, 6100.0],
       [0, 30.0, 6377.777777777777],
       [1, 25.0, 5800.0],
       [2, 28.77777777777778, 5200.0],
       [1, 38.0, 7900.0],
       [0, 40.0, 8300.0],
       [1, 27.0, 6700.0]], dtype=object)

In [24]:
# create dummy matrix for categorical dataset
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)
np.set_printoptions(suppress=True)
X

array([[0.0, 1.0, 0.0, 1.0, 0.0, 34.0, 7200.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 17.0, 4800.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 20.0, 5400.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 28.0, 6100.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 30.0, 6377.777777777777],
       [0.0, 1.0, 0.0, 1.0, 0.0, 25.0, 5800.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 28.77777777777778, 5200.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 38.0, 7900.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 40.0, 8300.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 27.0, 6700.0]], dtype=object)

In [27]:
# handle y matrix for categorical data
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [30]:
# prepare test and training data set
# split out data into 2 parts, one will be training, and other will be test

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [31]:
X_train

array([[0.0, 1.0, 0.0, 0.0, 1.0, 28.77777777777778, 5200.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 34.0, 7200.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 40.0, 8300.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 27.0, 6700.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 38.0, 7900.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 20.0, 5400.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 28.0, 6100.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 25.0, 5800.0]], dtype=object)

In [32]:
y_train

array([0, 0, 0, 1, 1, 0, 0, 1])

In [33]:
y_test

array([1, 1])