Importing the Libraries

In [1]:
# To work with arrays and mathematical operations we use NumPy
import numpy as np
# To plot the data on graphs,charts,etc we use Matplotlib
import matplotlib.pyplot as plt
# To work with dataframes, import dataset, matrix of features and Dependent Variable vector we use Pandas
import pandas as pd

Importing The Dataset

In [2]:
# Read the CSV file and store its contents in a DataFrame.
# The path defaults to the author's local copy of Data.csv. To run the notebook on another
# machine, set the DATA_CSV environment variable to the file's location instead of editing this cell.
import os

DATA_CSV = os.environ.get(
    "DATA_CSV",
    r'C:\Users\avira\Documents\Restart_Skills_v2025\GitHub\Machine_Learning_AI_Python_ChatGPT_dataset_venv\datasets\Part 1 - Data Preprocessing\Data.csv',
)
dataset = pd.read_csv(DATA_CSV)

X = dataset.iloc[:, :-1].values  # Features: every column except the last one
y = dataset.iloc[:, -1].values  # Target variable: the last column only

In [3]:
# Inspect the feature matrix straight after loading; missing entries show up as nan.
print("X:- ", X)

X:-  [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# Inspect the target vector (the dataset's last column) while it still holds the raw 'Yes'/'No' labels.
print("Y:- ", y)

Y:-  ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Taking Care of Missing Data

In [5]:
# To handle missing data we use SimpleImputer from sklearn.impute. In our case, it will replace the missing values with the mean of the column.
# Importing SimpleImputer class
from sklearn.impute import SimpleImputer

In [6]:
# Build the imputer: cells holding np.nan count as missing and are replaced by the mean of their column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# A mean only exists for numerical data, so the imputer is restricted to columns 1 and 2
# (column 0 holds text). fit_transform learns the column means and fills the gaps in a
# single call; the result is written back into the same columns of X.
numeric_columns = X[:, 1:3]
X[:, 1:3] = imputer.fit_transform(numeric_columns)

In [7]:
# Re-inspect X to confirm that columns 1 and 2 no longer contain nan after the imputer was applied.
print("X after imputer:- ", X)

X after imputer:-  [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Encoding the Categorical Data

Encoding the Independent Variable


In [None]:
# To handle categorical data we use OneHotEncoder from sklearn.preprocessing
# Importing ColumnTransformer class and OneHotEncoder class
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Describe the single transformation step as (name, transformer, column indices):
# apply OneHotEncoder to column 0 of X, the only categorical feature column.
country_encoder = ("encoder", OneHotEncoder(), [0])

# remainder="passthrough" keeps every column not listed above unchanged instead of dropping it.
# The encoded column is replaced by one binary column per category, placed ahead of the passthrough columns.
ct = ColumnTransformer(transformers=[country_encoder], remainder="passthrough")
X = ct.fit_transform(X)

In [10]:
# The model is trained on numpy arrays, so make sure X is a dense 2-D numpy array.
# ColumnTransformer can return a SciPy sparse matrix when the stacked output is mostly zeros
# (density below its sparse_threshold). np.array() does not densify such a matrix; it only wraps
# it in a 0-d object array, so a sparse result is converted explicitly with toarray().
X = X.toarray() if hasattr(X, "toarray") else np.array(X)

In [11]:
# Inspect X after one-hot encoding: the text column is gone and three indicator columns now come first.
print("X after ColumnTransformer:- ", X)

X after ColumnTransformer:-  [[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


Encoding the Dependent Variable

In [12]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Create a LabelEncoder instance to encode the dependent variable (y).
temp = LabelEncoder()

# y is a 1-D array, so it can be passed to fit_transform directly.
# No extra np.array conversion is applied afterwards; the encoded result prints as a flat array of integer codes.
y = temp.fit_transform(y)


In [14]:
# Inspect y after label encoding; the 'No'/'Yes' labels have become the integer codes 0/1.
print("y after LabelEncoder:- ", y)

y after LabelEncoder:-  [0 1 0 0 1 1 0 1 0 1]


Splitting the Dataset into Training and Testing set

In [15]:
# Importing the function to split the data into training and testing sets.

from sklearn.model_selection import train_test_split

In [16]:
# train_test_split receives:
#   X            - the matrix of features
#   y            - the dependent variable vector
#   test_size    - fraction of the rows held out for testing (typically 0.2 to 0.4);
#                  0.2 keeps 80% of the data for training and 20% for testing
#   random_state - integer seed for the random number generator, so the split is reproducible
TEST_SIZE = 0.2
RANDOM_STATE = 1

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)


In [17]:
# Training features: the rows kept for fitting (8 of the 10 rows with test_size=0.2).
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [18]:
# Test features: the rows held out from training (2 of the 10 rows with test_size=0.2).
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [19]:
# Encoded target values returned by the split for the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [21]:
# Encoded target values returned by the split for the held-out test rows.
print(y_test)

[0 1]


Feature Scaling

In [None]:

# Feature Scaling
# Importing StandardScaler class from sklearn.preprocessing

from sklearn.preprocessing import StandardScaler

In [None]:

# Feature scaling is applied to every column except the first three, which are the
# one-hot encoded columns; the slice therefore starts at column index 3.
numeric_cols = slice(3, None)

# Create the StandardScaler, fit it on the training set, and overwrite the selected
# training columns with their scaled values.
sc = StandardScaler()
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols])