In [1]:
import csv

In [2]:
# Toy customer table used by the rest of the notebook: one header row followed
# by ten records. None marks a missing cell (one Salary and one Age).
data = [
    list(record)
    for record in (
        ("Country", "Age", "Salary", "Purchased"),  # header row
        ("France", 44, 72000, "No"),
        ("Spain", 27, 48000, "Yes"),
        ("Germany", 30, 54000, "No"),
        ("Spain", 38, 61000, "No"),
        ("Germany", 40, None, "Yes"),  # Salary is missing
        ("France", 35, 58000, "Yes"),
        ("Spain", None, 52000, "No"),  # Age is missing
        ("France", 48, 79000, "Yes"),
        ("Germany", 50, 83000, "No"),
        ("France", 37, 67000, "Yes"),
    )
]


In [1]:
# Path of the CSV file that the following cells write to.
filename = "data.csv"

In [8]:
# Dump the table to disk as CSV.
# newline='' lets the csv module control line endings itself. The explicit
# UTF-8 encoding keeps the file independent of the platform's locale default
# and matches what pandas.read_csv assumes when no encoding is passed.
with open(filename, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(data)  # None cells are written as empty fields

In [9]:
# Confirm which file the table was saved to.
print(f"Data written to {filename}")

Data written to data.csv


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the CSV written earlier in the notebook. Reusing `filename` keeps the
# writer and the reader pointing at the same path if it is ever changed.
dataset = pd.read_csv(filename)

In [4]:
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [6]:
# Feature matrix: every row, every column except the last one.
# to_numpy() is the accessor pandas recommends over .values; it always
# returns a NumPy ndarray.
X = dataset.iloc[:, :-1].to_numpy()  # iloc[rows, columns]

In [7]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [11]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [8]:
# Target vector: only the final column is needed.
# to_numpy() is the accessor pandas recommends over .values; it always
# returns a NumPy ndarray.
y = dataset.iloc[:, -1].to_numpy()

In [9]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [10]:
from sklearn.impute import SimpleImputer

In [12]:
# Imputer configured to replace NaN entries using the 'mean' strategy.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='mean',
)

In [13]:
# Fit the imputer on columns 1 and 2 of X (the numeric columns).
# NOTE(review): the fit uses every row, before the train/test split made
# later in the notebook. Confirm that is intended.
imputer.fit(X[:, 1:3])

In [14]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [15]:
# Overwrite the two numeric columns of X in place with their imputed values.
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [16]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [18]:
# One-hot encode column 0 and pass the remaining columns through unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [19]:
# Fit the column transformer and replace X with the encoded matrix.
# NOTE(review): X is rebound here, so running this cell a second time would
# feed the already-encoded matrix back through the encoder.
X = np.array(ct.fit_transform(X))

In [20]:
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
# Encoder used to turn the target labels into integer codes.
le = LabelEncoder()

In [23]:
# Replace the string labels in y with integer codes
# (the displayed result shows 'No' as 0 and 'Yes' as 1).
y = le.fit_transform(y)

In [24]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [25]:
X,y

(array([[1.0, 0.0, 0.0, 44.0, 72000.0],
        [0.0, 0.0, 1.0, 27.0, 48000.0],
        [0.0, 1.0, 0.0, 30.0, 54000.0],
        [0.0, 0.0, 1.0, 38.0, 61000.0],
        [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
        [1.0, 0.0, 0.0, 35.0, 58000.0],
        [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
        [1.0, 0.0, 0.0, 48.0, 79000.0],
        [0.0, 1.0, 0.0, 50.0, 83000.0],
        [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object),
 array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1]))

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the rows as a test set; random_state=1 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [28]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [29]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
