In [1]:

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# Step 1: Load the raw lab data from the CSV file and echo the whole table
data_set = pd.read_csv('DataLab1.csv')
# One print call, newline-separated: produces the same two-part output
# (caption line, then the frame's text rendering).
print("Original dataset:", data_set, sep="\n")

# Step 2: Separate the independent variables (features) from the dependent variable (target)
# Features: every column except the last one.
x = data_set.iloc[:, :-1].values
# Target: the last column ("Purchased" in the table printed above). It is
# addressed from the end (-1) so the selection stays on the target no matter
# how many feature columns precede it; a fixed position of 3 lands on
# "Salary", which is a feature, not the label.
y = data_set.iloc[:, -1].values

# Step 3: Calculate the median of each numeric feature column and use it to replace that column's missing values
# Column 0 of x is the categorical column; every column after it is numeric
# in the table printed above, so the slice runs to the end (1:) instead of
# stopping at a fixed index. A fixed stop of 3 skips the last numeric column
# ("Salary"), which also contains NaN entries, and leaves them unfilled.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# fit_transform learns the per-column medians and fills the gaps in one pass;
# `imputer` is left fitted afterwards, as before.
x[:, 1:] = imputer.fit_transform(x[:, 1:])

# Step 4: Encode the categorical data
# One-hot encode column 0 of x; all other columns pass through unchanged.
ct = ColumnTransformer(
    [('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
encoded_features = ct.fit_transform(x)
# Cast the combined result to a plain float array.
x = np.array(encoded_features, dtype=float)

# Replace the target's values with integer class codes.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

# Step 5: Split into training and test sets, holding out 30% of the rows for testing
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# Step 6: Apply feature scaling
# The scaler is fitted on the training rows only; the test rows are then
# transformed with that same fitted scaler.
sc_X = StandardScaler()
X_train, X_test = sc_X.fit_transform(X_train), sc_X.transform(X_test)

# Show the first rows of the scaled training features next to their target values
preprocessed_data = pd.DataFrame(X_train)
preprocessed_data['Purchased'] = y_train
preview_rows = preprocessed_data.head()
# Caption (with its leading blank line) and preview, emitted by a single print call.
print("\nPreprocessed dataset:", preview_rows, sep="\n")


Original dataset:
    Country   Age   Tall   Salary Purchased
0    France  44.0  187.0  72000.0        No
1     Spain  27.0  166.0  48000.0       Yes
2   Germany  30.0  169.0  54000.0        No
3        UK  57.0  171.0  73000.0       Yes
4   Germany  40.0  176.0      NaN       Yes
5    France  35.0    NaN  58000.0       Yes
6     Spain   NaN  159.0  52000.0        No
7    France  48.0  197.0  79000.0       Yes
8   Germany  50.0  165.0  83000.0        No
9    France  37.0  174.0  67000.0       Yes
10    Spain  38.0    NaN  61000.0        No
11       UK  41.0  167.0  71000.0        No
12       UK  35.0  178.0  72000.0       Yes
13  Germany  46.0  169.0  77000.0        No
14    Spain  32.0  170.0      NaN       Yes
15       UK  38.0  183.0  64000.0       Yes
16   France  56.0  168.0  83000.0        No

Preprocessed dataset:
          0         1         2         3         4         5         6  \
0 -0.755929  3.162278 -0.471405 -0.755929 -1.297594 -0.698702 -1.677291   
1 -0.755929 -0.31