"""<br>
    @Author: Deven Gupta<br>
    @Date: 25-09-2024<br>
    @Last Modified by: Deven Gupta<br>
    @Last Modified time: 25-09-2024<br>
    @Title : Python Program for Data Preprocessing<br>
<br>
"""

In [1]:
import numpy as np
import pandas as pd

1. Load the Dataset

In [34]:
url = 'https://drive.google.com/uc?id=1NKMy-zIT3tfpNLnA7G0EmPxgZe0OPXp_'
df = pd.read_csv(url)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


2. Handling Missing Data

In [35]:
# Option 1: Fill missing values (e.g., with the mean for numerical columns)
# Filling missing values for 'Age' with the mean and 'Salary' with the mean
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)


# # Option 2: Drop rows with missing values
# df.dropna(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)


In [36]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


3. Handling categorical data

In [29]:
# Other Methods

# from sklearn.preprocessing import OneHotEncoder,LabelEncoder

# #One Hot Encoding
# encoder = OneHotEncoder(sparse_output=False, drop='first')
# country_encoded = encoder.fit_transform(df[['Country']])
# country_encoded_df = pd.DataFrame(country_encoded, columns=encoder.get_feature_names_out(['Country']))
# df_final = pd.concat([df, country_encoded_df], axis=1).drop('Country', axis=1)


# # Labelled Enconding
# label_encoder = LabelEncoder()
# df['Purchased'] = label_encoder.fit_transform(df['Purchased'])
# df

In [37]:
# One-hot encoding for categorical columns
df = pd.get_dummies(df, columns=['Country', 'Purchased'], drop_first=True)
df

# # Laballed encoding for categorical columns
# df['Purchased_encode'], uniques = pd.factorize(df['Purchased'])
# df

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain,Purchased_Yes
0,44.0,72000.0,False,False,False
1,27.0,48000.0,False,True,True
2,30.0,54000.0,True,False,False
3,38.0,61000.0,False,True,False
4,40.0,63777.777778,True,False,True
5,35.0,58000.0,False,False,True
6,38.777778,52000.0,False,True,False
7,48.0,79000.0,False,False,True
8,50.0,83000.0,True,False,False
9,37.0,67000.0,False,False,True


4. Split the dataset into training set and test set


In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Split the dataset into training set and test set
X = df.drop('Purchased_Yes', axis=1)  # Features
y = df['Purchased_Yes']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
# Before Scaling
display(X_train)
display(X_test)

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain
5,35.0,58000.0,False,False
0,44.0,72000.0,False,False
7,48.0,79000.0,False,False
2,30.0,54000.0,True,False
9,37.0,67000.0,False,False
4,40.0,63777.777778,True,False
3,38.0,61000.0,False,True
6,38.777778,52000.0,False,True


Unnamed: 0,Age,Salary,Country_Germany,Country_Spain
8,50.0,83000.0,True,False
1,27.0,48000.0,False,True


5. Feature scaling

In [43]:
from sklearn.preprocessing import StandardScaler

In [44]:
# Step 4: Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [45]:
# After Scaling
display(X_train)
display(X_test)

array([[-0.7529426 , -0.62603778, -0.57735027, -0.57735027],
       [ 1.00845381,  1.01304295, -0.57735027, -0.57735027],
       [ 1.79129666,  1.83258331, -0.57735027, -0.57735027],
       [-1.73149616, -1.09434656,  1.73205081, -0.57735027],
       [-0.36152118,  0.42765698, -0.57735027, -0.57735027],
       [ 0.22561096,  0.05040824,  1.73205081, -0.57735027],
       [-0.16581046, -0.27480619, -0.57735027,  1.73205081],
       [-0.01359102, -1.32850095, -0.57735027,  1.73205081]])

array([[ 2.18271808,  2.30089209,  1.73205081, -0.57735027],
       [-2.3186283 , -1.79680973, -0.57735027,  1.73205081]])

In [46]:
# Display the processed data
print("Training Features:\n", X_train)
print("Test Features:\n", X_test)
print("Training Labels:\n", y_train)
print("Test Labels:\n", y_test)

Training Features:
 [[-0.7529426  -0.62603778 -0.57735027 -0.57735027]
 [ 1.00845381  1.01304295 -0.57735027 -0.57735027]
 [ 1.79129666  1.83258331 -0.57735027 -0.57735027]
 [-1.73149616 -1.09434656  1.73205081 -0.57735027]
 [-0.36152118  0.42765698 -0.57735027 -0.57735027]
 [ 0.22561096  0.05040824  1.73205081 -0.57735027]
 [-0.16581046 -0.27480619 -0.57735027  1.73205081]
 [-0.01359102 -1.32850095 -0.57735027  1.73205081]]
Test Features:
 [[ 2.18271808  2.30089209  1.73205081 -0.57735027]
 [-2.3186283  -1.79680973 -0.57735027  1.73205081]]
Training Labels:
 5     True
0    False
7     True
2    False
9     True
4     True
3    False
6    False
Name: Purchased_Yes, dtype: bool
Test Labels:
 8    False
1     True
Name: Purchased_Yes, dtype: bool
