In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('Data.csv') 
print(data)

# 1. Handle Missing Values
print("\n\nBefore handling missing values:\n", data.isnull().sum()) 


    Country   Age   Salary Purchased
0    France  44.0  72000.0        No
1     Spain  27.0  48000.0       Yes
2   Germany  30.0  54000.0        No
3     Spain  38.0  61000.0        No
4   Germany  40.0      NaN       Yes
5    France  35.0  58000.0       Yes
6     Spain   NaN  52000.0        No
7    France  48.0  79000.0       NaN
8   Germany  50.0  83000.0        No
9    France  37.0  67000.0       Yes
10   France   NaN  72000.0        No
11    Spain  27.0  48000.0       Yes
12  Germany  30.0  54000.0        No
13    Spain   NaN  61000.0        No
14  Germany  40.0     21.0       Yes
15   France  35.0  58000.0       Yes
16    Spain   NaN  52000.0        No
17   France  48.0      NaN       NaN
18  Germany  50.0  83000.0        No
19   France  37.0  67000.0       Yes


Before handling missing values:
 Country      0
Age          4
Salary       2
Purchased    2
dtype: int64


In [2]:

# Fill missing values with the mean of the 'Salary' column without using inplace
data['Salary'] = data['Salary'].fillna(data['Salary'].mean())

print("\nAfter handling missing values:\n", data.isnull().sum())



After handling missing values:
 Country      0
Age          4
Salary       0
Purchased    2
dtype: int64


In [3]:
# 2. Handle Outliers (Example: Using IQR)
Q1 = data['Salary'].quantile(0.25)
Q3 = data['Salary'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [4]:
# Filter out outliers
data = data[(data['Salary'] >= lower_bound) & (data['Salary'] <= upper_bound)]

In [5]:
# 4. Feature Scaling
scaler = StandardScaler()
data['Salary'] = scaler.fit_transform(data[['Salary']])

In [6]:
# 5. Split Data (If necessary)
X = data[['Salary']]  # Features
y = data['Purchased']  # Target variable

In [7]:
# Print the preprocessed data
print("\nPreprocessed Data:")
print(data.head())


Preprocessed Data:
   Country   Age    Salary Purchased
0   France  44.0  0.884790        No
1    Spain  27.0 -1.353944       Yes
2  Germany  30.0 -0.794260        No
3    Spain  38.0 -0.141296        No
4  Germany  40.0 -0.291473       Yes
