In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Sample dataset
data = {
    'Age': [25, 30, np.nan, 35, 40, 20, 60, 55, 40, np.nan],
    'Salary': [50000, 60000, 55000, np.nan, 75000, 80000, 90000, 120000, 95000, 105000],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male'],
    'Experience': [1, 5, 3, 7, 10, 2, 12, 9, 8, 4],
    'Purchased': [1, 0, 1, 0, 1, 1, 1, 0, 1, 0]  # Target variable (binary classification)
}

df = pd.DataFrame(data)
df

Unnamed: 0,Age,Salary,Gender,Experience,Purchased
0,25.0,50000.0,Male,1,1
1,30.0,60000.0,Female,5,0
2,,55000.0,Female,3,1
3,35.0,,Male,7,0
4,40.0,75000.0,Female,10,1
5,20.0,80000.0,Female,2,1
6,60.0,90000.0,Male,12,1
7,55.0,120000.0,Male,9,0
8,40.0,95000.0,Female,8,1
9,,105000.0,Male,4,0


In [8]:
df.isna().sum()

Age           2
Salary        1
Gender        0
Experience    0
Purchased     0
dtype: int64

In [12]:

# 1. Data Cleaning: Handling Missing Values
# Fill missing values with the median for numerical columns
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [13]:
# Alternatively, you can drop rows with missing values
# df.dropna(inplace=True)

# 2. Removing Outliers
# Using Z-score to remove outliers
from scipy import stats
z_scores = np.abs(stats.zscore(df[['Age', 'Salary', 'Experience']]))
print(z_scores)
df = df[(z_scores < 3).all(axis=1)]  # Keep rows where z-score is less than 3

        Age    Salary  Experience
0  1.119899  1.462980    1.466753
1  0.689169  0.991051    0.316359
2  0.043073  1.227016    0.891556
3  0.258438  0.047193    0.258839
4  0.172292  0.283158    1.121635
5  1.550630  0.047193    1.179154
6  1.895214  0.424736    1.696832
7  1.464484  1.840524    0.834036
8  0.172292  0.660701    0.546437
9  0.043073  1.132630    0.603957


In [17]:
# 3. Feature Selection: Using SelectKBest to choose the most relevant features
X = df[['Age', 'Salary', 'Experience']]  # Features
y = df['Purchased']  # Target
selector = SelectKBest(score_func=f_classif, k=2)  # Selecting 2 best features
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
selected_features
print("Selected Features:", selected_features)

Selected Features: Index(['Age', 'Salary'], dtype='object')


In [None]:
# 4. Feature Creation: Creating a new feature (e.g., 'Age * Experience')
df['Age_Experience'] = df['Age'] - df['Experience']
df

Unnamed: 0,Age,Salary,Gender,Experience,Purchased,Age_Experience
0,25.0,50000.0,Male,1,1,25.0
1,30.0,60000.0,Female,5,0,150.0
2,37.5,55000.0,Female,3,1,112.5
3,35.0,80000.0,Male,7,0,245.0
4,40.0,75000.0,Female,10,1,400.0
5,20.0,80000.0,Female,2,1,40.0
6,60.0,90000.0,Male,12,1,720.0
7,55.0,120000.0,Male,9,0,495.0
8,40.0,95000.0,Female,8,1,320.0
9,37.5,105000.0,Male,4,0,150.0


In [22]:
# 5. Feature Encoding: Converting categorical variables to numeric
# Using LabelEncoder for the 'Gender' column
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

In [23]:
df

Unnamed: 0,Age,Salary,Gender,Experience,Purchased,Age_Experience
0,25.0,50000.0,1,1,1,25.0
1,30.0,60000.0,0,5,0,150.0
2,37.5,55000.0,0,3,1,112.5
3,35.0,80000.0,1,7,0,245.0
4,40.0,75000.0,0,10,1,400.0
5,20.0,80000.0,0,2,1,40.0
6,60.0,90000.0,1,12,1,720.0
7,55.0,120000.0,1,9,0,495.0
8,40.0,95000.0,0,8,1,320.0
9,37.5,105000.0,1,4,0,150.0


In [25]:
# 6. Scaling/Normalization: Standardizing features for model compatibility
scaler = StandardScaler()
df[['Age', 'Salary', 'Experience', 'Age_Experience']] = scaler.fit_transform(df[['Age', 'Salary', 'Experience', 'Age_Experience']])
df

Unnamed: 0,Age,Salary,Gender,Experience,Purchased,Age_Experience
0,-1.119899,-1.46298,1,-1.466753,1,-1.148382
1,-0.689169,-0.991051,0,-0.316359,0,-0.55213
2,-0.043073,-1.227016,0,-0.891556,1,-0.731006
3,-0.258438,-0.047193,1,0.258839,0,-0.098978
4,0.172292,-0.283158,0,1.121635,1,0.640375
5,-1.55063,-0.047193,0,-1.179154,1,-1.076832
6,1.895214,0.424736,1,1.696832,1,2.166782
7,1.464484,1.840524,1,0.834036,0,1.093527
8,0.172292,0.660701,0,0.546437,1,0.258774
9,-0.043073,1.13263,1,-0.603957,0,-0.55213


In [26]:
# Display the cleaned and processed data
print("\nProcessed Data:\n", df)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[['Age', 'Salary', 'Experience', 'Age_Experience', 'Gender']], y, test_size=0.3, random_state=42)

# Display the train and test data
print("\nTraining Features:\n", X_train)
print("\nTest Features:\n", X_test)


Processed Data:
         Age    Salary  Gender  Experience  Purchased  Age_Experience
0 -1.119899 -1.462980       1   -1.466753          1       -1.148382
1 -0.689169 -0.991051       0   -0.316359          0       -0.552130
2 -0.043073 -1.227016       0   -0.891556          1       -0.731006
3 -0.258438 -0.047193       1    0.258839          0       -0.098978
4  0.172292 -0.283158       0    1.121635          1        0.640375
5 -1.550630 -0.047193       0   -1.179154          1       -1.076832
6  1.895214  0.424736       1    1.696832          1        2.166782
7  1.464484  1.840524       1    0.834036          0        1.093527
8  0.172292  0.660701       0    0.546437          1        0.258774
9 -0.043073  1.132630       1   -0.603957          0       -0.552130

Training Features:
         Age    Salary  Experience  Age_Experience  Gender
0 -1.119899 -1.462980   -1.466753       -1.148382       1
7  1.464484  1.840524    0.834036        1.093527       1
2 -0.043073 -1.227016   -0.8