In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Step 1: build a small synthetic dataset (fixed seed for repeatable draws).
np.random.seed(42)

# Entries are evaluated top to bottom, so each column consumes the global
# random stream in exactly the order written here.
data = dict(
    Feature1=np.random.normal(loc=50, scale=10, size=100),   # numerical
    Feature2=np.random.uniform(low=10, high=100, size=100),  # numerical
    Feature3=np.random.choice(['A', 'B', 'C'], size=100),    # categorical
    Feature4=np.random.randint(low=1, high=100, size=100),   # numerical (integers)
    Feature5=np.random.normal(loc=100, scale=20, size=100),  # numerical
    Target=np.random.choice([0, 1], size=100),               # binary target
)

In [3]:
# Turn the column dict into a DataFrame; each key becomes one column.
df = pd.DataFrame.from_dict(data, orient='columns')

In [4]:
# Blank out a few entries so the imputation steps have something to fill.
# Row labels are sampled without replacement: 10 for Feature1 first, then 5
# for Feature3 (the order matters because both draws share the global RNG).
feature1_missing_rows = np.random.choice(df.index, size=10, replace=False)
df.loc[feature1_missing_rows, 'Feature1'] = np.nan

feature3_missing_rows = np.random.choice(df.index, size=5, replace=False)
df.loc[feature3_missing_rows, 'Feature3'] = np.nan

In [5]:
# Preview the first rows of the dataset under a heading.
print("Original Dataset:", df.head(), sep="\n")

Original Dataset:
    Feature1   Feature2 Feature3  Feature4    Feature5  Target
0  54.967142  47.566990        B        68   86.620503       0
1  48.617357  29.989703        A         6   82.638315       1
2  56.476885  20.787883        B        96   89.099248       1
3  65.230299  40.385365        A        94   74.652354       1
4  47.658466  94.861873        C        47  102.205992       1


In [6]:
# 2. Handling missing values
print("\nHandling Missing Values:")

# Replace missing entries in the numerical columns with the column mean.
# All four columns are passed through the imputer and written back to df.
num_imputer = SimpleImputer(strategy='mean')
columns_to_impute = ['Feature1', 'Feature2', 'Feature4', 'Feature5']
df[columns_to_impute] = num_imputer.fit_transform(df[columns_to_impute])


Handling Missing Values:


In [7]:
# Fill missing entries of the categorical column with its most frequent value.
cat_imputer = SimpleImputer(strategy='most_frequent')
feature3_filled = cat_imputer.fit_transform(df[['Feature3']])
df[['Feature3']] = feature3_filled

print(df.head())

    Feature1   Feature2 Feature3  Feature4    Feature5  Target
0  54.967142  47.566990        B      68.0   86.620503       0
1  48.617357  29.989703        A       6.0   82.638315       1
2  56.476885  20.787883        B      96.0   89.099248       1
3  65.230299  40.385365        A      94.0   74.652354       1
4  47.658466  94.861873        C      47.0  102.205992       1


In [8]:
# 3. Scaling data
print("\nScaling Data:")

# Columns that the scaling and outlier steps operate on.
numerical_features = [
    'Feature1',
    'Feature2',
    'Feature4',
    'Feature5',
]


Scaling Data:


In [9]:
# Standardization: rescale each numerical column to zero mean and unit variance.
scaler = StandardScaler()
# fit_transform hands back a bare NumPy array, so df's index is passed
# explicitly. Without it the new frame gets a fresh 0..n-1 index and stops
# lining up with df row-for-row whenever df's index is not the default range
# (for example after rows have been filtered out).
df_standardized = pd.DataFrame(
    scaler.fit_transform(df[numerical_features]),
    columns=numerical_features,
    index=df.index,
)

In [10]:
# Normalization: min-max rescale each numerical column (default range 0 to 1).
min_max_scaler = MinMaxScaler()
# fit_transform hands back a bare NumPy array, so df's index is passed
# explicitly. Without it the new frame gets a fresh 0..n-1 index and stops
# lining up with df row-for-row whenever df's index is not the default range
# (for example after rows have been filtered out).
df_normalized = pd.DataFrame(
    min_max_scaler.fit_transform(df[numerical_features]),
    columns=numerical_features,
    index=df.index,
)


In [11]:
# Show the first rows of both scaled frames, each under its own heading.
for heading, scaled_frame in (
    ("\nStandardized Data:", df_standardized),
    ("\nNormalized Data:", df_normalized),
):
    print(heading)
    print(scaled_frame.head())



Standardized Data:
   Feature1  Feature2  Feature4  Feature5
0  0.705559 -0.239956  0.595575 -0.563783
1 -0.019816 -0.921168 -1.484743 -0.754549
2  0.878027 -1.277786  1.535073 -0.445038
3  1.877983 -0.518281  1.467966 -1.137116
4 -0.129356  1.592969 -0.109049  0.182840

Normalized Data:
   Feature1  Feature2  Feature4  Feature5
0  0.696879  0.420512  0.683673  0.325758
1  0.554890  0.221343  0.051020  0.291537
2  0.730639  0.117076  0.969388  0.347059
3  0.926376  0.339137  0.948980  0.222909
4  0.533448  0.956413  0.469388  0.459693


In [12]:
# 4. Handling noise
print("\nHandling Noise:")

# Add zero-mean Gaussian noise (standard deviation 5) to Feature1, one draw per row.
gaussian_noise = np.random.normal(loc=0, scale=5, size=len(df))
df['Feature1_noisy'] = df['Feature1'] + gaussian_noise


Handling Noise:


In [13]:
# Smooth with a 5-row trailing mean. Rows without a complete window come out
# as NaN from the rolling mean, so they fall back to the raw noisy value.
trailing_mean = df['Feature1_noisy'].rolling(window=5).mean()
df['Feature1_smoothed'] = trailing_mean.fillna(df['Feature1_noisy'])

In [14]:
# Compare the noisy and smoothed versions side by side.
print(df.loc[:, ['Feature1_noisy', 'Feature1_smoothed']].head())

   Feature1_noisy  Feature1_smoothed
0       58.170052          58.170052
1       44.543436          44.543436
2       65.711787          65.711787
3       66.560075          66.560075
4       46.591103          56.315291


In [15]:
# 5. Handling outliers
print("\nHandling Outliers:")

# Absolute z-score of every numerical value, computed against the mean and
# standard deviation of its own column.
numeric_block = df[numerical_features]
z_scores = ((numeric_block - numeric_block.mean()) / numeric_block.std()).abs()
outliers = z_scores.gt(3)  # flag values more than 3 standard deviations from the mean

# Per-column count of flagged values.
print("\nOutliers Detected:")
print(outliers.sum())


Handling Outliers:

Outliers Detected:
Feature1    0
Feature2    0
Feature4    0
Feature5    1
dtype: int64


In [16]:
# Keep only the rows whose z-score is within the threshold in every numerical column.
rows_within_threshold = z_scores.le(3).all(axis=1)
df_no_outliers = df[rows_within_threshold]

print("\nData After Removing Outliers:")
print(df_no_outliers.shape)


Data After Removing Outliers:
(99, 8)


In [17]:
# 6. Feature selection
print("\nFeature Selection:")

# Split into label and predictors; the derived noisy/smoothed columns are left out.
# NOTE(review): both are taken from `df`, not `df_no_outliers`, so any rows
# filtered out in the outlier step are still present here — confirm that is intended.
y = df['Target']
X = df.drop(columns=['Target', 'Feature1_noisy', 'Feature1_smoothed'])


Feature Selection:


In [18]:
# Filter method: mutual information between each numerical feature and the target.
# NOTE(review): no random_state is passed to mutual_info_classif, so the scores
# presumably depend on the global NumPy RNG state at this point — confirm.
numeric_X = X.select_dtypes(include=['float64', 'int64'])
mi_scores = pd.Series(mutual_info_classif(numeric_X, y), index=numeric_X.columns)

print("\nMutual Information Scores:")
print(mi_scores.sort_values(ascending=False))


Mutual Information Scores:
Feature5    0.022682
Feature2    0.021619
Feature1    0.000000
Feature4    0.000000
dtype: float64


In [20]:
# Wrapper method: recursive feature elimination (RFE) on the numerical columns.
# This rebinds `numerical_features` to a pandas Index of column labels.
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns

# Logistic regression is the estimator RFE fits while narrowing down to three features.
# NOTE(review): the features are passed in unscaled; if RFE ranks by coefficient
# size, the ranking is sensitive to each column's scale — confirm this is acceptable.
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=3)
rfe.fit(X.loc[:, numerical_features], y)

# Report the columns RFE kept.
print("\nSelected Features by RFE:")
print(numerical_features[rfe.support_])



Selected Features by RFE:
Index(['Feature1', 'Feature2', 'Feature5'], dtype='object')


In [21]:
# Embedded method: Lasso regression fitted on the numerical columns against the target.
lasso_inputs = X.select_dtypes(include=['float64', 'int64'])
lasso = Lasso(alpha=0.1)
lasso.fit(lasso_inputs, y)
lasso_coefficients = pd.Series(lasso.coef_, index=lasso_inputs.columns)

# Show only the features whose coefficient is not exactly zero, largest first.
print("\nLasso Coefficients:")
nonzero_coefficients = lasso_coefficients.loc[lasso_coefficients.ne(0)]
print(nonzero_coefficients.sort_values(ascending=False))


Lasso Coefficients:
Feature1    0.002519
Feature5    0.002516
Feature2    0.002454
Feature4    0.000269
dtype: float64
