In [29]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif

1. Handling Missing Values

In [39]:
# Toy frame with one missing entry in each column.
data = {'feature1': [1, 2, np.nan, 4],
        'feature2': [4, np.nan, 6, 7]}
df = pd.DataFrame(data)

# Replace every NaN with the mean of the observed values in its column.
# Other strategies SimpleImputer accepts:
#   strategy='median'                  -> column median
#   strategy='most_frequent'           -> most common value in the column
#   strategy='constant', fill_value=0  -> a fixed fill value
imputer = SimpleImputer(strategy='mean')

# By default fit_transform hands back a plain NumPy array, so the labels are
# re-attached here. Passing index=df.index (not just the columns) keeps the
# imputed rows aligned with the input even when the frame does not carry the
# default 0..n-1 index (e.g. after filtering or a train/test split).
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)
df_imputed


Unnamed: 0,feature1,feature2
0,1.0,4.0
1,2.0,5.666667
2,2.333333,6.0
3,4.0,7.0


2. Categorical Encoding

In [31]:
# One categorical column holding three distinct labels ('A' occurs twice).
data = dict(category=['A', 'B', 'C', 'A'])
df = pd.DataFrame(data=data)
df

Unnamed: 0,category
0,A
1,B
2,C
3,A


In [32]:
# Expand 'category' into one indicator column per distinct label
# (category_A, category_B, category_C); the source column itself is dropped.
df_encoded = pd.get_dummies(data=df, columns=['category'])
df_encoded

Unnamed: 0,category_A,category_B,category_C
0,True,False,False
1,False,True,False
2,False,False,True
3,True,False,False


3. Feature Scaling

In [40]:
# Sample data: two numeric columns on different scales.
data = [[0, 10], [1, 20], [2, 30]]
df = pd.DataFrame(data, columns=['feature1', 'feature2'])

# Standardize each column to zero mean and unit variance (z-scores).
# This puts features on a comparable scale; it does not squeeze them into a
# fixed range such as [0, 1].
scaler = StandardScaler()

# Re-attach both the column names and the original row index so the scaled
# frame stays aligned with df even if df has a non-default index.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

print(df_scaled)

   feature1  feature2
0 -1.224745 -1.224745
1  0.000000  0.000000
2  1.224745  1.224745


4. Feature Creation

In [34]:
# Sample data: two numeric features.
data = {'feature1': [1, 2, 3, 4],
        'feature2': [4, 3, 2, 1]}
df = pd.DataFrame(data)

# Degree-2 polynomial expansion: keeps the original features and adds their
# squares and pairwise product, which can help a linear model capture
# interactions between features. include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Column names come from the fitted transformer; the row index is carried over
# from df so the expanded frame stays aligned with the input rows.
df_poly = pd.DataFrame(
    poly.fit_transform(df),
    columns=poly.get_feature_names_out(),
    index=df.index,
)
df_poly

Unnamed: 0,feature1,feature2,feature1^2,feature1 feature2,feature2^2
0,1.0,4.0,1.0,4.0,16.0
1,2.0,3.0,4.0,6.0,9.0
2,3.0,2.0,9.0,6.0,4.0
3,4.0,1.0,16.0,4.0,1.0


In [35]:
# Hand-rolled version of the degree-2 expansion: append both squares and the
# pairwise product as new columns on the existing frame (df is modified).
df['feature1_squared'] = df['feature1'].pow(2)
df['feature2_squared'] = df['feature2'].pow(2)
df['feature1_x_feature2'] = df['feature1'].mul(df['feature2'])
df

Unnamed: 0,feature1,feature2,feature1_squared,feature2_squared,feature1_x_feature2
0,1,4,1,16,4
1,2,3,4,9,6
2,3,2,9,4,6
3,4,1,16,1,4


5. Feature Selection

In [36]:
# Sample data: three samples with three features each, and a binary target.
X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
y = [0, 1, 0]

# Keep the k=2 features with the highest ANOVA F-score against the target.
# For each feature, the F-test compares between-class variance to within-class
# variance, i.e. it asks whether the feature's mean differs across the target
# classes. A high score suggests a strong statistical relationship with y.
# NOTE: in this toy data every feature has the same mean in class 0 and
# class 1 (e.g. (1 + 7) / 2 == 4), so all F-scores tie and which two columns
# survive is arbitrary; use data with real class separation to see a
# meaningful ranking.
selector = SelectKBest(score_func=f_classif, k=2)
X_new = selector.fit(X, y).transform(X)
X_new

array([[2, 3],
       [5, 6],
       [8, 9]])