In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

1. Preprocessing

In [None]:
# Toy dataset: a target column `y` plus two features that each contain one gap
data = {
    'y': [2, 4, 6, 8],
    'feature1': [1, 2, np.nan, 4],
    'feature2': [4, np.nan, 6, 7],
}
df = pd.DataFrame(data)

# Fill every gap with the mean of its column.
# Alternative strategies accepted by SimpleImputer:
#   strategy='median', strategy='most_frequent',
#   strategy='constant' together with fill_value=0
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df)
df = pd.DataFrame(imputed_values, columns=df.columns)
df

In [None]:
# Validate data types first: coerce every column to numeric so that values
# which cannot be parsed become NaN. `apply` returns a new frame, so the
# result must be assigned for the coercion to take effect.
df_cleaned = df.apply(pd.to_numeric, errors='coerce')

# Remove rows with missing values, including any NaN produced by the coercion
df_cleaned = df_cleaned.dropna()

# Remove duplicates, then renumber the rows 0..n-1 so that later column-wise
# concatenations (axis=1), which align on index labels, still line up
df_cleaned = df_cleaned.drop_duplicates().reset_index(drop=True)
df_cleaned.info()

2. Categorical Encoding

In [None]:
# Show the cleaned numeric frame before the categorical column is attached
df_cleaned

In [None]:
# Toy categorical column, one label per row
data = {'category': ['A', 'B', 'C', 'A']}
category_frame = pd.DataFrame(data)

# Place the cleaned numeric columns and the labels side by side (axis=1);
# rows are matched on their index labels.
# Leaving out axis=1 would stack the two frames vertically instead.
df = pd.concat([df_cleaned, category_frame], axis=1)
df

In [None]:
# One-hot encoding: replace `category` with one indicator column per label
categorical_columns = ['category']
df = pd.get_dummies(df, columns=categorical_columns)
df

In [None]:
# Cast the one-hot indicator columns to integers
boolean_columns = ['category_A', 'category_B', 'category_C']
df = df.astype(dict.fromkeys(boolean_columns, int))
df

3. Feature Scaling

In [None]:
# Standardize the independent variables only: each feature is rescaled to
# zero mean and unit variance. The target `y` is not a feature, so it is left
# untouched. Columns are selected by name instead of by position so the
# target can never slip into the scaler.
feature_columns = ['feature1', 'feature2']
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,
)

# Swap the scaled values into the existing feature columns; the column order
# (y, feature1, feature2, category_*) stays the same.
df = df.assign(**{column: df_scaled[column] for column in feature_columns})
df

4. Feature Creation

In [None]:
# Create degree-2 polynomial features (squares and pairwise products) from the
# existing features, which can help capture relationships between them.
# Only the independent variables are expanded: building terms such as y^2 or
# y*feature1 would leak the target into the feature matrix.
feature_columns = ['feature1', 'feature2']
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_values = poly.fit_transform(df[feature_columns])
df_poly = pd.DataFrame(
    poly_values,
    columns=poly.get_feature_names_out(),
    index=df.index,
)

# Carry the unmodified target along as the first column so it stays in the
# frame when the expanded features are merged back with the other columns.
df_poly.insert(0, 'y', df['y'])
df_poly

In [None]:
# Merge the polynomial block with the one-hot columns. The one-hot columns are
# picked by their `category_` prefix rather than by position, so re-running
# this cell does not pull already-merged columns in a second time.
category_columns = [name for name in df.columns if str(name).startswith('category_')]
df = pd.concat([df_poly, df[category_columns]], axis=1)
df

In [None]:
# Hand-written degree-2 terms: both squares and the cross product
df = df.assign(
    feature1_squared=df['feature1'].pow(2),
    feature2_squared=df['feature2'].pow(2),
    feature1_x_feature2=df['feature1'].mul(df['feature2']),
)
df

5. Feature Selection

In [None]:
# Select the best 2 features by their univariate F-statistic against the target.
# `y` is a continuous target in which every value is distinct, so the
# regression score function is used: f_regression derives each feature's
# F-statistic from its linear correlation with y.
# f_classif (ANOVA) is meant for class labels: it treats every distinct y value
# as its own group and needs several samples per group to estimate the
# within-group variance, which this target cannot provide.
X = df.drop(columns='y')
y = df['y']
selector = SelectKBest(f_regression, k=2)
X_new = selector.fit_transform(X, y)
X_new