In [182]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif

1. Preprocessing

In [183]:
# Sample data with missing values
data = {'y': [2, 4, 6, 8],
        'feature1': [1, 2, np.nan, 4],
        'feature2': [4, np.nan, 6, 7]}
df = pd.DataFrame(data)

# Impute missing values using the mean
imputer = SimpleImputer(strategy='mean')
# imputer = SimpleImputer(strategy='median')
# imputer = SimpleImputer(strategy='most_frequent')
# imputer = SimpleImputer(strategy='constant', fill_value=0)
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
df


Unnamed: 0,y,feature1,feature2
0,2.0,1.0,4.0
1,4.0,2.0,5.666667
2,6.0,2.333333,6.0
3,8.0,4.0,7.0


In [184]:
# Remove rows with missing values
df_cleaned = df.dropna()

# Validate data types
df_cleaned.apply(pd.to_numeric, errors='coerce')

# Remove duplicates
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   y         4 non-null      float64
 1   feature1  4 non-null      float64
 2   feature2  4 non-null      float64
dtypes: float64(3)
memory usage: 224.0 bytes


2. Categorical Encoding

In [185]:
df_cleaned

Unnamed: 0,y,feature1,feature2
0,2.0,1.0,4.0
1,4.0,2.0,5.666667
2,6.0,2.333333,6.0
3,8.0,4.0,7.0


In [186]:
# Sample categorical data
data = {'category': ['A', 'B', 'C', 'A']}
df = pd.DataFrame(data)
df = pd.concat([df_cleaned, df], axis=1)
# df = pd.concat([df_cleaned, df]) # Concatenate df and df_cleaned vertically
df

Unnamed: 0,y,feature1,feature2,category
0,2.0,1.0,4.0,A
1,4.0,2.0,5.666667,B
2,6.0,2.333333,6.0,C
3,8.0,4.0,7.0,A


In [187]:
# One-Hot Encode categorical data
df = pd.get_dummies(df, columns=['category'])
df

Unnamed: 0,y,feature1,feature2,category_A,category_B,category_C
0,2.0,1.0,4.0,True,False,False
1,4.0,2.0,5.666667,False,True,False
2,6.0,2.333333,6.0,False,False,True
3,8.0,4.0,7.0,True,False,False


In [188]:
boolean_columns = ['category_A', 'category_B', 'category_C']
df[boolean_columns] = df[boolean_columns].astype(int)
df

Unnamed: 0,y,feature1,feature2,category_A,category_B,category_C
0,2.0,1.0,4.0,1,0,0
1,4.0,2.0,5.666667,0,1,0
2,6.0,2.333333,6.0,0,0,1
3,8.0,4.0,7.0,1,0,0


3. Feature Scaling

In [189]:
# Standardize features, normalize the range of independent variables or features of data
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df.iloc[:, :3]), columns=df.columns[:3])
df = pd.concat([df_scaled, df.iloc[:, 3:]], axis=1)
df

Unnamed: 0,y,feature1,feature2,category_A,category_B,category_C
0,-1.341641,-1.234427,-1.543033,1,0,0
1,-0.447214,-0.308607,0.0,0,1,0
2,0.447214,0.0,0.308607,0,0,1
3,1.341641,1.543033,1.234427,1,0,0


4. Feature Creation

In [190]:
# Create polynomial features, creates polynomial features (degree 2) from the existing features, which can help in capturing relationships between features.
poly = PolynomialFeatures(degree=2, include_bias=False)
df_poly = pd.DataFrame(poly.fit_transform(df.iloc[:, :3]), columns=poly.get_feature_names_out())
df_poly

Unnamed: 0,y,feature1,feature2,y^2,y feature1,y feature2,feature1^2,feature1 feature2,feature2^2
0,-1.341641,-1.234427,-1.543033,1.8,1.656157,2.070197,1.52381,1.904762,2.380952
1,-0.447214,-0.308607,0.0,0.2,0.138013,-0.0,0.095238,-0.0,0.0
2,0.447214,0.0,0.308607,0.2,0.0,0.138013,0.0,0.0,0.095238
3,1.341641,1.543033,1.234427,1.8,2.070197,1.656157,2.380952,1.904762,1.52381


In [191]:
df = pd.concat([df_poly, df.iloc[:, 3:]], axis=1)
df

Unnamed: 0,y,feature1,feature2,y^2,y feature1,y feature2,feature1^2,feature1 feature2,feature2^2,category_A,category_B,category_C
0,-1.341641,-1.234427,-1.543033,1.8,1.656157,2.070197,1.52381,1.904762,2.380952,1,0,0
1,-0.447214,-0.308607,0.0,0.2,0.138013,-0.0,0.095238,-0.0,0.0,0,1,0
2,0.447214,0.0,0.308607,0.2,0.0,0.138013,0.0,0.0,0.095238,0,0,1
3,1.341641,1.543033,1.234427,1.8,2.070197,1.656157,2.380952,1.904762,1.52381,1,0,0


In [192]:
# Manually create polynomial features (degree 2)
df['feature1_squared'] = df['feature1'] ** 2
df['feature2_squared'] = df['feature2'] ** 2
df['feature1_x_feature2'] = df['feature1'] * df['feature2']
df

Unnamed: 0,y,feature1,feature2,y^2,y feature1,y feature2,feature1^2,feature1 feature2,feature2^2,category_A,category_B,category_C,feature1_squared,feature2_squared,feature1_x_feature2
0,-1.341641,-1.234427,-1.543033,1.8,1.656157,2.070197,1.52381,1.904762,2.380952,1,0,0,1.52381,2.380952,1.904762
1,-0.447214,-0.308607,0.0,0.2,0.138013,-0.0,0.095238,-0.0,0.0,0,1,0,0.095238,0.0,-0.0
2,0.447214,0.0,0.308607,0.2,0.0,0.138013,0.0,0.0,0.095238,0,0,1,0.0,0.095238,0.0
3,1.341641,1.543033,1.234427,1.8,2.070197,1.656157,2.380952,1.904762,1.52381,1,0,0,2.380952,1.52381,1.904762


5. Feature Selection

In [193]:
# Selecting the best 2 features based on the ANOVA F-value between feature and target.
# The F-value is a statistic used to compare the variances between different groups, 
# and in the context of feature selection, it assesses whether the mean of the target variable differs significantly across the different values of the feature.
# Determine which features show a strong statistical relationship with the target variable.
X = df.drop('y', axis=1)
y = df['y']
selector = SelectKBest(f_classif, k=2) 
X_new = selector.fit_transform(X, y)
X_new

  msw = sswn / float(dfwn)
  msw = sswn / float(dfwn)


array([[ 2.38095238,  1.9047619 ],
       [ 0.        , -0.        ],
       [ 0.0952381 ,  0.        ],
       [ 1.52380952,  1.9047619 ]])