The data was provided as an Excel file. Part of the feature engineering had already been carried out before the data was handed over, so the complete feature engineering process cannot be traced from this code alone.

In [None]:

import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, mutual_info_regression

# Make the project folder importable so that `config` can be imported below.
# The folder location differs per machine, so it is selected by the Windows
# computer name (COMPUTERNAME environment variable).
PROJECT_DIRS = {
    'FYNN': r'C:\Users\Surface\Masterarbeit',    # name of surface PC
    'FYNNS-PC': r'C:\Users\test\Masterarbeit',   # desktop name
}

# .get() instead of [...]: if COMPUTERNAME is not set at all (e.g. on a
# non-Windows machine) this ends in the descriptive ValueError below instead
# of a bare KeyError.
computer_name = os.environ.get('COMPUTERNAME')
if computer_name not in PROJECT_DIRS:
    raise ValueError(f"Unbekannter Computername: {computer_name}")

# Append only once so that re-running this cell does not pile up duplicate
# entries in sys.path.
project_dir = PROJECT_DIRS[computer_name]
if project_dir not in sys.path:
    sys.path.append(project_dir)

from config import SAVE_PATH, RAW_DATA_PATH

Explore the data

In [None]:
# Load the raw production data from the Excel file given by RAW_DATA_PATH.
prod_data_df = pd.read_excel(RAW_DATA_PATH)

# Get some insights into the data: summary statistics ...
display(prod_data_df.describe())
# ... and the column overview. DataFrame.info() prints its report itself and
# returns None, so it must not be wrapped in display() (that would render an
# extra "None" below the report).
prod_data_df.info()

# Check for categorical variables, i.e. variables with a low number of unique values
number_unique = prod_data_df.nunique(axis=0).sort_values(ascending=False)
display(number_unique)

Separate the target from the independent features

Distribution of the target value

In [None]:
# Split the data set into the target series y and the feature matrix X.
y = prod_data_df["TARGET"]
X = prod_data_df.drop(columns=["TARGET"])

# Histogram of the target (50 bins) with a kernel density estimate on top.
fig, ax = plt.subplots()
sns.histplot(y, bins=50, kde=True, ax=ax)
ax.set_xlabel('Flange length [Pixel]')  # change the x-axis label here
fig.tight_layout()
target_plot_path = os.path.join(SAVE_PATH, "Target distribution.svg")
fig.savefig(target_plot_path, format='svg')
plt.show()

# Location and spread of the target values.
target_mean = y.mean()
target_std = y.std()
print(f'Mean: {target_mean}')
print(f'std deviation: {target_std}')

Correlation of the environmental features

In [None]:
# Column names of the environmental features: env_param_1 ... env_param_36.
env_columns = [f'env_param_{idx}' for idx in range(1, 37)]
df_env = X[env_columns]

# Pairwise correlation of the environmental features, shown as an annotated heatmap.
corr_env = df_env.corr()
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_env, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Korrelationsmatrix Environmental Features')
heatmap_path = os.path.join(SAVE_PATH, "Environmental Features correlation matrix.svg")
fig.savefig(heatmap_path, format='svg')
plt.show()

# Drop env_param_16 to env_param_36 (the last 21 entries of env_columns).
# NOTE(review): presumably these are removed because of what the heatmap
# shows - the reason is not stated here, confirm.
dropped_env_columns = env_columns[15:]
X_1 = X.drop(columns=dropped_env_columns)

Filter for Feature Selection

F_Regression for linear Relationship:

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression

Mutual information for any Relationship:

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html#sklearn.feature_selection.mutual_info_regression


In [None]:
# Filter-based feature selection: both methods score every feature in X_1
# against the target y. A feature is dropped only if BOTH scores are low.

# Seed for the mutual information estimate. Without a fixed random_state the
# scores (and therefore the set of dropped features) can differ between runs.
RANDOM_STATE = 42
# A feature counts as non-informative if it is below both thresholds.
MI_THRESHOLD = 0.03
F_THRESHOLD = 10

# f_regression returns the F-values and p-values for each feature
f_vals, p_vals = f_regression(X_1, y)

# calculate the mutual information scores for each feature
mi_vals = mutual_info_regression(X_1, y, n_jobs=-1, random_state=RANDOM_STATE)

# get the feature names
feature_names = X_1.columns

# create one dataframe per method, keyed by the feature name
feature_scores_MI = pd.DataFrame({
    'Feature': feature_names,
    'Mutual Information': mi_vals,
})
feature_scores_F = pd.DataFrame({
    'Feature': feature_names,
    'F_value': f_vals,
    'P_value': p_vals
})

# sort the feature scores by mutual information and F-value (highest first)
feature_scores_MI_sorted = feature_scores_MI.sort_values(by='Mutual Information', ascending=False)
feature_scores_F_sorted = feature_scores_F.sort_values(by='F_value', ascending=False)

# merge the two dataframes on the feature names to get a combined view of the feature importance
feature_scores = pd.merge(feature_scores_MI_sorted, feature_scores_F_sorted, on='Feature')

# identify features with low mutual information AND low F-value
# (vectorized mask instead of a row-by-row loop; row order of feature_scores is kept)
is_uninformative = (
    (feature_scores['Mutual Information'] < MI_THRESHOLD)
    & (feature_scores['F_value'] < F_THRESHOLD)
)
low_feature_scores = feature_scores.loc[is_uninformative, 'Feature'].tolist()

# Drop the non informative features
X_2 = X_1.drop(columns=low_feature_scores)