In [7]:
# Import necessary modules
import data_preprocessor as dp
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1 - read the raw CSV into memory.
messy_data = pd.read_csv('../Data/messy_data.csv')

# Step 2 - run the data_preprocessor stages, in order, on a copy of the raw
# frame so that `messy_data` stays available for before/after comparisons.
# Each stage receives the previous stage's return value as its first argument.
clean_data = (
    messy_data.copy()
    .pipe(dp.impute_missing_values, strategy='median')
    .pipe(dp.remove_duplicates)
    .pipe(dp.normalize_data, method='standard')
    .pipe(dp.remove_redundant_features)
)

# Step 3 - write the cleaned frame next to the raw file (no index column).
clean_data.to_csv('../Data/clean_data.csv', index=False)

# Step 4 - hand the cleaned frame to dp.simple_model, asking it to split,
# scale, and print its report.
# NOTE(review): the frame was already passed through
# normalize_data(method='standard') above; confirm scale_data=True is intended.
dp.simple_model(clean_data, split_data=True, scale_data=True, print_report=True)

Accuracy: 0.7644
Classification Report:
              precision    recall  f1-score   support

          -1       0.73      0.65      0.68        82
           0       0.79      0.84      0.81       126

    accuracy                           0.76       208
   macro avg       0.76      0.74      0.75       208
weighted avg       0.76      0.76      0.76       208

Read more about the classification report: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html and https://www.nb-data.com/p/breaking-down-the-classification


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [6]:
## Feature Checking and Summary Statistics ##
# Compares the raw frame (`messy_data`) with the preprocessed frame
# (`clean_data`): structure, total missing values, and row/column counts,
# then reports which highly correlated features were actually dropped.

# Absolute-correlation cutoff above which a feature is flagged as redundant.
# NOTE(review): dp.remove_redundant_features is not visible from this cell;
# confirm it applies the same cutoff.
CORR_THRESHOLD = 0.9

# Checking data structure and key statistics
print("Messy Dataset Summary:")
messy_data.info()

print("Cleaned Dataset Summary:")
clean_data.info()

# Count missing values before and after cleaning (summed over every column)
missing_before = messy_data.isnull().sum().sum()
missing_after = clean_data.isnull().sum().sum()

# Rows and columns that disappeared between the raw and cleaned frames
rows_removed = len(messy_data) - len(clean_data)
cols_removed = len(messy_data.columns) - len(clean_data.columns)

print(f"Total missing values before preprocessing: {missing_before}")
print(f"Total missing values after preprocessing: {missing_after}")
print(f"Number of rows removed: {rows_removed}")
print(f"Number of columns removed: {cols_removed}")

# Absolute pairwise correlations over the numeric columns of the raw data
numeric_data = messy_data.select_dtypes(include=[np.number])
corr_matrix = numeric_data.corr().abs()

# Keep only the strict upper triangle so each pair is considered once; a column
# is flagged when it correlates above the cutoff with any earlier column.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
correlated_features = [
    col for col in upper_tri.columns if (upper_tri[col] > CORR_THRESHOLD).any()
]

# Columns that are really gone from the cleaned frame. The correlation check
# above is only a re-derivation on the raw data, so a flagged column is
# reported as removed only if it is in fact absent from `clean_data`.
dropped_columns = [col for col in messy_data.columns if col not in clean_data.columns]
removed_features = [col for col in correlated_features if col in dropped_columns]

print(f"Number of features removed due to redundancy: {len(removed_features)}")
print(f"Removed Features: {removed_features}")

# Surface any dropped column the correlation check does not account for, so a
# mismatch with "Number of columns removed" is visible instead of silent.
other_dropped = [col for col in dropped_columns if col not in removed_features]
if other_dropped:
    print(f"Dropped columns not flagged by the correlation check: {other_dropped}")

Messy Dataset Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1196 entries, 0 to 1195
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  920 non-null    float64
 1   a       1193 non-null   object 
 2   b       1196 non-null   float64
 3   c       1196 non-null   int64  
 4   d       561 non-null    object 
 5   e       1196 non-null   object 
 6   f       1196 non-null   float64
 7   g       1196 non-null   object 
 8   h       1158 non-null   float64
 9   i       1087 non-null   object 
 10  j       591 non-null    float64
 11  k       1196 non-null   float64
 12  l       1196 non-null   float64
 13  m       778 non-null    object 
 14  n       1012 non-null   float64
 15  o       1008 non-null   float64
 16  p       1122 non-null   float64
 17  q       1009 non-null   float64
 18  r       1196 non-null   float64
 19  s       1196 non-null   object 
 20  t       1110 non-null   float64
 21  u       1079 n