# 1. Setup

In [1]:
# Structures to make easier for us to perform the procedures
from src.entities import EnvironmentConfiguration
environment_configuration = EnvironmentConfiguration()

In [2]:
# Create folders
import os
os.makedirs(environment_configuration.artifacts_folder, exist_ok= True)
os.makedirs(environment_configuration.eda_folder, exist_ok= True)
os.makedirs(environment_configuration.two_way_tables_folder, exist_ok= True)
os.makedirs(environment_configuration.plots_folder, exist_ok= True)
os.makedirs(environment_configuration.univariate_plots_folder, exist_ok= True)
os.makedirs(environment_configuration.bivariate_plots_folder, exist_ok= True)
os.makedirs(environment_configuration.y_data_profiling_folder, exist_ok= True)
os.makedirs(environment_configuration.pipelines_folder, exist_ok= True)

In [3]:
# Import raw data:
import pandas as pd
raw_data = pd.read_csv(environment_configuration.raw_data_folder)

In [8]:
raw_data

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


# 2. Explore the data to gain insights.


## 1. Data Quality

In [5]:
# Pandas profiling
from ydata_profiling import ProfileReport 

profile = ProfileReport(raw_data, explorative=True)

report_path = (environment_configuration.y_data_profiling_file)
profile.to_file(report_path)

print(f"Report saved to {report_path}")

  from .autonotebook import tqdm as notebook_tqdm
Summarize dataset: 100%|██████████| 60/60 [00:16<00:00,  3.66it/s, Completed]                             
Generate report structure: 100%|██████████| 1/1 [00:08<00:00,  8.96s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.34s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 62.50it/s]

Report saved to artifacts\exploratory_data_analysys\y_data_profiling\data_profiling.html





In [None]:
# Unique Values
from src.eda.unique_values import export_unique_values_to_excel
export_unique_values_to_excel(raw_data, environment_configuration)

Excel file created successfully: artifacts\exploratory_data_analysys\unique_values_spreadsheet.xlsx


In [6]:
# Unique Values Count
raw_data.nunique()

age                  73
workclass             9
fnlwgt            21648
education            16
education.num        16
marital.status        7
occupation           15
relationship          6
race                  5
sex                   2
capital.gain        119
capital.loss         92
hours.per.week       94
native.country       42
income                2
dtype: int64

In [10]:
# Check missing values:
raw_data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [None]:
# Check pandas dataframe metadata
raw_data.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

The dtypes are correct, according to the reality

## 2. Study each attribute and its characteristics:


Now we can define the feature definition variable that will be fed to the code which is going to perform the data Visualization 

In [11]:
from src.entities import FeatureDefinition
feature_defintion = FeatureDefinition(
    type_of_task= "binary_classification",
    data_frame = raw_data,
    numeric = ["age", "fnlwgt", "capital.gain", "capital.loss", "hours.per.week"],
    categorical_ordinals = ["education.num",],
    categorical_binary = ["income", "sex"],
    categorical_nominals= ["workclass", "marital.status", "occupation", "relationship", "race", "native.country"]
)

## 1. Visualize the data.


In [None]:
# Bivariate and Univariate Plots 
from src.eda.plots import InteractiveEDAPlotter
interactive_eda_plotter = InteractiveEDAPlotter(env_config= environment_configuration,
                                              feature_def= feature_defintion)
interactive_eda_plotter.perform_eda()

The conclusions are in `Data_Exploration_Conclusions.md`, please refer to this file.