# Data Exploration Module Test

# TODO:
    - head (/)
    - info (/)
    - matrix (missing values) (/)
    - bar (missing values) (/)
    - imports Jupyter Notebook (/)
    - fix plot resolution (/)

    - data dict (types, e.g. nominal, categorial)
    - box (numeric, deviation)
    - bar/mosaic/ (categorial, deviation)
    - predictor/feature correlation (heatmap/scatter)
    - histogram (skewed/deviation)


In [1]:
import idstools.data_explorer as idsde

In [2]:
test_data = "../data/BikeRentalDaily_test.csv"
train_data = "../data/BikeRentalDaily_train.csv"

In [3]:
data_explorer_config = {"path": train_data, "type": "csv", "separator": ";"}

In [4]:
data_explorer = idsde.DataExplorer(input_path=train_data, output_path="results")

2024-02-01 21:14:47,358 [data_explorer] [INFO] - Initializing DataExplorer
2024-02-01 21:14:47,361 [_helpers] [INFO] - Reading csv file:
../data/BikeRentalDaily_train.csv
2024-02-01 21:14:47,367 [data_explorer] [INFO] - Using output path: results
2024-02-01 21:14:47,368 [data_explorer] [INFO] - Using pipeline:
{}



In [5]:
data_explorer.descriptive_analysis()
data_explorer.data.info()

2024-02-01 21:14:47,382 [data_explorer] [INFO] - Head of BikeRentalDaily_train
                          0           1           2           3           4
instant                 154         685         368         472         442
dteday           03.06.2011  15.11.2012  03.01.2012  16.04.2012  17.03.2012
season                  2.0         4.0         1.0         2.0         1.0
yr                        0           1           1           1           1
mnth                      6          11           1           4           3
holiday                   0           0           0           1           0
weekday                   5           4           2           1          -1
workingday                1           1           1           0           0
weathersit                1           2           1           1           2
temp                   24.8       12.87         6.0       26.57       20.57
atemp                  0.59        0.32        0.13        0.61        0.51
hum      

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   instant          600 non-null    int64  
 1   dteday           600 non-null    object 
 2   season           538 non-null    float64
 3   yr               600 non-null    int64  
 4   mnth             600 non-null    int64  
 5   holiday          600 non-null    int64  
 6   weekday          600 non-null    int64  
 7   workingday       600 non-null    int64  
 8   weathersit       600 non-null    int64  
 9   temp             600 non-null    float64
 10  atemp            600 non-null    float64
 11  hum              566 non-null    float64
 12  windspeed        600 non-null    float64
 13  leaflets         600 non-null    int64  
 14  price reduction  600 non-null    int64  
 15  casual           600 non-null    int64  
 16  registered       600 non-null    int64  
 17  cnt             

# Data Preparation Module Test

In [6]:
import idstools.data_preparation as dp

In [7]:
test_data = "../data/BikeRentalDaily_test.csv"
train_data = "../data/BikeRentalDaily_train.csv"

In [8]:
data_preparation = dp.DataPreparation(input_path=train_data, output_path="results")


2024-02-01 21:14:47,747 [data_preparation] [INFO] - Initializing DataPreparation
2024-02-01 21:14:47,749 [_helpers] [INFO] - Reading csv file:
../data/BikeRentalDaily_train.csv
2024-02-01 21:14:47,752 [data_preparation] [INFO] - Using output path: results
2024-02-01 21:14:47,753 [data_preparation] [INFO] - Using pipeline:
{}



In [9]:
import pandas as pd

def get_wday_by_date(df, date_column, weekday_column):
    # Define the weekday shift
    weekday_shift = {
        6: 0,
        0: 1,
        1: 2,
        2: 3,
        3: 4,
        4: 5,
        5: 6
    }

    # Convert the date column to datetime
    df[date_column] = pd.to_datetime(df[date_column], format="%d.%m.%Y")

    # Calculate the weekday and map it
    df[weekday_column] = df[date_column].dt.dayofweek.map(weekday_shift)

    return df

In [10]:
pipeline_config = {
        "_SimpleImputer": [
            {
                "target": "hum",
                "config": {
                    "strategy": "mean"
                }
            }
        ],
        "_GenericDataFrameTransformer": [
        {
            "transform_func": get_wday_by_date,
            "config": {
                "date_column": "dteday",
                "weekday_column": "weekday"
            }
        }
    ]
}

In [11]:
pipeline = data_preparation.build_pipeline(config=pipeline_config)
pipeline

2024-02-01 21:14:47,779 [data_preparation] [INFO] - Pipeline created.


In [12]:
processed_data = data_preparation.run_pipeline(config=pipeline_config)

2024-02-01 21:14:47,797 [data_preparation] [INFO] - Pipeline step _SimpleImputer has been processed.
2024-02-01 21:14:47,802 [data_preparation] [INFO] - Pipeline step _GenericDataFrameTransformer has been processed.


In [13]:
processed_data.head(5).T

Unnamed: 0,0,1,2,3,4
instant,154,685,368,472,442
dteday,2011-06-03 00:00:00,2012-11-15 00:00:00,2012-01-03 00:00:00,2012-04-16 00:00:00,2012-03-17 00:00:00
season,2.0,4.0,1.0,2.0,1.0
yr,0,1,1,1,1
mnth,6,11,1,4,3
holiday,0,0,0,1,0
weekday,5,4,2,1,6
workingday,1,1,1,0,0
weathersit,1,2,1,1,2
temp,24.8,12.87,6.0,26.57,20.57


In [14]:
processed_data.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
instant,600.0,363.12,1.0,181.25,362.5,538.25,731.0,208.71
dteday,600.0,2011-12-29 02:48:00,2011-01-01 00:00:00,2011-06-30 06:00:00,2011-12-28 12:00:00,2012-06-21 06:00:00,2012-12-31 00:00:00,
season,538.0,2.44,1.0,1.0,2.0,3.0,4.0,1.11
yr,600.0,0.5,0.0,0.0,0.0,1.0,1.0,0.5
mnth,600.0,6.47,1.0,4.0,6.0,9.0,12.0,3.44
holiday,600.0,0.03,0.0,0.0,0.0,0.0,1.0,0.17
weekday,600.0,3.03,0.0,1.0,3.0,5.0,6.0,2.01
workingday,600.0,0.68,0.0,0.0,1.0,1.0,1.0,0.47
weathersit,600.0,1.4,1.0,1.0,1.0,2.0,3.0,0.54
temp,600.0,19.81,2.37,13.57,20.1,26.06,34.47,7.21


In [15]:
from idstools._config import _idstools

In [16]:
_idstools["default"]["data_explorer"]["DataExplorer"]["input_path"]

'data/BikeRentalDaily_train.csv'

In [17]:
_idstools.default.data_explorer.DataExplorer.input_path

'data/BikeRentalDaily_train.csv'

## Module Configuration

In [18]:
from idstools.data_explorer import DataExplorer
from idstools._config import _idstools, pprint_dynaconf

We have multiple options to configure the DataExplorer to analyze the BikeRentalDaily_train.csv data.

- Load the default set of parameters and adjust them to our needs. In this case all possible parameters are initialized and can be set according the the exploration steps that should be done. 

- Initialize the class with in cell defined configuration.

In [19]:
pprint_dynaconf(_idstools, notebook=True)

```yaml
DEFAULT:
  data_explorer:
    DataExplorer:
      output_path: results
      input_path: data/BikeRentalDaily_train.csv
      input_type: csv
      input_delimiter: ;
      pipeline:
        descriptive_analysis: true
        missing_value_matrix_plot: true
        missing_value_bar_plot: true
        correlation_heatmap_plot: true
  data_preparation:
    DataPreparation:
      output_path: null
      input_path: data/BikeRentalDaily_train.csv
      input_type: csv
      input_delimiter: ;
      pipeline:
        _SimpleImputer:
        - target: hum
          config:
            strategy: mean
        _OneHotEncoder:
        - target: season
          config:
            prefix: season
            dtype: int
        - target: mnth
          config:
            prefix: month
            dtype: int
        _FeatureDropper:
        - target: instant
          config:
            axis: 1
            errors: ignore
  model_optimization:
    ModelOptimization:
      output_path: results
      evaluation:
        metric: rmse
        cv: 5
CUSTOM:
  data_explorer:
    DataExplorer:
      output_path: results
      input_path: data/BikeRentalDaily_test.csv
      input_type: csv
      input_delimiter: ;
      pipeline:
        descriptive_analysis: true
        missing_value_matrix_plot: true
        missing_value_bar_plot: true
        correlation_heatmap_plot: true
  data_preparation:
    DataPreparation:
      output_path: null
      input_path: data/BikeRentalDaily_test.csv
      input_type: csv
      input_delimiter: ;
      pipeline:
        _FeatureDropper:
        - target: instant
          config:
            axis: 1
            errors: ignore
        - target: hum
          config:
            axis: 1
            errors: ignore
        - target: windspeed
          config:
            axis: 1
            errors: ignore
  model_optimization:
    ModelOptimization:
      output_path: results
      evaluation:
        metric: mse
        cv: 10

```

In [20]:
config = _idstools.default.data_explorer.DataExplorer

In [21]:
pprint_dynaconf(config, notebook=True)

```yaml
output_path: results
input_path: data/BikeRentalDaily_train.csv
input_type: csv
input_delimiter: ;
pipeline:
  descriptive_analysis: true
  missing_value_matrix_plot: true
  missing_value_bar_plot: true
  correlation_heatmap_plot: true

```

In [27]:
pprint_dynaconf(_idstools.custom.data_explorer.DataExplorer, notebook=True)

```yaml
output_path: results
input_path: data/BikeRentalDaily_test.csv
input_type: csv
input_delimiter: ;
pipeline:
  descriptive_analysis: true
  missing_value_matrix_plot: true
  missing_value_bar_plot: true
  correlation_heatmap_plot: true

```

In [22]:
data_explorer_config = config

In [23]:
data_explorer_config.input_path = "/home/davidrmn/Studies/introduction-data-science/data/BikeRentalDaily_train.csv"

In [24]:
my_data_explorer = DataExplorer(**data_explorer_config)

2024-02-01 21:14:48,010 [data_explorer] [INFO] - Initializing DataExplorer
2024-02-01 21:14:48,012 [_helpers] [INFO] - Reading csv file:
/home/davidrmn/Studies/introduction-data-science/data/BikeRentalDaily_train.csv
2024-02-01 21:14:48,016 [data_explorer] [INFO] - Using output path: results
2024-02-01 21:14:48,017 [data_explorer] [INFO] - Using pipeline:
descriptive_analysis: true
missing_value_matrix_plot: true
missing_value_bar_plot: true
correlation_heatmap_plot: true



In [25]:
result = my_data_explorer.descriptive_analysis()

2024-02-01 21:14:48,024 [data_explorer] [INFO] - Head of BikeRentalDaily_train
                          0           1           2           3           4
instant                 154         685         368         472         442
dteday           03.06.2011  15.11.2012  03.01.2012  16.04.2012  17.03.2012
season                  2.0         4.0         1.0         2.0         1.0
yr                        0           1           1           1           1
mnth                      6          11           1           4           3
holiday                   0           0           0           1           0
weekday                   5           4           2           1          -1
workingday                1           1           1           0           0
weathersit                1           2           1           1           2
temp                   24.8       12.87         6.0       26.57       20.57
atemp                  0.59        0.32        0.13        0.61        0.51
hum      