In [688]:
import pandas as pd
import yaml
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from typing import List, Dict, Tuple, Set, Union, Any
# Apply seaborn's default theme once, so every plot drawn below shares it.
sb.set()

# Exploratory Data Analysis Functions

## load_parameters()
- input: path of the folder that contains the yaml file (parameters.yaml)
- returns: parameters for the EDA, loaded from the yaml file
- description: will load the following parameters (in dictionary format):
    - include_features
    - Clean up
    - Types of EDA
    - 
- remarks: the parameter file should contain the details listed above

In [None]:
def load_parameters(folder_path: str) -> Dict[str, Union[List, Dict[str, str]]]:
    """Load the EDA configuration stored in ``<folder_path>/parameters.yaml``.

    Args:
        folder_path: Directory that contains the ``parameters.yaml`` file.

    Returns:
        The parsed content of ``parameters.yaml``.

    Raises:
        FileNotFoundError: If ``parameters.yaml`` does not exist in
            ``folder_path``.
    """
    # The context manager closes the file handle as soon as parsing is done,
    # instead of leaving it open until garbage collection.
    with open(f"{folder_path}/parameters.yaml") as parameters_file:
        parameters: Dict[str, Union[List, Dict[str, str]]] = yaml.safe_load(parameters_file)

    return parameters

## load_dataset()
- input: parameters
- returns: dataframe with datatypes formatted 
- description: Function will do the following tasks:
    - load the dataset (in this case xxxxxxx.csv)
    - load the yaml to format the features to the appropriate datatypes (in this case data_types.yaml)
- remarks: pandas dataframe

In [None]:
def load_dataset(parameters: Dict[str, Any]) -> pd.DataFrame:
    """Load the configured CSV datasets into one dataframe and cast its columns.

    Args:
        parameters: Configuration dictionary. The keys read here are
            ``'dataset_location'`` (folder holding the CSV files and
            ``data_types.yaml``) and ``'datasets_to_load'`` (dataset names,
            without the ``.csv`` extension).

    Returns:
        The concatenation of all loaded datasets, with every feature listed
        under ``data_types`` in ``data_types.yaml`` cast to its configured
        dtype. Each dataset keeps its own row labels, so index values can
        repeat when more than one dataset is loaded.

    Raises:
        FileNotFoundError: If a CSV file or ``data_types.yaml`` is missing.
        KeyError: If a required key is absent from ``parameters`` or from
            ``data_types.yaml``, or a configured feature is not a column of
            the loaded data.
    """
    folder_path: str = parameters['dataset_location']

    # Support multi dataset and load into one dataframe
    # NOTE: dataset must have the same features. Have not tried dataset with different features
    def combined_datasets(datasets: List[str]) -> pd.DataFrame:
        load_datasets: List[pd.DataFrame] = []

        for dataset in datasets:
            print(f"dataset to load: {dataset}")
            load_datasets.append(pd.read_csv(f"{folder_path}/{dataset}.csv"))

        print(f"load_dataset size: {len(load_datasets)}")

        return pd.concat(load_datasets)

    # Assign the data types preset in data_types.yaml (modifies `dataset` in place)
    def reassign_features_dataypes(dataset: pd.DataFrame) -> None:
        # Close the file handle deterministically once the YAML is parsed.
        with open(f"{folder_path}/data_types.yaml") as data_types_file:
            config: Dict[str, Dict[str, str]] = yaml.safe_load(data_types_file)

        for feature, datatype in config['data_types'].items():
            dataset[feature] = dataset[feature].astype(datatype)

    dataset = combined_datasets(parameters['datasets_to_load'])
    reassign_features_dataypes(dataset)

    # Debug aid to confirm the features ended up with the right data type:
    # for column in dataset.columns:
    #     print(f"{column}: {dataset[column].dtype}")

    return dataset

## clean_up_dataset()
- input: 
    - Dataframe
    - parameters 
- returns: Dataframe
- description: Function does the following to the dataset
    - clean up 
    - drop row that contains NULL/NAN values
    - extract the interested features
- remarks: the parameters input is the dictionary loaded from the parameters.yaml file, which contains all the configuration required for the clean up

In [689]:
def clean_up_dataset(
        dataframe: pd.DataFrame, 
        parameters: Dict[str, Union[List, Dict[str, str]]]
    ) -> pd.DataFrame:
    """Filter, de-null and narrow a dataframe according to ``parameters``.

    Steps, in order:
        1. For every ``feature: constraint`` pair under ``parameters['clean_up']``,
           keep only the rows whose ``feature`` value is strictly greater than
           ``constraint``.
        2. Drop every row that still contains a NULL/NaN value in any column.
        3. Keep only the columns listed in ``parameters['include_features']``.

    Args:
        dataframe: Dataset to clean; it is not modified.
        parameters: Configuration dictionary providing the ``'clean_up'`` and
            ``'include_features'`` entries.

    Returns:
        A new dataframe holding the selected features, re-indexed from 0.
    """
    # Step 1: apply each lower-bound constraint in turn.
    lower_bounds: Dict[str, int] = parameters['clean_up']
    remaining_rows = dataframe
    for feature_name, lower_bound in lower_bounds.items():
        satisfies_bound = remaining_rows[feature_name] > lower_bound
        remaining_rows = remaining_rows[satisfies_bound]

    # Step 2: only pay for dropna() when there is something to drop.
    contains_missing = remaining_rows.isnull().values.any()
    if contains_missing:
        remaining_rows = remaining_rows.dropna()

    # NOTE: duplicated rows/ids are not handled here.

    # Step 3: extract the features of interest into their own dataframe.
    wanted_features: List[str] = parameters['include_features']
    selected_features = pd.DataFrame(remaining_rows[wanted_features])
    selected_features = selected_features.reset_index(drop=True)

    return selected_features

## get_outlier_samples()
- inputs: Dataframe
- returns: Series of outliers based on supplied dataframe
- description: identify the outliers based on the supplied dataframe
- remarks: None

In [691]:
def get_outlier_samples(dataframe: pd.DataFrame) -> pd.core.series.Series:
    """Flag the values lying outside the 1.5 * IQR whiskers.

    A value is flagged when it is strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, where the quartiles are computed
    from ``dataframe`` itself.

    Args:
        dataframe: Numerical data to inspect.

    Returns:
        Boolean mask with the same layout as ``dataframe``; ``True`` marks
        an outlier.
    """
    first_quartile = dataframe.quantile(0.25)
    third_quartile = dataframe.quantile(0.75)
    whisker_length = 1.5 * (third_quartile - first_quartile)

    below_lower_whisker = dataframe < (first_quartile - whisker_length)
    above_upper_whisker = dataframe > (third_quartile + whisker_length)

    return below_lower_whisker | above_upper_whisker

## print_feature_outliers()
- inputs: Dataframe
- returns: None
- description: prints number of outliers for every numerical features/column
- remarks: None

In [692]:
def print_feature_outliers(dataframe: pd.DataFrame) -> None:
    """Print the number of outliers found in each numerical feature.

    Only ``int64`` and ``float64`` columns are inspected; outliers are
    identified per column with ``get_outlier_samples``.

    Args:
        dataframe: Dataset whose numerical columns are reported.
    """
    numeric_features = dataframe.select_dtypes(include=['int64', 'float64'])

    for feature_name in numeric_features.columns:
        outlier_flags = get_outlier_samples(numeric_features[feature_name])
        # Summing a boolean mask counts its True entries.
        outlier_count = outlier_flags.sum()
        print(f"[{feature_name}] total outliers: {outlier_count}")

## remove_outliers()
- inputs: Dataframe
- returns: Dataframe with outliers removed for every numerical feature/column 
- description: Function removes the **UNION** of the outliers of the dataset. In other words, it removes every row that contains an outlier in at least one numerical feature
- remarks: None

In [693]:
def remove_outliers(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Drop every row that holds an outlier in at least one numerical feature.

    Outliers are identified with ``get_outlier_samples`` on the ``int64`` and
    ``float64`` columns; a row is removed as soon as any of those columns
    flags it (the "union" of the per-feature outliers).

    Args:
        dataframe: Dataset to filter; it is not modified.

    Returns:
        A new dataframe without the outlier rows, re-indexed from 0. All
        original columns (numerical or not) are kept.
    """
    numeric_features = dataframe.select_dtypes(include=['int64', 'float64'])

    outlier_flags = get_outlier_samples(numeric_features)
    row_has_outlier = outlier_flags.any(axis=1)
    rows_to_keep = ~row_has_outlier

    return dataframe[rows_to_keep].reset_index(drop=True)

## generate_numerical_eda_visualization()
- inputs: 
    - dataframe
    - List of categorical features/columns which are numerical types
    - plot title
- returns: None
- description: generates box, histo and violin plot for every numerical features (column) of the dataset
- remarks: None

In [None]:
def generate_numerical_eda_visualization(
        dataframe: pd.DataFrame,
        plot_title: str
    ) -> None:
    """Draw a box, histogram and violin plot for every numerical feature.

    One row of three subplots is created per ``int64``/``float64`` column:
    box plot on the left, histogram in the middle, violin plot on the right.

    Args:
        dataframe: Dataset whose numerical columns are plotted.
        plot_title: Title shown above the middle subplot of the first row.
    """
    numerical_dataframe = dataframe.select_dtypes(include=['int64', 'float64'])
    total_features = len(numerical_dataframe.columns)

    # A grid with zero rows cannot be created, so there is nothing to draw.
    if total_features == 0:
        print("No numerical features to plot")
        return

    # squeeze=False keeps `axes` two-dimensional even for a single feature;
    # otherwise a 1x3 grid comes back as a 1-D array and axes[0, 1] fails.
    _, axes = plt.subplots(
        total_features,
        3,
        figsize=(24, 4.8 * total_features),
        squeeze=False
    )

    # figure.suptitle(plot_title, fontsize=20)
    axes[0, 1].set_title(plot_title, fontsize=25)

    # Each row of the grid is paired with one numerical feature.
    for row_axes, column in zip(axes, numerical_dataframe.columns):
        feature_values = numerical_dataframe[column]
        sb.boxplot(data=feature_values, orient='h', ax=row_axes[0])
        sb.histplot(data=feature_values, ax=row_axes[1])
        sb.violinplot(data=feature_values, orient='h', ax=row_axes[2])

## generate_numerical_heatmap()
- inputs: 
    - dataframe
    - plot title
- returns: None
- description: generates a correlation heatmap across the numerical features (columns) of the dataset
- remarks: None

In [696]:
def generate_numerical_heatmap(
        dataframe: pd.DataFrame,
        plot_title: str
    ) -> None:
    """Show an annotated correlation heatmap of the numerical features.

    The pairwise correlation of the ``int64``/``float64`` columns is
    computed with ``DataFrame.corr()`` and drawn on a 13x13 figure with the
    colour scale fixed to the [-1, 1] range.

    Args:
        dataframe: Dataset whose numerical columns are correlated.
        plot_title: Title displayed above the heatmap.
    """
    numeric_features = dataframe.select_dtypes(include=['int64', 'float64'])

    # Rendering options gathered in one place for readability.
    heatmap_options = {
        "vmin": -1,
        "vmax": 1,
        "linewidths": 1,
        "annot": True,
        "fmt": ".2f",
        "annot_kws": {"size": 18},
        "cmap": "RdBu",
    }

    plt.figure(figsize=(13, 13))
    plt.title(plot_title, fontsize=20)
    sb.heatmap(numeric_features.corr(), **heatmap_options)
    plt.show()