In [1]:
import pandas as pd
from pandas.core.groupby.generic import DataFrameGroupBy
import yaml
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from dython import nominal
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import train_test_split
# import datetime
from datetime import date

from typing import List, Dict, Tuple, Union, Any
sb.set()

# Exploratory Data Analysis Functions

## load_parameters()
- input: path of yaml file
- returns: the parameters parsed from the yaml file (as a dictionary)
- description: will load the following parameters (in dictionary format):
    - include_features
    - Clean up
    - Types of EDA
    - 
- remarks: parameter file should contain the following details

In [2]:
def load_parameters(folder_path: str) -> Dict[str, Union[List, Dict[str, str]]]:
    """Load the notebook configuration from ``<folder_path>/parameters.yaml``.

    Args:
        folder_path: Directory that contains ``parameters.yaml``.

    Returns:
        The parsed YAML document.

    Raises:
        FileNotFoundError: If ``parameters.yaml`` does not exist in ``folder_path``.
    """
    # A context manager guarantees the handle is closed even if parsing fails;
    # the previous bare ``open(...)`` left it to the garbage collector.
    with open(f"{folder_path}/parameters.yaml") as parameters_file:
        parameters: Dict[str, Union[List, Dict[str, str]]] = yaml.safe_load(parameters_file)

    return parameters

## load_dataset()
- input: parameters
- returns: dataframe with datatypes formatted 
- description: Function will do the following tasks:
    - load the dataset (in this case xxxxxxx.csv)
    - load the yaml to format the features to the appropriate datatypes (in this case data_types.yaml)
- remarks: 
    - Features that are not mentioned in data_types.yaml keep their default datatypes
    - **Have not tested datasets of different features**

In [3]:
def load_dataset(parameters: Dict[str, Any]) -> pd.DataFrame:
    """Load one or more CSV files into a single dataframe and apply configured dtypes.

    Args:
        parameters: Configuration dictionary. The keys read here are
            ``'dataset_location'`` (folder holding the CSV files and
            ``data_types.yaml``) and ``'datasets_to_load'`` (CSV file names
            without the ``.csv`` extension).

    Returns:
        The row-wise concatenation of all listed CSV files (with a fresh
        integer index), where every column named under ``data_types`` in
        ``data_types.yaml`` has been cast to the configured dtype. Columns not
        listed there keep the dtype inferred by ``pd.read_csv``.
    """

    folder_path: str = parameters['dataset_location']

    # Support multi dataset and load into one dataframe
    def combined_datasets(datasets: List[str]) -> pd.DataFrame:
        load_datasets: List[pd.DataFrame] = [
            pd.read_csv(f"{folder_path}/{dataset}.csv") for dataset in datasets
        ]

        return pd.concat(load_datasets, ignore_index=True)

    # assign the data types preset in the data_types.yaml
    def reassign_features_dataypes(dataset: pd.DataFrame) -> None:
        # Close the file deterministically instead of leaking the handle.
        with open(f"{folder_path}/data_types.yaml") as data_types_file:
            config: Dict[str, Any] = yaml.safe_load(data_types_file)
        config_datatypes: Dict[str, str] = config['data_types']

        for dataset_feature in dataset.columns.to_list():
            # Features without a configured dtype are left untouched.
            if dataset_feature not in config_datatypes:
                continue
            dataset[dataset_feature] = dataset[dataset_feature].astype(config_datatypes[dataset_feature])

    dataset = combined_datasets(parameters['datasets_to_load'])
    reassign_features_dataypes(dataset)

    return dataset

## clean_up_dataset()
- input: 
    - Dataframe
    - parameters 
- returns: Dataframe
- description: Function does the following to the dataset
    - clean up 
    - drop row that contains NULL/NAN values
    - extract the interested features
- remarks: parameter input is dictionary data that loads from parameters.yaml file which contains all the configuration required for the clean up

In [4]:
def clean_up_dataset(
        dataframe: pd.DataFrame, 
        parameters: Dict[str, Any]
    ) -> pd.DataFrame:
    """Extract the configured features and derive the configured extra columns.

    Args:
        dataframe: Source dataset. It is not modified.
        parameters: Configuration dictionary. The keys read here are:
            ``'include_features'`` (columns to keep),
            ``'morph_feature_type'`` (column name -> dtype; the first run of
            digits in each string value is extracted and cast to that dtype),
            ``'custom_numerical_features'`` (list of single-key dictionaries
            ``{new_name: {'derived_features', 'action_type', 'data_type'}}``).

    Returns:
        A new dataframe with the selected columns, the morphed columns and the
        derived columns, re-indexed from 0.

    Raises:
        ValueError: If a derived feature combines more than one source column
            with an ``action_type`` other than ``'division'``.
    """

    def morph_feature_type(dataset: pd.DataFrame) -> None:
        morph_feature_configs: Dict[str, str] = parameters['morph_feature_type']

        for morph_feature, target_type in morph_feature_configs.items():
            # Raw string: '\d' in a normal string literal is an invalid escape
            # sequence. expand=False yields a Series rather than a 1-column frame.
            dataset[morph_feature] = (
                dataset[morph_feature]
                .str.extract(r'(\d+)', expand=False)
                .astype(target_type)
            )

    def add_numerical_features(dataset: pd.DataFrame) -> None:
        custom_numerical_features: List[Dict[str, Any]] = parameters['custom_numerical_features']

        for new_feature in custom_numerical_features:
            # Each entry is a single-key mapping: {new_feature_name: settings}
            new_feature_name: str = next(iter(new_feature))
            feature_settings: Dict[str, Any] = new_feature[new_feature_name]
            derived_features: List[str] = feature_settings['derived_features']
            action_type: str = feature_settings['action_type']
            data_type: str = feature_settings['data_type']

            # Placeholder column; it is overwritten as soon as there is at
            # least one derived feature.
            # NOTE(review): 'data_type' only affects this placeholder, the
            # computed column keeps whatever dtype the arithmetic produces.
            dataset[new_feature_name] = pd.Series(dtype=data_type)

            for index, feature in enumerate(derived_features):
                print(f"Add new column action= {index}:{feature}")
                if index == 0:
                    # The first derived feature seeds the new column.
                    dataset[new_feature_name] = dataset[feature]

                elif action_type == 'division':
                    # Floor division by every subsequent derived feature.
                    dataset[new_feature_name] = dataset[new_feature_name]//dataset[feature]

                else:
                    # Previously an unknown action was skipped silently, which
                    # left the new column as a plain copy of the first feature.
                    raise ValueError(
                        f"Unsupported action_type '{action_type}' for custom feature '{new_feature_name}'"
                    )

    ## EXTRACT FEATURE RELATED
    # Work on an explicit copy of the interested features (set in the
    # parameters) so the caller's dataframe is never touched and pandas does
    # not warn about writing to a slice.
    include_features_config: List[str] = parameters['include_features']
    extracted_dataset: pd.DataFrame = dataframe[include_features_config].copy()

    morph_feature_type(extracted_dataset)
    add_numerical_features(extracted_dataset)

    return extracted_dataset.reset_index(drop=True)

## get_outlier_samples()
- inputs: Dataframe
- returns: Series of outliers based on supplied dataframe
- description: identify the outliers based on the supplied dataframe
- remarks: None

In [5]:
def get_outlier_samples(dataframe: pd.DataFrame) -> pd.core.series.Series:
    """Flag the values that fall outside the 1.5 * IQR whiskers.

    Args:
        dataframe: Numerical data (a single column or several columns).

    Returns:
        A boolean mask with the same shape as the input; ``True`` marks a
        value below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """
    first_quartile = dataframe.quantile(0.25)
    third_quartile = dataframe.quantile(0.75)

    # Distance the whiskers extend beyond the quartiles.
    whisker_reach = 1.5*(third_quartile-first_quartile)

    below_lower_whisker = dataframe < (first_quartile-whisker_reach)
    above_upper_whisker = dataframe > (third_quartile+whisker_reach)

    return below_lower_whisker | above_upper_whisker

## print_feature_outliers()
- inputs: Dataframe
- returns: None
- description: prints number of outliers for every numerical features/column
- remarks: None

In [6]:
def print_feature_outliers(dataframe: pd.DataFrame) -> None:
    """Print the number of outliers found in every int64/float64 column.

    Args:
        dataframe: Dataset to inspect; columns of other dtypes are ignored.
    """
    numeric_columns = dataframe.select_dtypes(include=['int64', 'float64'])

    for column in numeric_columns.columns:
        # Summing the boolean mask counts the flagged samples.
        outlier_mask = get_outlier_samples(numeric_columns[column])
        outlier_count = outlier_mask.sum()
        print(f"[{column}] total outliers: {outlier_count}")

## remove_outliers()
- inputs: Dataframe
- returns: Dataframe with outliers removed for every numerical feature/column 
- description: Function remove **UNION** outlier of the dataset. In other words remove the entire row containing outliers
- remarks: None

In [7]:
def remove_outliers(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Drop every row that holds an outlier in any int64/float64 column.

    Args:
        dataframe: Dataset to filter.

    Returns:
        A dataframe without the rows flagged by ``get_outlier_samples`` in at
        least one numerical column (the UNION of per-column outliers),
        re-indexed from 0.
    """
    numeric_part = dataframe.select_dtypes(include=['int64', 'float64'])

    # A row is discarded as soon as one of its numerical values is an outlier.
    row_has_outlier = get_outlier_samples(numeric_part).any(axis=1)
    rows_to_keep = ~row_has_outlier

    return dataframe.loc[rows_to_keep].reset_index(drop=True)

## generate_numerical_eda_visualization()
- inputs: 
    - dataframe
    - List of catergorical features/columns which are numerical types
    - plot title
- returns: None
- description: generates box, histo and violin plot for every numerical features (column) of the dataset
- remarks: None

In [8]:
def generate_numerical_eda_visualization(
        dataframe: pd.DataFrame,
        plot_title: str
    ) -> None:
    """Draw a box plot, histogram and violin plot for every int64/float64 column.

    One row of three axes is created per numerical column. ``plot_title`` is
    placed above the middle axes of the first row.

    Args:
        dataframe: Dataset to visualise; columns of other dtypes are ignored.
        plot_title: Title shown at the top of the figure.
    """

    numerical_dataframe = dataframe.select_dtypes(include=['int64', 'float64'])
    total_features = len(numerical_dataframe.columns)

    # plt.subplots() rejects a grid with zero rows, so there is nothing to draw.
    if total_features == 0:
        return

    # squeeze=False keeps `axes` two-dimensional even for a single feature;
    # otherwise a 1x3 grid comes back as a 1-D array and axes[0, 1] fails.
    figure, axes = plt.subplots(
        total_features, 
        3, 
        figsize=(24,4.8*total_features),
        squeeze=False
    )

    axes[0, 1].set_title(plot_title, fontsize=25)

    for row, column in enumerate(numerical_dataframe.columns):
        sb.boxplot(data=numerical_dataframe[column], orient='h', ax=axes[row,0])
        sb.histplot(data=numerical_dataframe[column], ax=axes[row,1])
        sb.violinplot(data=numerical_dataframe[column], orient='h', ax=axes[row,2])

## generate_numerical_heatmap() - MAY NEED TO REMOVE
- inputs: 
    - dataframe
    - plot title
- returns: None
- description: generates heatmap for every numerical features (column) of the dataset
- remarks: None

In [9]:
def generate_numerical_heatmap(
        dataframe: pd.DataFrame,
        plot_title: str
    ) -> None:
    """Show an annotated correlation heatmap of the int64/float64 columns.

    Args:
        dataframe: Dataset to visualise; columns of other dtypes are ignored.
        plot_title: Title of the heatmap.
    """
    # Pairwise correlation of the numerical features only.
    correlation_matrix = dataframe.select_dtypes(include=['int64', 'float64']).corr()

    heatmap_options = dict(
        vmin=-1,
        vmax=1,
        linewidths=1,
        annot=True,
        fmt=".2f",
        annot_kws={"size": 18},
        cmap="RdBu",
    )

    plt.figure(figsize=(13, 13))
    plt.title(plot_title, fontsize=20)
    sb.heatmap(correlation_matrix, **heatmap_options)
    plt.show()

## generate_numerical_categorical_heatmap()
- inputs: dataframe
- returns: None
- description: generates a heatmap for the numerical and categorical features (columns) of the dataset.
 The heatmap is produced by calculating the correlation/strength-of-association of the features in a dataset with both categorical and continuous features using: 
    - Pearson's R for continuous-continuous cases 
    - Correlation Ratio for categorical-continuous cases 
    - Cramer's V or Theil's U for categorical-categorical cases
- remarks: More info on the library checkout [dython](http://shakedzy.xyz/dython/modules/nominal/)

In [10]:
def generate_numerical_categorical_heatmap(
    dataframe: pd.DataFrame
) -> None:
    """Plot the dython association matrix of all non-datetime columns.

    Args:
        dataframe: Dataset to visualise; ``datetime64[ns]`` columns are
            excluded before the associations are computed.
    """
    features_without_datetime = dataframe.select_dtypes(exclude=['datetime64[ns]'])

    nominal.associations(
        dataset=features_without_datetime,
        figsize=(15, 13),
        title="Correlation/Strength-of-association of features",
    )

## generate_categorical_count_visualization()
- input: 
    - Dataframe
    - parameters 
- returns: None
- description: generates categorical plot for selected catergorical features (column) of the dataset
- remarks: parameter input is dictionary data that loads from parameters.yaml file which contains all the configuration required for the catergorical count visualization

In [11]:
def generate_categorical_count_visualization(
        dataframe: pd.DataFrame,
        visualization_parameters: List[str]
    ) -> None:
    """Draw one horizontal count plot per selected categorical column.

    Args:
        dataframe: Dataset to visualise.
        visualization_parameters: Names of the columns to plot.
    """
    selected_features = pd.DataFrame( dataframe[visualization_parameters] )

    for column in selected_features.columns:
        # The plot height grows with the number of entries reported by value_counts().
        plot_height = len(dataframe[column].value_counts())
        count_grid = sb.catplot(y=column, data=selected_features, kind="count", height=plot_height)

        # Write the count next to every bar.
        for axis in count_grid.axes.ravel():
            for bar_container in axis.containers:
                axis.bar_label(bar_container, label_type='edge')

        plt.title(f"Total count for all types available in '{column}' feature", fontsize=20)

## generate_numerical_vs_categorical_eda_visualization()
- inputs: 
    - dataframe
    - parameters
    - plot title
- returns: None
- description: generates categorical plot for every catergorical features (column) of the dataset
- remarks: parameter input is dictionary data that loads from parameters.yaml file which contains all the configuration required to generate eda visualization

In [12]:
def generate_numerical_vs_categorical_eda_visualization(
        dataframe: pd.DataFrame,
        visualization_parameters: Dict[str, str],
        plot_title: str
    ) -> None:
    """Draw one box plot per (numerical feature, categorical feature) pair.

    The categories of every plot are ordered by the median of the numerical
    feature, from lowest to highest.

    Args:
        dataframe: Dataset to visualise.
        visualization_parameters: Mapping of numerical feature name (y axis)
            to categorical feature name (x axis).
        plot_title: Accepted for interface compatibility; not used by this
            function.
    """

    total_features = len(visualization_parameters)

    # plt.subplots() rejects a grid with zero rows, so there is nothing to draw.
    if total_features == 0:
        return

    # squeeze=False keeps `axes` a 2-D array; with a single pair a bare Axes
    # object used to be returned and indexing it failed.
    figure, axes = plt.subplots(
        total_features, 
        1, 
        figsize=(20,10*total_features),
        constrained_layout=True,
        squeeze=False
    )
    # NOTE(review): the figure is created with constrained_layout=True and
    # tight_layout() is then called on it; the two are different layout
    # mechanisms - confirm which one is intended.
    figure.tight_layout(pad=10.0)

    for row, (numerical_feature, categorical_feature) in enumerate(visualization_parameters.items()):
        axis = axes[row, 0]
        axis.set_title(
            f"{categorical_feature} boxplot based on {numerical_feature}", 
            fontdict={'fontsize': 25, 'fontweight': 'medium'}
        )

        # Categories sorted by ascending median of the numerical feature.
        category_order = dataframe.groupby(categorical_feature)[numerical_feature].median().sort_values().index

        sb_plot = sb.boxplot(
            y=numerical_feature, 
            x=categorical_feature, 
            data=dataframe, 
            order=category_order,
            ax=axis
        )
        sb_plot.set_xticklabels(sb_plot.get_xticklabels(), rotation=40, ha='right')

## generate_group_dataframe() [NOTE: PRIVATE FUNCTION]
- inputs: 
    - dataframe
    - observe_feature: numerical only
    - groupby_features: categorical only
    - observation_types: supported statistical types are ['sum', 'mean', 'median', 'min', 'max']
- returns: grouped dataframe
- description: generates grouped dataframe which can easily use pandas groupby method later on
- remarks: currently does not support multi groupby features

In [13]:
def generate_group_dataframe(
        dataframe: pd.DataFrame,
        observe_feature: str,
        groupby_features: List[str],
        observation_types: Dict[str, str]
    ) -> pd.DataFrame:
    """Aggregate ``observe_feature`` per group into a flat dataframe.

    Args:
        dataframe: Dataset to aggregate.
        observe_feature: Column whose values are aggregated.
        groupby_features: Columns that define the groups.
        observation_types: Mapping whose keys are the aggregations to compute
            (for example ``'mean'`` or ``'max'``); the values are not used here.

    Returns:
        One row per group with the grouping columns first, followed by one
        column per aggregation, indexed from 0.
    """
    aggregations: List[str] = list(observation_types.keys())

    # reset_index() turns the group keys into ordinary columns regardless of
    # the pandas version; as_index=False has not always been honoured when a
    # list of aggregations is applied to a single selected column.
    grouped_dataframe: pd.DataFrame = (
        dataframe
        .groupby(groupby_features)[observe_feature]
        .aggregate(aggregations)
        .reset_index()
    )

    return grouped_dataframe

## parse_time_series_parameters() [NOTE: PRIVATE FUNCTION]
- inputs: 
    - parameters
- returns: Tuple
    - observe_feature
    - time_feature
    - groupby_features
    - observation_types
    - visualization_features
    - grouping_features
- description: parse the time series parameters
- remarks: None

In [14]:
def parse_time_series_parameters(
        parameters: Dict[str, Any]
    ) -> Tuple[str, str, List[str], Dict[str, str], List[str], List[str]]:
    """Unpack one time-series configuration entry.

    Args:
        parameters: Mapping with the keys ``'observe_feature'``,
            ``'time_feature'``, ``'groupby_features'`` and
            ``'observation_types'``.

    Returns:
        A tuple ``(observe_feature, time_feature, groupby_features,
        observation_types, visualization_features, grouping_features)`` where
        ``visualization_features`` is the time feature, the observed feature
        and the group-by features (in that order) and ``grouping_features`` is
        the time feature followed by the group-by features.
    """
    # Values taken straight from the configuration entry.
    observe_feature, time_feature, groupby_features, observation_types = (
        parameters[key]
        for key in ('observe_feature', 'time_feature', 'groupby_features', 'observation_types')
    )

    # Column lists derived for the downstream helpers.
    visualization_features: List[str] = [time_feature, observe_feature] + groupby_features
    grouping_features: List[str] = [time_feature] + groupby_features

    return (
        observe_feature,
        time_feature,
        groupby_features,
        observation_types,
        visualization_features,
        grouping_features,
    )

## generate_categorical_group_time_series_visualization()
- inputs: 
    - dataframe
    - parameters
- returns: None
- description: generate time series graph based on **EVERY** group of a categorical feature
- remarks: currently does not support multi groupby features

In [15]:
def generate_categorical_group_time_series_visualization(dataframe: pd.DataFrame, parameters: List[Dict[str, Any]]) -> None:
    """Draw one time-series panel per group of a categorical feature.

    For every configuration entry the observed feature is aggregated per
    (time, group) and each group gets its own axes holding one line per
    configured observation type.

    Args:
        dataframe: Dataset to visualise.
        parameters: Configuration entries as understood by
            ``parse_time_series_parameters``; ``'observation_types'`` maps an
            aggregation name to the colour of its line.
    """

    def generate_group_time_series_graph(
            dataframe: pd.DataFrame,
            time_feature: str,
            observe_feature: str,
            groupby_features: List[str],
            observation_types: Dict[str, str]
        ) -> None:

        # Grouping by a scalar key (instead of a one-element list) keeps the
        # group names plain values rather than 1-tuples; iterating the groupby
        # also avoids get_group(), whose handling of one-element list keys
        # differs between pandas versions.
        group_key = groupby_features[0] if len(groupby_features) == 1 else groupby_features
        groups = list(dataframe.groupby(group_key))

        # plt.subplots() rejects a grid with zero rows.
        if not groups:
            return

        group_size: int = len(groups)

        # squeeze=False keeps `axes` a 2-D array even when there is one group.
        figure, axes = plt.subplots(
            group_size, 1, figsize = (20, 8*group_size), constrained_layout=True, squeeze=False
        )

        for row, (group_name, group_dataframe) in enumerate(groups):
            axis = axes[row, 0]

            axis.set_title(
                f"{group_name} {observe_feature} trend", 
                fontdict={'fontsize': 25, 'fontweight': 'medium'}
            )

            # One line per aggregated statistic, drawn in its configured colour.
            for observation_type, observation_type_color in observation_types.items():
                sb.lineplot( 
                    x = time_feature, 
                    y = observation_type, 
                    color = observation_type_color, 
                    data = group_dataframe, 
                    ax = axis,
                    label =f'{observation_type} value'
                )

            # Set after plotting so the y label names the observed feature and
            # not the last aggregation column that was drawn.
            axis.set_xlabel(time_feature)
            axis.set_ylabel(observe_feature)
            axis.legend(loc='best')


    for parameter in parameters:

        (observe_feature, time_feature, groupby_features, observation_types, visualization_features, grouping_features) = parse_time_series_parameters(parameter)

        visualization_dataframe: pd.DataFrame = dataframe[visualization_features].copy()

        grouped_visualization_dataframe = generate_group_dataframe(visualization_dataframe, observe_feature, grouping_features, observation_types)
        generate_group_time_series_graph(grouped_visualization_dataframe, time_feature, observe_feature, groupby_features, observation_types)


## generate_categorical_time_series_visualization()
- inputs: 
    - dataframe
    - parameters
- returns: None
- description: generate time series graph based on **ALL** group of a categorical feature
- remarks: currently does not support multi groupby features

In [16]:
def generate_categorical_time_series_visualization(dataframe: pd.DataFrame, parameters: List[Dict[str, Any]]) -> None:
    """Draw the trend of every group of a categorical feature on one shared axes.

    For every configuration entry the observed feature is aggregated per
    (time, group) and one line per group is drawn; each line is also labelled
    at its last data point.

    Args:
        dataframe: Dataset to visualise.
        parameters: Configuration entries as understood by
            ``parse_time_series_parameters``.
    """

    def generate_categorical_time_series_graph(
            dataframe: pd.DataFrame,
            time_feature: str,
            observe_feature: str,
            groupby_features: List[str],
            observation_type: str = 'mean'
        ) -> None:

        # Grouping by a scalar key (instead of a one-element list) keeps the
        # group names plain values rather than 1-tuples; iterating the groupby
        # also avoids get_group(), whose handling of one-element list keys
        # differs between pandas versions.
        group_key = groupby_features[0] if len(groupby_features) == 1 else groupby_features

        figure, axes = plt.subplots(1, 1, figsize = (30, 20), constrained_layout=True)

        axes.set_title(
            f"Overall {observe_feature} trend", 
            fontdict={'fontsize': 25, 'fontweight': 'medium'}
        )

        for group_name, group_dataframe in dataframe.groupby(group_key):

            sb.lineplot( 
                x = time_feature, 
                y = observation_type,
                data = group_dataframe, 
                ax = axes,
                label =f'{group_name}'
            )

            # Annotate the end of the line; Axes.text expects scalar
            # coordinates, so take the last value of each column instead of
            # passing one-element Series.
            axes.text(
                x=group_dataframe[time_feature].iloc[-1],
                y=group_dataframe[observation_type].iloc[-1],
                s=str(group_name),
                va="center"
            )

        # Set after plotting so the y label names the observed feature and not
        # the aggregation column.
        axes.set_xlabel(time_feature)
        axes.set_ylabel(observe_feature)
        axes.legend(loc='best')


    for parameter in parameters:

        (observe_feature, time_feature, groupby_features, observation_types, visualization_features, grouping_features) = parse_time_series_parameters(parameter)

        visualization_dataframe: pd.DataFrame = dataframe[visualization_features].copy()

        grouped_visualization_dataframe = generate_group_dataframe(visualization_dataframe, observe_feature, grouping_features, observation_types)

        # The plot shows a single statistic: 'mean' when it was aggregated,
        # otherwise the first configured observation type. Previously 'mean'
        # was assumed unconditionally and failed when it was not configured.
        observation_type: str = 'mean' if 'mean' in observation_types else next(iter(observation_types))

        generate_categorical_time_series_graph(grouped_visualization_dataframe, time_feature, observe_feature, groupby_features, observation_type)

## generate_time_series_linear_regression()
NOTE: 
- Need to find out if possible to generate linear_regression with catergorical variables
- The reason for this is because there is correlation between
    - resale_price vs [flat_type, flat_model]
    - remaining_lease vs [town, flat_model]
    - floor_area_sqm vs flat_type
- Need to accept list of parameters to generate different types of linear_regression lines for different response and predictors

links:
    - https://stackoverflow.com/questions/34007308/linear-regression-analysis-with-string-categorical-features-variables
    - https://investigate.ai/classification/scikit-learn-and-categorical-features/

- inputs: 
    - dataframe
    - parameters
- returns: None
- description: 
- remarks: currently does not support multi groupby features

In [17]:
def generate_time_series_linear_regression(dataframe: pd.DataFrame, parameters: List[Dict[str, str]]) -> None:
    """Fit and plot one linear trend line per group of a categorical feature.

    For every configuration entry the response is aggregated per
    (predictor, group). The predictor (a date-like column) is converted to its
    proleptic Gregorian ordinal so it can be used as a numeric regressor, a
    ``LinearRegression`` is fitted and printed per group, and seaborn draws the
    regression line of every group on one shared axes.

    Args:
        dataframe: Dataset to analyse.
        parameters: Configuration entries with the keys ``'response'``
            (column to explain), ``'predictors'`` (name of the date-like
            predictor column), ``'groupby_features'`` (list of grouping
            columns) and ``'observation_type'`` (aggregation applied to the
            response, e.g. ``'mean'``).
    """
    # Function-scope import: only this plot needs a tick formatter.
    from matplotlib.ticker import FuncFormatter

    def format_ordinal_tick(tick_value: float, tick_position: int) -> str:
        # Ticks are ordinals; show them as ISO dates. Values outside the range
        # supported by datetime.date are left unlabelled.
        ordinal = int(tick_value)
        if ordinal < 1 or ordinal > date.max.toordinal():
            return ''
        return date.fromordinal(ordinal).isoformat()

    def generate_linear_regression_model(
            group_dataframe: pd.DataFrame,
            group_name: str,
            predictor_feature: str,
            observation_type: str = 'mean'
        ) -> None:

        # Missing responses are treated as 0 before fitting (non in-place, so
        # the caller's frame is left untouched).
        # NOTE(review): the line drawn by sb.regplot is fitted separately by
        # seaborn, so it can differ from this model if the response has gaps.
        response_dataframe = group_dataframe[[observation_type]].fillna(0)
        predictor_dataframe = group_dataframe[[predictor_feature]]

        linear_regression = LinearRegression()
        linear_regression.fit(predictor_dataframe, response_dataframe)

        # Coefficients of the Linear Regression line
        print(f"Linear regression model for '{group_name}':")
        print('Intercept of Regression \t: b = ', linear_regression.intercept_)
        print('Coefficients of Regression \t: a = ', linear_regression.coef_)
        print()

    def generate_linear_regression_graph(
        dataframe: pd.DataFrame,
        response_feature: str,
        predictor_feature: str,
        groupby_features: List[str],
        observation_type: str = 'mean'
    ) -> None:

        # Grouping by a scalar key (instead of a one-element list) keeps the
        # group names plain values rather than 1-tuples; iterating the groupby
        # also avoids get_group(), whose handling of one-element list keys
        # differs between pandas versions.
        group_key = groupby_features[0] if len(groupby_features) == 1 else groupby_features
        print(f"grouped_dataframed columns: {dataframe.columns}")

        figure, axes = plt.subplots(1, 1, figsize = (30, 30))
        axes.set_title(
            f"Overall {response_feature} trend", 
            fontdict={'fontsize': 25, 'fontweight': 'medium'}
        )

        ordinal_feature: str = f"{predictor_feature}_ordinal"

        for group_name, group_rows in dataframe.groupby(group_key):
            group_dataframe: pd.DataFrame = group_rows.copy()

            # Regression needs a numeric predictor: arithmetic on datetime
            # values fails, so each date is replaced by its ordinal.
            group_dataframe[ordinal_feature] = group_dataframe[predictor_feature].apply(
                lambda timestamp: timestamp.toordinal()
            )

            generate_linear_regression_model(group_dataframe, group_name, ordinal_feature, observation_type)

            sb.regplot(
                data = group_dataframe,
                x = ordinal_feature,
                y = observation_type,
                ax = axes,
                scatter=False,
                ci=None,
                label =f'{group_name}',
            )

        axes.set_xlabel(predictor_feature)
        # A formatter translates ordinals to dates at draw time, so the labels
        # stay correct however the axis was rescaled by later groups. Fixing
        # the label texts inside the loop froze them while ticks kept moving.
        axes.xaxis.set_major_formatter(FuncFormatter(format_ordinal_tick))
        axes.legend(loc='best')


    print(f"input parameters: {parameters}")

    for parameter in parameters:

        # parse parameter
        response_feature: str = parameter['response']
        predictor_feature: str = parameter['predictors']
        groupby_features: List[str] = parameter['groupby_features']
        observation_type: str = parameter['observation_type']

        linear_regression_model_features: List[str] = [predictor_feature, response_feature] + groupby_features
        grouping_features: List[str] = [predictor_feature] + groupby_features

        # reset_index() turns the group keys into ordinary columns regardless
        # of the pandas version; as_index=False has not always been honoured
        # when a list of aggregations is applied to a single selected column.
        grouped_linear_regression_model_dataframe: pd.DataFrame = (
            dataframe[linear_regression_model_features]
            .groupby(grouping_features)[response_feature]
            .aggregate([observation_type])
            .reset_index()
        )

        generate_linear_regression_graph(grouped_linear_regression_model_dataframe, response_feature, predictor_feature, groupby_features, observation_type)