In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import unittest
import itertools
from IPython.display import display, HTML
from matplotlib import cm
import matplotlib.colors as mcolors
from matplotlib import ticker
from matplotlib.ticker import MaxNLocator
import matplotlib as mpl
import os
import copy
from pathlib import Path
import plotly.express as px
import datetime
import glob
import sys
from io import StringIO
from dateutil.relativedelta import relativedelta


from patsy import dmatrix
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Global matplotlib font settings: every figure drawn after this cell uses font size 10.
font = {'size'   : 10}

plt.rc('font', **font)
# Disabled override of the default figure DPI, kept for reference.
#mpl.rcParams['figure.dpi']= 300

from scipy.interpolate import make_interp_spline
from scipy.stats import chi2_contingency

from ipynb.fs.full.data_prep import causes, mtav_causes, inj_causes, pmrc_sql, mech_defect, mech_failed
# Module-level list. None of the functions visible in this notebook read or write it;
# presumably it is filled by the notebooks that import these helpers -- TODO confirm.
trend_df_lst = []

  machar = _get_machar(dtype)


The size of the notebook is: 12.17 KB


In [2]:
def multi_sig_trend_graph(df, analysis_type, branch, p_threshold, auto_filter = None, start_date = "None", end_date = "None"):
    """Entry point of the significant-trend analysis for one data set.

    The frame is restricted to the requested date window, split hierarchically
    by the chosen filter columns, pivoted per month and finally graphed. All
    three stages are delegated to ``apply_next``.

    input:
            df dataframe: one of the main dataframes - injury, accident, MTAV or PMRC.
                          Must contain "Date_actual", "Date" and either "count" or
                          "passed"/"failed"/"pmrc_total".
            analysis_type string: forwarded to apply_next as graph_type; decides the
                                  folder the graphs are written to.
            branch int: only printed. The number of filters comes from auto_filter
                        or from the interactive prompt.
            p_threshold float: only printed here. apply_next calls sig_trend_graph
                               with a fixed threshold of 0.1.
            auto_filter: list of filter column names; when not None the interactive
                         prompts are skipped.
            start_date / end_date: inclusive bounds applied to "Date_actual". The
                                   string "None" means "use the first / last row of df".

    output:
            (graph_name_lst, filter_str) where graph_name_lst is the list of graph
            names collected by apply_next (an empty list when nothing was graphed)
            and filter_str is the filter names joined with "-".

    raises:
            ValueError when the interactive branch count is not a whole number or
            when no filter column was selected.
    """
    print("multi_sig_trend_graph")
    print(f"analysis_type -> {analysis_type}")
    print(f"branch -> {branch}")
    print(f"p_threshold -> {p_threshold}")
    print(f"current auto_filter ->{auto_filter}")
    print(f"date range -> {start_date} - {end_date}")

    ## if there is no start/end date specified, fall back to the first/last row.
    ## positional access (.iloc) is used so a non-default index does not break the lookup.
    if isinstance(start_date, str) and start_date == "None":
        start_date = df["Date_actual"].iloc[0]
    if isinstance(end_date, str) and end_date == "None":
        end_date = df["Date_actual"].iloc[-1]

    in_window = (df["Date_actual"] >= start_date) & (df["Date_actual"] <= end_date)
    data = df.loc[in_window].drop(columns = ["Date_actual"])

    if auto_filter is not None:
        ## work on a copy so the caller's list is never modified
        choice_lst = list(auto_filter)
    else:
        ## manual selection: ask how many filters, then ask for each column name
        br_num = input("Number of branching: ").strip()
        if not br_num.isdecimal():
            raise ValueError("Number of branching must be a whole number, got {!r}".format(br_num))

        available_columns = list(data.columns.values)
        choice_lst = []
        for i in reversed(range(int(br_num))):
            while True:
                choices = input("Select {0} from the list above".format(i+1))
                print(choices)
                if choices in available_columns:
                    choice_lst.append(choices)
                    break
                print("The choice is not valid")

    if not choice_lst:
        raise ValueError("At least one filter column is required for the trend analysis")

    ## name of the graph sub folder and human readable list of the filters
    choice_category = '_'.join(choice_lst)
    filter_str = "-".join(choice_lst)

    ## PMRC specific - passed, failed, total are needed for the ratio calculation.
    if 'pmrc_total' in data.columns:
        value_cols = ["Date", "passed", "failed", "pmrc_total"]
    else:
        value_cols = ["Date", "count"]

    ## dat_small only contains columns of interest
    dat_small = data[choice_lst + value_cols]

    print(f"apply_next filtering {filter_str}")
    ## applies consecutive/hierarchical filtering based on choice_lst.
    ## apply_next expects a list, so the single frame is wrapped first ...
    source_df = [dat_small]
    for curr_filter in choice_lst:
        source_df = apply_next(source_df, "filter", curr_filter = curr_filter)

    ## ... and the wrapping list is removed again afterwards
    source_df = source_df[0]

    print(f"apply_next pivoting {filter_str}")
    ## pivot so date is the index and the combined filter values are the columns
    pivot_df = apply_next(source_df, "pivot", start_date = start_date, end_date = end_date)

    print(f"apply_next graphing {filter_str}")
    ## apply_next appends to the list it is given; returning that list (instead of
    ## apply_next's return value) guarantees callers always receive a list.
    graph_name_lst = []
    apply_next(pivot_df, "graph", graph_type = analysis_type, graph_cat = choice_category, graph_name_lst = graph_name_lst)

    return (graph_name_lst, filter_str)
    

        
                        
            
                
        
        
def apply_next(source_df, apply_type, curr_filter = "None", graph_type = "None", graph_cat = "None", start_date = "None", end_date = "None", graph_name_lst = None,):
    """Walk a (possibly nested) list of dataframes and apply one processing stage.

    Nested lists are handled by recursion; the stage is applied to every
    dataframe found at the leaves.

    input:
            source_df: list whose items are dataframes or further lists of them.
            apply_type: one of "filter" / "pivot" / "graph".
            curr_filter: "filter" only - column whose unique values split each leaf.
            graph_type: "graph" only - forwarded to sig_trend_graph as analysis_type.
            graph_cat: "graph" only - forwarded to sig_trend_graph as analysis_cat.
            start_date / end_date: "pivot" only - forwarded to cause_count2.
            graph_name_lst: "graph" only - list the graph names are appended to.
                            A new list is created when omitted.

    output:
            "filter": source_df, modified in place - every leaf dataframe is replaced
                      by the list returned from iter_filter.
            "pivot":  source_df, modified in place - every leaf dataframe is replaced
                      by find_sig(cause_count2(...)), i.e. a list of pivoted frames.
            "graph":  graph_name_lst with one entry per graphed leaf (always a list,
                      empty when nothing was graphed).
            anything else (or "filter"/"graph" without curr_filter/graph_type):
                      prints "Type not supported" and returns None.
    """
    if apply_type == "pivot":
        for i in range(len(source_df)):
            if isinstance(source_df[i], list):
                apply_next(source_df[i], "pivot", start_date = start_date, end_date = end_date)
                continue

            leaf = source_df[i]

            ## PMRC frames end with Date/passed/failed/pmrc_total (4 value columns),
            ## the others with Date/count (2 value columns). Everything in front of
            ## those columns is a filter column.
            pmrc_bool = 'pmrc_total' in leaf.columns
            n_value_cols = 4 if pmrc_bool else 2

            ## merge the filter values of each row into one name, so the pivoted
            ## column tells which filter combination it belongs to
            combo_name = leaf[leaf.columns[:-n_value_cols]].apply(lambda x: "_".join(x.dropna().astype(str)), axis = 1)
            leaf = leaf.assign(combo_name = combo_name)

            ## passed, failed, total are kept because the ratio is calculated in the pivot step
            if pmrc_bool:
                keep_cols = ["Date", "passed", "failed", "pmrc_total", "combo_name"]
                sig_threshold = 25   ## PMRC failure threshold handed to find_sig
            else:
                keep_cols = ["Date", "count", "combo_name"]
                sig_threshold = 12   ## count threshold handed to find_sig

            source_df[i] = find_sig(cause_count2(leaf[keep_cols], "month", start_date, end_date), sig_threshold)
        return source_df

    elif apply_type == "filter" and curr_filter != "None":
        for i in range(len(source_df)):
            ## if it is a list, recurse until a dataframe is reached
            if isinstance(source_df[i], list):
                apply_next(source_df[i], "filter", curr_filter)
            ## reached the last level: split the dataframe by every unique value of curr_filter
            else:
                source_df[i] = iter_filter(source_df[i], curr_filter, 12)
        return source_df

    elif apply_type == "graph" and graph_type != "None":
        ## a fresh list per call; a shared default list would leak names between calls
        if graph_name_lst is None:
            graph_name_lst = []

        for item in source_df:
            if isinstance(item, list):
                ## the same list is handed down so all levels collect into it
                apply_next(item, "graph", graph_type = graph_type, graph_cat = graph_cat, graph_name_lst = graph_name_lst)
            else:
                combo_name = '_'.join(str(col) for col in item.columns.values)

                ## sig_trend_graph saves the graph and returns its name;
                ## it returns None when the frame has no column to graph
                graph_name = sig_trend_graph(item, 0.1, graph_type, graph_cat, combo_name = combo_name)
                if graph_name is not None:
                    graph_name_lst.append(graph_name)

        return graph_name_lst

    else:
        print("Type not supported")

    

def compare_result(curr, prev, analysis_type, filters):
    """Describe the graphs of the current period and flag the ones that are new.

    input:
            curr(lst): graph names of the current period. None is treated as an
                       empty list, because apply_next can return None when it
                       has nothing to graph.
            prev(lst): graph names of the previous period (None is treated the same way).
            analysis_type(str): stored unchanged in the "category" column.
            filters: stored unchanged in the "filters" column.

    output:
            dataframe with one row per name in curr and the columns
            name, category, filters, significant, trend, new_trend, country:
              significant - "No" when the name starts with "in", else "Yes"
              trend       - "up" when the first 10 characters contain "up", else "down"
              new_trend   - "Yes" when the name is not present in prev, else "No"
              country     - "Canada" when the first 17 characters contain "Canada", else "USA"
    """
    curr = [] if curr is None else list(curr)
    prev = [] if prev is None else list(prev)

    ## names that appear in the current period only
    new_names = set(curr) - set(prev)

    curr_df = (pd.DataFrame(curr, columns = ['name'])
               .assign(category = analysis_type)
               .assign(filters = filters)
              )

    names = curr_df['name']
    return (curr_df
            .assign(significant = np.where(names.str[:2] == 'in', "No", "Yes"))
            .assign(trend = np.where(names.str[:10].str.contains('up'), "up", "down"))
            .assign(new_trend = np.where(names.isin(new_names), "Yes", "No"))
            .assign(country = np.where(names.str[:17].str.contains('Canada'), "Canada", "USA"))
           )


def auto_filter(lst, ret_lst, source_df, analysis_type_curr, analysis_type_prev, start_date, end_date):
    """Run the trend analysis for every predefined filter combination.

    Each combination is analysed twice: once up to end_date and once up to
    three months before end_date. The two lists of graph names are then
    compared with compare_result.

    input:
            lst: list of lists, each inner list holds the filter columns of one run
            ret_lst: list that receives one compare_result dataframe per run
            source_df: dataframe handed to multi_sig_trend_graph
            analysis_type_curr: analysis type / folder name of the current period
            analysis_type_prev: analysis type / folder name of the previous period
            start_date: start of the date window for both runs
            end_date: end of the current window; must support subtracting a relativedelta

    output:
            nothing is returned; ret_lst is extended in place.
    """
    ## the previous period ends three months before the current one
    earlier_end_date = end_date - relativedelta(months = 3)

    for filters in lst:
        curr_name_lst, filter_str = multi_sig_trend_graph(
            source_df, analysis_type_curr, 1, 0.1,
            start_date = start_date, end_date = end_date, auto_filter = filters,
        )
        prev_name_lst, filter_str = multi_sig_trend_graph(
            source_df, analysis_type_prev, 1, 0.1,
            start_date = start_date, end_date = earlier_end_date, auto_filter = filters,
        )
        comparison = compare_result(curr_name_lst, prev_name_lst, analysis_type_curr, filter_str)
        ret_lst.append(comparison)




'''
test_df checks if the sum of each categories of the modified df still adds up to the orignal
input: 
      source_df: the dataframe created from reading the original csv file
      mod_df: the dataframe returned from cause_count2
'''
def test_df(source_df, mod_df):
    for i in mod_df:
        col_name = i.columns.name
        check_lst = i.columns.values
        total_lst = []
        for name in check_lst:
            check_df = source_df[source_df[col_name] == name].reset_index(drop = True)
            check_num = len(check_df)
            test_num = int(sum(i[name]))
            if test_num == check_num:
                print("pass")
            else:
                print("{0} from column {1} is not correct".format(name, col_name))
                raise ValueError('Total Values Does Not Match')
                


def date_as_zero (df, lst):
    """Give every label of lst a row in df, using zeroes for the missing ones.

    Solves the problem of missing months in the visualisation.

    input:
            df - a single pivoted dataframe (an output of cause_count2)
            lst - the labels that must be present in the index
    output:
            df sorted by index. Missing rows are added to df itself before
            sorting, so the frame passed in is modified as well.
    """
    print("Creating zero date placeholder...")

    ## one zero per column, reused for every row that has to be added
    zero_row = [0] * len(df.columns.values)
    known_labels = set(df.index)

    for label in lst:
        if label in known_labels:
            continue
        df.loc[label] = zero_row
        known_labels.add(label)

    return df.sort_index()
    
def iter_filter(df, colname, signum):
    """Split a dataframe into one dataframe per distinct value of a column.

    helper function for apply_next type == filter

    input: df(dataframe) - the unfiltered dataframe
           colname(str) - column whose distinct values define the split
           signum(int) - accepted for the callers' sake; not used in the body

    output: list of dataframes in order of first appearance of each value, every
            frame holding the rows equal to that value with a fresh 0..n-1 index.
    """
    distinct_values = df[colname].drop_duplicates().reset_index(drop = True)

    ## isolate each element of the filter group
    return [
        df[df[colname] == value].reset_index(drop = True)
        for value in distinct_values
    ]
    


def cause_count2 (df, period, start_date, end_date):
    """Pivot a long dataframe into one month-by-value table per grouping column.

    helper function for apply_next type == pivot

    input:
            df - dataframe with a "Date" column, the grouping column(s) and either
                 "count" or "passed"/"failed"/"pmrc_total"
            period - accepted for the callers' sake; not used in the body
            start_date / end_date - bounds of the month template ("%Y-%m", month
                                    start frequency) used to add missing months

    output:
            list with one dataframe per grouping column whose pivot is not empty:
                1. index is "Date", columns are the values of the grouping column
                2. values are the summed "count", or for PMRC data
                   failed / pmrc_total * 100
                3. months of the template that are missing are added as rows of
                   zeroes through date_as_zero, except when the first column name
                   contains "LCR" or "Efficiency Test"
            The list is empty when there is no grouping column or every pivot is empty.
    """
    date_template = pd.date_range(start_date,end_date,
              freq='MS').strftime("%Y-%m").tolist()

    ## col_lst only includes the user selected filters, which are used to group and pivot
    rm_lst = ['passed', 'failed', 'pmrc_total', 'count', 'Date', 'Quarter']
    col_lst = [x for x in df.columns.values if x not in rm_lst]

    is_pmrc = 'pmrc_total' in df.columns

    grouped_causes = []
    for i in col_lst: ## going through the user selected filters
        if is_pmrc:
            grouped = (df
                        .sort_values([i, "Date"]).reset_index(drop = True)
                        .groupby(by=[i, "Date"], as_index = False).agg({
                            'passed': 'sum',
                            'failed':'sum',
                            'pmrc_total' : 'sum'
                        })
                        .reset_index()
                        .assign(passed_rate = lambda x : x['passed'] / x['pmrc_total'] * 100)
                        .assign(failed_rate = lambda x : x['failed'] / x['pmrc_total'] * 100)
                        .fillna(0)
                        .pivot(index = "Date", columns = i, values = 'failed_rate')
                      )
            empty_msg = "{} is a empty pivot".format(i)
        else:
            grouped = (df[[i, "Date", "count"]]
                        .sort_values([i, "Date"]).reset_index(drop = True)
                        .groupby(by=[i, "Date"], as_index = False).sum()
                        .pivot(index = "Date", columns = i, values = 'count')
                        .fillna(0))
            empty_msg = "empty pivot"

        if grouped.empty:
            print(empty_msg)
        else:
            grouped_causes.append(grouped)

    ## None occurrence as rows of zeroes
    for idx, pivoted in enumerate(grouped_causes):
        ## str() keeps the substring test valid for non-string column labels
        first_col = str(pivoted.columns[0])
        if 'LCR' in first_col or 'Efficiency Test' in first_col:
            print(pivoted.columns[0])
            print("PMRC Ratios, no date as zero")
        else:
            grouped_causes[idx] = date_as_zero(pivoted, date_template)

    ## same message as before when nothing could be pivoted, but without relying
    ## on a loop variable that does not exist when there is no grouping column
    if not grouped_causes and col_lst:
        print("{} is a empty pivot".format(col_lst[-1]))

    return grouped_causes
    
    
def find_sig(cumu_df, sig):
    """Drop the columns that do not carry enough data and the frames left empty.

    A column is dropped when
      * its comparison value is below the threshold. The comparison value is the
        average for float columns (ratio data) and the sum otherwise (count data).
        The threshold is 10 when the column name contains "LCR", else sig.
      * or the column has 10 rows or fewer.

    input:
            cumu_df: list of pivoted dataframes (from cause_count2)
            sig: threshold for columns whose name does not contain "LCR"

    output:
            new list holding the reduced dataframes that are not empty; its
            length is less than or equal to the length of the input. Neither
            the input list nor its frames are modified.
    """
    print("Removing insiginificant columns from list of pivoted df")

    kept_frames = []

    ## each df has dim[# of months in date range : # of unique elements of the unpivoted column]
    for df in cumu_df:
        ## iterate over the original column labels; df itself shrinks while columns are dropped
        for cols in df.columns.values:

            ## if it is PMRC (float ratio), look at the average fail%
            ## if it is another type, look at the total count
            is_ratio = df[cols].dtype == float
            compare_val = np.average(df[cols]) if is_ratio else sum(df[cols])
            ## str() keeps the substring test valid for non-string column labels
            sig_tmp = 10 if "LCR" in str(cols) else sig

            if is_ratio:
                print("ratio compare_val due to audits or PMRC")
            else:
                print("Count based compare_val for accident/injuries/MTAV")
            print("Here is the value form compare_val")
            print(compare_val)
            print("Here is the value form sig")
            print(sig)
            print("Here is sig_tmp")
            print(sig_tmp)
            print("combo_name")

            ## if less than the threshold -> not enough data, the column is dropped
            if compare_val < sig_tmp or len(df[cols]) <= 10:
                print("This is len of df[cols]")
                print(len(df[cols]))
                print("{0} Dropped".format(cols))
                df = df.drop(columns = cols)
            else:
                print("nothing was dropped")

        ## frames that lost every column (or never had rows/columns) are left out
        if not df.empty:
            kept_frames.append(df)

    return kept_frames

        

def clean_str(text):
    """Turn a data name into a string that is safe to use in an OLS formula.

    Special characters cause errors when the name is put through the linear
    regression formula, so:
      * every character that is not an ASCII letter or digit becomes "_"
        (punctuation, whitespace, control characters and non-ASCII characters)
      * a leading digit is replaced by a letter ("0" -> "A", "1" -> "B", ...,
        "9" -> "J"), because a name must not start with a digit

    input:
            text: the name of the data; it is converted with str() first
    output:
            the cleaned string. An empty input gives an empty string.
    """
    print("cleaning graph string names...")

    text = re.sub(r"[^0-9A-Za-z]", "_", str(text))

    ## shifting the code point by 17 maps "0".."9" onto "A".."J"
    if text and text[0] in "0123456789":
        text = chr(ord(text[0]) + 17) + text[1:]

    return text

def sig_trend_graph(df, p_threshold, analysis_type, analysis_cat, combo_name = "None"):
    """Fit a linear trend to the first column of a pivoted frame and save its graph.

    Usage: used to produce a single graph. All filters must already be applied
    to the df. Only the first column of df is graphed; further columns are ignored.

    input:
            df: pivoted dataframe whose index holds the month as a "YYYY-MM" string
                and whose first column holds the monthly value
            p_threshold: the trend counts as significant when the p-value of the
                         slope is less than or equal to this value
            analysis_type: first folder level below "graphs/"; the y axis is
                           labelled '%Failed' when it equals "pmrc_py", else 'Count'
            analysis_cat: second folder level below "graphs/"
            combo_name: when not "None", the cleaned combo_name is used for the
                        file name instead of the cleaned column name

    output:
            the graph name "<sig|insig>_<up|down>_<name>". A .jpg of the graph and a
            .csv of the plotted data are written to graphs/<analysis_type>/<analysis_cat>/.
            Returns None when df has no column.

    Notes:
            * direction and significance are decided on the unrounded slope and
              p-value; the rounded values are only used for the text box.
            * a p-value that is not a number (e.g. a constant series) is reported
              as insignificant.
    """
    print(f"Graphing...{combo_name}")

    if len(df.columns.values) == 0:
        return None
    colname = df.columns.values[0]

    ## clean_str changes special characters into _ to prevent bugs from OLS
    new_colname = clean_str(colname)
    data = df.copy().rename(columns={colname:new_colname})

    ## making the df that would be plotted
    y = data[new_colname]
    data["Date"] = list(data.index)
    data["Date"] = data["Date"].apply(lambda x: x[2:])
    data["Date_num"] = np.arange(0,len(data['Date']), 1)
    data = data[[new_colname, "Date", "Date_num"]]

    ## fitting a linear regression model; the statistics are read straight from
    ## the fitted result instead of being parsed back from the summary table
    res_model = smf.ols(formula = '{0} ~ Date_num'.format(new_colname), data = data).fit()
    intercept_raw = float(res_model.params["Intercept"])
    slope_raw = float(res_model.params["Date_num"])
    p_raw = float(res_model.pvalues["Date_num"])

    b_1 = round(slope_raw,2)
    mean = round(np.mean(data[new_colname]),2)
    med = round(np.median(data[new_colname]),2)
    p_val = round(p_raw,3)
    textstr = '\n'.join((
        "mean = {0}".format(mean),
        "median = {0}".format(med),
        "P-value = {0}".format(p_val),
        "Slope = {0}".format(b_1)))

    ## quarterly average of the value, one row per quarter
    quarter_data = (df.copy()
                    .assign(Date = lambda x: pd.to_datetime(x.index))
                    .assign(quarters = lambda x: x["Date"].dt.to_period("Q"))
                    .groupby("quarters")[colname].mean().reset_index()
                   )

    ## add quarter to data df
    data = (data
            .assign(Date = lambda x: pd.to_datetime(x.index))
            .assign(quarters = lambda x: x["Date"].dt.to_period("Q"))
            .assign(Date = lambda x: x["Date"].dt.strftime("%y-%m"))
            )

    ## join to give each month of a quarter the same quarter average
    ## for plotting a straight line for all months in a quarter.
    data = pd.merge(data, quarter_data, on = 'quarters', how = 'left')
    ## the quarterly average is the last column of the merged frame
    y_q = data.iloc[:, -1]

    ## the file name uses the combo name when one is given, the title keeps the column name
    file_stem = clean_str(combo_name) if combo_name != "None" else new_colname

    ## adding slope and significance info to the graph name
    slope = "up" if slope_raw > 0 else "down"
    ## a NaN p-value compares False and is therefore treated as insignificant
    prefix = "sig" if p_raw <= p_threshold else "insig"
    graph_name = "{0}_{1}_".format(prefix, slope) + file_stem

    path_str = 'graphs/{0}/{1}'.format(analysis_type,analysis_cat)
    pathname = Path(path_str)
    pathname.mkdir(parents = True, exist_ok = True)

    x = data["Date_num"]
    z = sm.nonparametric.lowess(y, x, frac = 0.35)

    fig, ax = plt.subplots(figsize=(20, 6))
    try:
        ## plotting the points
        ax.plot(data["Date"] , y, '-o', mfc='b', alpha = 0.3)
        ## plotting line of best fit
        ax.plot(x, intercept_raw + slope_raw * np.array(x), color = 'red')
        ## plotting the lowess smoothed trend
        ax.plot(z[:,0], z[:,1])
        ## plotting the quarterly step
        ax.plot(x, y_q, alpha = 0.3)
        ax.yaxis.set_major_locator(MaxNLocator(integer = True))
        ax.fill_between(
            x,
            y_q,
            color = 'skyblue',
            alpha = 0.2
        )

        ax.set_ylim(0, max(y) + 5)
        ax.set_title(new_colname, fontsize=20)
        ax.set_ylabel('%Failed' if analysis_type == "pmrc_py" else 'Count', fontsize=12)
        ax.set_xlabel('Date in Months', fontsize=12)
        ax.text(0.9, 0.95, textstr, transform=ax.transAxes, fontsize=14,
                verticalalignment='top')

        ## NEED TO CHANGE this for PMRC where months are not consecutive
        ## hide every second tick label to keep the x axis readable
        for label in ax.xaxis.get_ticklabels()[::2]:
            label.set_visible(False)

        fig.savefig(pathname / (graph_name + ".jpg"), bbox_inches = 'tight')
    finally:
        ## always release the figure, also when plotting or saving fails
        plt.close(fig)

    ## the plotted data without the two quarter columns
    data.iloc[:, :-2].to_csv(pathname / (graph_name + ".csv"))
    return graph_name
        
    





            
    
            

        

In [3]:
# File whose size is reported; the name is hard-coded and must be kept in sync
# with the notebook's actual file name.
notebook_filename = "utils.ipynb"  # Replace with your notebook's name

# Size on disk in bytes, then in kilobytes (1 KB = 1024 bytes) for readability.
size = os.path.getsize(notebook_filename)
size_kb = size / 1024

print(f"The size of the notebook is: {size_kb:.2f} KB")

The size of the notebook is: 35.88 KB
