In [1]:
#imports 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.parse
from glob import glob
import os
import ntpath 
import zipfile

In [None]:
def reading_csv_files(folder_path):
    '''PRE-PROCESSING: read every csv file found directly inside a folder.

    The working directory of the process is left untouched: the csv files are
    located by joining `folder_path` with the `*.csv` pattern, so the function
    can be called several times in a row with relative paths.

    Parameters
    ----------
    folder_path : str or path-like
        Folder searched (non-recursively) for `*.csv` files.

    Returns
    -------
    tuple
        `(shape_df, files_names, dict_files)` where
        - `shape_df` is a list of `(n_rows, n_columns)` tuples,
        - `files_names` is the list of csv file names (folder part removed),
          sorted alphabetically,
        - `dict_files` maps each file name to the dataframe read from it.
        `shape_df` and `files_names` are aligned index by index.

    Raises
    ------
    FileNotFoundError
        If `folder_path` is not an existing directory.
    '''
    # Local imports: at module level the name `glob` is bound to the function
    # (`from glob import glob`), whereas the module itself is needed here.
    import os
    import glob

    folder=os.fspath(folder_path)
    if not os.path.isdir(folder):
        raise FileNotFoundError('No such directory: {!r}'.format(folder))

    # glob.escape keeps special characters of the folder name ([, ], *, ?) literal.
    pattern=os.path.join(glob.escape(folder),'*.csv')
    # Sorted so that the output order does not depend on the file system.
    csv_paths=sorted(glob.glob(pattern))

    # Read each csv file once and record its name, dataframe and shape together.
    dict_files={}
    files_names=[]
    shape_df=[]
    for path in csv_paths:
        df=pd.read_csv(path,engine='python')
        filename=os.path.basename(path)
        files_names.append(filename)
        dict_files[filename]=df
        shape_df.append((len(df),len(df.columns)))

    return (shape_df,files_names,dict_files)

def number_of_patients(df):
    '''PRE_PROCESSING: return the number of distinct patients in a dataframe.

    Patients are identified by the `PATNO` column. Missing identifiers (NaN)
    are ignored instead of being counted as one extra patient each.

    Parameters
    ----------
    df : pandas.DataFrame
        Table read from one csv file.

    Returns
    -------
    int
        Number of distinct non-null `PATNO` values, or 0 when the table has
        no `PATNO` column.
    '''
    if 'PATNO' not in df.columns:
        return 0
    # nunique() drops NaN by default; set() would keep every NaN as a distinct
    # element because NaN != NaN.
    return int(df['PATNO'].nunique())

# TODO: extend this function to also return the maximum sequence length.
def number_of_events(df):
    '''PRE_PROCESSING: return the number of distinct events covered by a dataframe.

    Events are identified by the `EVENT_ID` column. Missing identifiers (NaN)
    are ignored instead of being counted as one extra event each.

    Parameters
    ----------
    df : pandas.DataFrame
        Table read from one csv file.

    Returns
    -------
    int
        Number of distinct non-null `EVENT_ID` values, or 0 when the table has
        no `EVENT_ID` column.
    '''
    if 'EVENT_ID' not in df.columns:
        return 0
    # nunique() drops NaN by default; set() would keep every NaN as a distinct
    # element because NaN != NaN.
    return int(df['EVENT_ID'].nunique())

def features_selection(list_df_sel):
    '''PRE PROCESSING: do features selection for each (df, sel) in list_df_sel.

    Parameters
    ----------
    list_df_sel : iterable of (pandas.DataFrame, list-like)
        Pairs made of a dataframe and the columns to keep for it.
        NOTE(review): assumes each `sel` is a list-like of column labels -
        TODO confirm against the callers.

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per input pair, restricted to its selected columns and
        given in the same order as `list_df_sel`.

    Raises
    ------
    KeyError
        If a selected column is missing from its dataframe.
    '''
    # The previous body returned an undefined name (`df_sel`) and therefore
    # raised a NameError on every call.
    return [df.loc[:, list(sel)] for df, sel in list_df_sel]

def patients_selection(df, threshold=6):
    '''PRE-PROCESSING: keep only the patients with number of visits >= threshold.

    The number of visits of a patient is the number of rows carrying its
    `PATNO` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per visit.
    threshold : int, default 6
        Minimum number of rows (inclusive) a patient needs to be kept.

    Returns
    -------
    tuple or int
        `(patients_sel, df_sel)` where `patients_sel` is the list of selected
        `PATNO` values ordered by decreasing number of visits, and `df_sel`
        holds the rows of these patients sorted by `PATNO`. Within a patient
        the rows keep the order they had in `df`.
        Returns 0 when `df` has no `PATNO` column.
    '''
    if 'PATNO' not in df.columns:
        return 0

    # mergesort is stable: patients with the same number of visits keep a
    # deterministic order.
    visits_number_by_pat=df.groupby('PATNO').size().sort_values(ascending=False,kind='mergesort')
    patients_sel=list(visits_number_by_pat[visits_number_by_pat>=threshold].index)

    df=df.loc[df['PATNO'].isin(patients_sel),:]
    # A stable sort is required here: the default quicksort may shuffle the
    # rows (visits) of a given patient.
    df=df.sort_values('PATNO',kind='mergesort')
    return (patients_sel, df)

def padding_cropping_analysis(df,input_timesteps=4,th_drop=2):
    '''Return the number of crops, pads and drops needed to bring every patient
    to sequences of `input_timesteps` visits.

    Each patient (distinct `PATNO`) is put in exactly one group according to
    its number of rows `v` (visits), with `min_visits = input_timesteps - th_drop`:

    - 'dropping': `v < min_visits`,                     op_num = min_visits - v
    - 'padding' : `min_visits <= v < input_timesteps`,  op_num = input_timesteps - v
    - 'nothing' : `v == input_timesteps`,               op_num = 0
    - 'cropping': `v > input_timesteps`,                op_num = v - input_timesteps

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per visit and a `PATNO` column.
    input_timesteps : int, default 4
        Length of the input sequence.
    th_drop : int, default 2
        Maximum number of missing visits that is padded; patients missing more
        visits than that are put in the 'dropping' group.

    Returns
    -------
    tuple
        `(df_visits, totals)` where `df_visits` is indexed by `PATNO` with the
        columns `visits_num`, `processing_op`, `op_num` (groups concatenated in
        the order dropping, padding, nothing, cropping) and `totals` is a dict
        with the keys 'drops', 'pads', 'crops' holding the sum of `op_num` over
        the corresponding group.

    Raises
    ------
    KeyError
        If `df` has no `PATNO` column.
    '''
    if 'PATNO' not in df.columns:
        raise KeyError("padding_cropping_analysis needs a 'PATNO' column")

    visits=df.groupby('PATNO').size().sort_values(ascending=False).to_frame('visits_num')
    counts=visits['visits_num']

    # Smallest number of visits that is still padded. It has to follow th_drop:
    # a hard-coded 2 here makes groups overlap or leaves patients unassigned
    # as soon as th_drop != 2.
    min_visits=input_timesteps-th_drop

    # (processing_op, membership mask, number of operations per patient)
    groups=[
        ('dropping',counts<min_visits,min_visits-counts),
        ('padding',(counts>=min_visits) & (counts<input_timesteps),input_timesteps-counts),
        ('nothing',counts==input_timesteps,counts-input_timesteps),
        ('cropping',counts>input_timesteps,counts-input_timesteps),
    ]

    parts={}
    for operation,mask,op_num in groups:
        part=visits.loc[mask].copy()
        part['processing_op']=operation
        part['op_num']=op_num[mask]
        parts[operation]=part

    df_visits=pd.concat([parts['dropping'],parts['padding'],parts['nothing'],parts['cropping']],axis=0)

    totals={
        'drops':parts['dropping']['op_num'].sum(),
        'pads':parts['padding']['op_num'].sum(),
        'crops':parts['cropping']['op_num'].sum(),
    }
    return (df_visits, totals)

def table_analysis(list_df,df_names,threshold=6,input_timesteps=4,th_drop=2):
    '''PRE-PROCESSING: return a summary dataframe with one row per input dataframe.

    NB: takes as input the dataframes after features selection.

    Parameters
    ----------
    list_df : list of pandas.DataFrame
        Tables to summarise.
    df_names : list
        Row labels of the summary, aligned with `list_df`.
    threshold : int, default 6
        Forwarded to `patients_selection`.
    input_timesteps, th_drop : int
        Forwarded to `padding_cropping_analysis`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `df_names`, with the columns:
        - observations_tot: number of rows of the table
        - features: number of columns of the table
        - events_num: result of `number_of_events`
        - patients_num: result of `number_of_patients`
        - patients_sel: number of patients kept by `patients_selection`
          (0 when the table has no `PATNO` column)
        - len_df_sel: number of rows left after the patient selection
        - drops, pads, crops: totals returned by `padding_cropping_analysis`
          on the selected rows (0 when the table has no `PATNO` column)
        - new_len: `input_timesteps * patients_num - drops`
        - NaN_values: fraction of null cells in the selected rows
          (NaN when there is no cell left)
    '''
    # initialisation of the summary dataframe
    col=['observations_tot','features','events_num','patients_num',
         'patients_sel','len_df_sel','drops','pads','crops','new_len','NaN_values']
    df_df=pd.DataFrame(columns=col,index=df_names)

    # fill one row of the summary per input dataframe
    for i,df in enumerate(list_df):
        row={
            'observations_tot':len(df),
            'features':len(df.columns),
            'events_num':number_of_events(df),
            'patients_num':number_of_patients(df),
        }

        if 'PATNO' in df.columns:
            df=patients_selection(df,threshold=threshold)[1]
            row['patients_sel']=df['PATNO'].nunique()
            # One single call: the three totals come from the same analysis.
            ops=padding_cropping_analysis(df,input_timesteps=input_timesteps,th_drop=th_drop)[1]
        else:
            # Without patient identifiers there is nothing to select, pad or crop.
            row['patients_sel']=0
            ops={}

        row['len_df_sel']=len(df)
        n_cells=row['len_df_sel']*row['features']
        row['NaN_values']=(df.isnull().sum().sum())/n_cells if n_cells else float('nan')

        for key in ('drops','pads','crops'):
            row[key]=ops.get(key,0)

        # NOTE(review): uses patients_num (before selection) although the drops
        # are computed on the selected patients - confirm this is intended.
        row['new_len']=input_timesteps*row['patients_num']-row['drops']

        # Direct positional assignment: the chained form
        # `df_df.iloc[i,:]['col']=value` writes to a temporary object and is
        # silently lost when pandas Copy-on-Write is active.
        df_df.iloc[i,:]=[row[name] for name in col]

    return df_df

def INFODT_date(df):
    '''PRE_PROCESSING: add a datetime column for the timeseries.

    When `df` has an `INFODT` column holding 'MM/YYYY' strings, a column
    `INFODT_date` with the parsed dates (first day of the month) is added.
    Missing `INFODT` values give NaT.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to complete. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.

    Raises
    ------
    ValueError
        If a non-null `INFODT` value does not follow the '%m/%Y' format.
    '''
    if 'INFODT' in df.columns:
        # pd.to_datetime is vectorised and avoids relying on a `datetime`
        # name that the notebook never imports.
        df['INFODT_date']=pd.to_datetime(df['INFODT'],format='%m/%Y')
    return df