In [1]:
#imports 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.plotly as py
import cufflinks as cf
import seaborn as sns
import urllib.parse
from glob import glob
import os
import ntpath 
import zipfile
from datetime import datetime

#and enable the offline mode in the notebook (not needed in Jupyter Lab)
#py.offline.init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings('ignore')

In [2]:
def PD_patients_selection(df,folder_path=None):
    '''EDA: select only PD patients.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; must contain a 'PATNO' column.
    folder_path : str, optional
        Folder passed to ``reading_csv_files``. When omitted, the original
        hard-coded subject-characteristics folder is used.

    Returns
    -------
    tuple
        ``(PD_patients, sub_df)``: the list of 'PATNO' values whose
        'RECRUITMENT_CAT' is 'PD', 'REGPD' or 'GENPD', and the rows of ``df``
        belonging to those patients.
    '''
    if folder_path is None:
        folder_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/PPMI-final-dataset-382018/1-Subject-Characteristics/'

    # Read the folder once and reuse the result (the files were previously loaded twice).
    loaded=reading_csv_files(folder_path)
    files_names=loaded[1]
    dict_files=loaded[2]

    ### Patient_Status - actually useful for selecting only the PD patients in each table
    # NOTE(review): assumes the first file name returned is the patient-status
    # table (it must expose 'RECRUITMENT_CAT' and 'PATNO') - confirm the ordering
    # produced by reading_csv_files.
    PRODROMA=dict_files[files_names[0]]
    PD_cat=['PD','REGPD','GENPD']
    PD_patients=list(PRODROMA.loc[PRODROMA['RECRUITMENT_CAT'].isin(PD_cat),'PATNO'])
    sub_df=df.loc[df['PATNO'].isin(PD_patients),:]
    return (PD_patients,sub_df)

def patients_sampling(df,sample_size=25,seed=123):
    '''EDA: keep all rows of a reproducible random sample of patients.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'PATNO' column.
    sample_size : int, default 25
        Number of distinct patients to keep. If ``df`` holds fewer patients,
        all of them are kept.
    seed : int, default 123
        Seed of the local random generator; the same seed and the same ``df``
        give the same sample.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose 'PATNO' belongs to the sampled patients.
    '''
    # Sample from the patients of the frame that was passed in, not from an
    # unrelated global table.
    unique_patients=df['PATNO'].drop_duplicates().values
    n_selected=max(0,min(sample_size,len(unique_patients)))

    # Local generator: reproducible without touching numpy's global random state.
    rng=np.random.RandomState(seed)
    # Shuffling then slicing samples without replacement and also works when
    # there are no patients at all.
    patients_sample=rng.permutation(unique_patients)[:n_selected]

    sub_df=df.loc[df['PATNO'].isin(patients_sample),:]
    return sub_df

def plot_distr_visits_pat(df,fig_name):
    '''EDA: distribution of number of visits for the population.

    Counts the rows of ``df`` per 'PATNO', draws the normalised histogram of
    those counts on a new figure and saves it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'PATNO' column; each row is counted as one visit.
    fig_name : str
        Suffix appended to the output path (typically a name plus extension).

    Raises
    ------
    ValueError
        If ``df`` contains no rows, since there is nothing to plot.
    '''
    # distribution of patients over the number of visits
    visits_pat=df.groupby('PATNO').size()
    if visits_pat.empty:
        raise ValueError('df contains no patients: cannot plot the distribution of visits')

    # One unit-wide bin centred on every integer count between min and max.
    # Using the observed values themselves as edges merged the largest counts
    # into a single bin and failed when only one distinct count existed.
    bin_edges=np.arange(visits_pat.min(),visits_pat.max()+2)-0.5

    # Draw on a dedicated figure so that repeated calls do not overlay each other.
    fig,ax=plt.subplots()
    # 'density' replaces the 'normed' keyword, which matplotlib no longer accepts.
    ax.hist(x=visits_pat,bins=bin_edges,density=True,rwidth=0.5)
    plt_path_name='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/output/EDA/distr_visits_pat'
    plt_path_name=plt_path_name+fig_name
    fig.savefig(plt_path_name)

def plotting_feat(df,fig_name,y="sum_feat"):
    '''EDA: per-patient views of one feature, saved as three figures.

    For the column ``y`` of ``df`` this draws, with one facet per 'PATNO'
    (five facets per row), a histogram grid and a line-plot grid, then a single
    box plot of ``y`` against 'PATNO'. Each figure is saved under the EDA
    output folder with ``fig_name`` appended to its file stem, then closed.
    '''
    sns.set(style="darkgrid")

    # (drawing function, figure title, output path stem) for the two facet grids
    facet_plots=[
        (plt.hist,
         'distribution of feature per number of visits',
         '/Users/alicemartin/02_DSR_Project/parkinson-disease-project/output/EDA/hist_visits_feat'),
        (plt.plot,
         'evolution of feature overtime by patient',
         '/Users/alicemartin/02_DSR_Project/parkinson-disease-project/output/EDA/lineplt_DT_feat'),
    ]
    for draw_func,title,path_stem in facet_plots:
        grid=sns.FacetGrid(df, col="PATNO", margin_titles=True,col_wrap=5)
        grid.map(draw_func, y, color="steelblue")
        # leave room above the facets for the figure title
        plt.subplots_adjust(top=0.9)
        grid.fig.suptitle(title)
        grid.savefig(path_stem+fig_name)
        plt.close()

    # box plot: one box per patient
    box_axes=sns.boxplot(y=y,x="PATNO", data=df)
    box_figure=box_axes.figure
    box_figure.set_size_inches(w=20,h=10)
    box_figure.suptitle('distribution of feature per patient')
    box_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/output/EDA/boxplot_PATNO_feat'+fig_name
    box_figure.savefig(fname=box_path)
    plt.close()

def dataset_balance_analysis(df,feat,fig_name):
    '''EDA: class distribution by event.

    Counts the rows of ``df`` for every ('EVENT_ID', ``feat``) pair and draws
    them as a bar plot: the values of ``feat`` on the x axis, the row count as
    bar height and one hue per 'EVENT_ID'. The figure is saved with
    ``fig_name`` appended to the output path, then closed.
    '''
    pair_counts=df.groupby(by=['EVENT_ID',feat]).size()

    # level 0 of the index holds the event ids, level 1 the feature values
    event_labels=list(pair_counts.index.get_level_values(0))
    class_labels=list(pair_counts.index.get_level_values(1))
    bar_axes=sns.barplot(x=class_labels,y=list(pair_counts.values),hue=event_labels)

    figure=bar_axes.figure
    figure.set_size_inches(w=20,h=10)
    figure.suptitle('dataset class balance analysis by visit')

    out_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/output/EDA/dataset_class-balance'+fig_name
    figure.savefig(fname=out_path)
    plt.close()

def dataset_balance_analysis_allEvents(df,feat,fig_name,width=3):
    '''EDA: class distribution for the feature.

    Counts the rows of ``df`` for every value of ``feat`` and saves a bar chart
    of those counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column ``feat``.
    feat : str
        Column whose values are counted; they are used as bar positions.
    fig_name : str
        Suffix appended to the output path (typically a name plus extension).
    width : float, default 3
        Width of each bar, in x-axis units.
    '''
    ser=df.groupby(by=feat).size()

    # plt.bar returns a BarContainer, which has no '.figure' attribute, so the
    # figure is created explicitly and saved through its own handle.
    fig,ax=plt.subplots()
    ax.bar(x=list(ser.index),height=list(ser.values),width=width)
    ax.set_xlabel(str(feat))
    ax.set_ylabel('count')

    plt_path_name='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/output/EDA/dataset_class-balance_allVisits'
    plt_path_name=plt_path_name+fig_name
    fig.savefig(fname=plt_path_name)
    plt.close(fig)

def heatmap_features(df,sel_feat,fig_name):
    '''Correlation between features, drawn as a lower-triangle heatmap.

    Each selected column is replaced by the integer codes returned by
    ``pd.factorize`` before the Pearson correlation is computed, so
    non-numeric columns can be included.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    sel_feat : list
        Names of the columns of ``df`` to correlate.
    fig_name : str
        Suffix appended to the output path (typically a name plus extension).
    '''
    sns.set(style="white")

    print('there are {} features'.format(len(sel_feat)))
    d = df[sel_feat]

    # Compute the correlation matrix
    # NOTE(review): factorize codes follow order of first appearance, not the
    # numeric order of the values - confirm this is the intended encoding.
    corr = d.apply(lambda x : pd.factorize(x)[0]).corr(method='pearson', min_periods=1)

    # Generate a mask for the upper triangle
    # (builtin bool: the np.bool alias is no longer available in NumPy)
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(45, 20))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio, explicitly on
    # the axes created above
    ax=sns.heatmap(corr, mask=mask, cmap=cmap, vmax=0.5, center=0,
                square=True, linewidths=1, cbar_kws={"shrink": .5}, ax=ax)
    ax.figure.suptitle('correlation between features')

    # saving figure
    plt_path_name='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/output/EDA/features_heatmap'
    plt_path_name=plt_path_name+fig_name
    ax.figure.savefig(fname=plt_path_name)