In [1]:
#Cleaned data step by step and PERMANOVA trials


import pandas as pd
import seaborn as sns
import matplotlib 
from matplotlib import pyplot as plt
import numpy as np 
from scipy import stats 

In [2]:
from skbio.stats.distance import DistanceMatrix
from skbio.stats.distance import permanova
from skbio.diversity import beta_diversity 

In [3]:
wlb = pd.read_csv('Cities with the Best Work-Life Balance 2022.csv', delimiter= ',')

In [4]:
wlb.columns = wlb.columns.str.replace( ' ', '_').str.lower()

In [5]:
remote_job = wlb['remote_jobs']
remote_job = remote_job.str.replace('%', '', regex=False)
remote_job = remote_job.astype('float')
wlb['remote_jobs'] = remote_job

In [6]:
overwork = wlb['overworked_population']
overwork = overwork.str.replace('%', '', regex=False)
overwork = overwork.astype('float')
wlb['overworked_population'] = overwork

In [7]:
multi_jobs = wlb['multiple_jobholders']
multi_jobs = multi_jobs.str.replace('%', '', regex=False)
multi_jobs = multi_jobs.astype('float')
wlb['multiple_jobholders'] = multi_jobs

In [8]:
vacation_taken = wlb['vacations_taken_(days)']
vacation_taken = vacation_taken.replace('-', '0', regex=False)
vacation_taken = vacation_taken.astype('float')
wlb['vacations_taken_(days)'] = vacation_taken

In [9]:
parental_leave = wlb['paid_parental_leave_(days)']
parental_leave = parental_leave.str.replace(',', '.', regex=False)
parental_leave = parental_leave.astype('float')
wlb['paid_parental_leave_(days)'] = parental_leave

In [10]:
inflation_1 = wlb['inflation']
inflation_1 = inflation_1.str.replace('%', '', regex=False)
inflation_1 = inflation_1.astype('float')
wlb['inflation'] = inflation_1

In [11]:
wlb.drop(['2021', '2022', 'city'], axis=1, inplace=True)

In [12]:
heart = pd.read_csv('heart_attack_prediction_dataset.csv', delimiter= ',')

In [13]:
heart.columns = heart.columns.str.replace( ' ', '_').str.lower()

In [14]:
heart.drop(['patient_id', 'continent', 'hemisphere', 'blood_pressure'], axis=1, inplace=True)

In [15]:
merged_df = pd.merge( wlb, heart, on= 'country', how='left')

In [20]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11751 entries, 6 to 11824
Data columns (total 42 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   country                           11751 non-null  object 
 1   remote_jobs                       11751 non-null  float64
 2   overworked_population             11751 non-null  float64
 3   minimum_vacations_offered_(days)  11751 non-null  int64  
 4   vacations_taken_(days)            11751 non-null  float64
 5   unemployment                      11751 non-null  float64
 6   multiple_jobholders               11751 non-null  float64
 7   inflation                         11751 non-null  float64
 8   paid_parental_leave_(days)        11751 non-null  float64
 9   covid_impact                      11751 non-null  float64
 10  covid_support                     11751 non-null  float64
 11  healthcare                        11751 non-null  float64
 12  access_to

In [17]:
merged_df.dropna(inplace=True)

In [19]:
merged_df.isnull().sum()

country                             0
remote_jobs                         0
overworked_population               0
minimum_vacations_offered_(days)    0
vacations_taken_(days)              0
unemployment                        0
multiple_jobholders                 0
inflation                           0
paid_parental_leave_(days)          0
covid_impact                        0
covid_support                       0
healthcare                          0
access_to_mental_healthcare         0
inclusivity_&_tolerance             0
affordability                       0
happiness,_culture_&_leisure        0
city_safety                         0
outdoor_spaces                      0
air_quality                         0
wellness_and_fitness                0
total_score                         0
age                                 0
sex                                 0
cholesterol                         0
heart_rate                          0
diabetes                            0
family_histo

In [18]:
merged_df.shape

(11751, 42)

In [21]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11751 entries, 6 to 11824
Data columns (total 42 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   country                           11751 non-null  object 
 1   remote_jobs                       11751 non-null  float64
 2   overworked_population             11751 non-null  float64
 3   minimum_vacations_offered_(days)  11751 non-null  int64  
 4   vacations_taken_(days)            11751 non-null  float64
 5   unemployment                      11751 non-null  float64
 6   multiple_jobholders               11751 non-null  float64
 7   inflation                         11751 non-null  float64
 8   paid_parental_leave_(days)        11751 non-null  float64
 9   covid_impact                      11751 non-null  float64
 10  covid_support                     11751 non-null  float64
 11  healthcare                        11751 non-null  float64
 12  access_to

In [28]:
from scipy.stats import spearmanr, kendalltau
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [23]:
merged_numerical = merged_df.drop(columns=['country', 'sex', 'diet']) # Dropping categorical variables

In [29]:
merged_z = pd.DataFrame(scaler.fit_transform(merged_numerical), columns=merged_numerical.columns) # Z-score transformation (mean =0, std_dev =1, equal contributions for all features)

In [30]:
cor_merged_z = merged_z.corr()

In [31]:
cor_merged_z

Unnamed: 0,remote_jobs,overworked_population,minimum_vacations_offered_(days),vacations_taken_(days),unemployment,multiple_jobholders,inflation,paid_parental_leave_(days),covid_impact,covid_support,...,previous_heart_problems,medication_use,stress_level,sedentary_hours_per_day,income,bmi,triglycerides,physical_activity_days_per_week,sleep_hours_per_day,heart_attack_risk
remote_jobs,1.0,-0.419182,0.442254,0.25261,0.373653,0.145966,-0.296265,0.540334,0.282972,0.336208,...,0.009794,-0.021975,-0.004732,-0.003094,-0.006273,0.009339,-0.005301,-0.004808,0.008022,0.001001
overworked_population,-0.419182,1.0,0.050826,0.09161,-0.239558,-0.77914,-0.382839,0.071597,-0.010783,0.048484,...,-0.025104,0.022102,-0.002472,-0.006835,0.005738,-0.011864,0.01553,0.014993,-0.019,-0.009138
minimum_vacations_offered_(days),0.442254,0.050826,1.0,0.471107,-0.03918,-0.243034,-0.274524,0.162189,-0.194172,0.13353,...,-0.009999,0.003239,-0.023083,-0.006389,-0.010025,0.00322,0.006417,0.003671,-0.010405,-0.003862
vacations_taken_(days),0.25261,0.09161,0.471107,1.0,0.197353,-0.264757,-0.16522,0.574268,-0.059049,0.177828,...,-0.011577,-0.001292,-0.028737,-0.038728,-0.007107,-0.007005,-0.001209,0.01315,-0.02081,-0.005779
unemployment,0.373653,-0.239558,-0.03918,0.197353,1.0,0.169143,-0.131656,0.338419,0.809885,0.317232,...,0.005283,-0.015508,-0.002841,-0.003672,-0.004231,0.000372,0.004088,0.006772,0.007509,0.010238
multiple_jobholders,0.145966,-0.77914,-0.243034,-0.264757,0.169143,1.0,0.599175,-0.042107,0.061969,0.013207,...,0.01736,-0.023697,0.012861,0.008208,0.004996,-0.000759,-0.00255,-0.013957,0.014984,0.008722
inflation,-0.296265,-0.382839,-0.274524,-0.16522,-0.131656,0.599175,1.0,-0.315927,-0.306895,-0.348804,...,0.00163,0.003297,0.015781,0.005773,-0.00048,-0.005037,0.001469,-0.014423,-0.004471,0.004367
paid_parental_leave_(days),0.540334,0.071597,0.162189,0.574268,0.338419,-0.042107,-0.315927,1.0,0.214357,0.699339,...,-0.002568,-0.024177,-0.004769,-0.03571,0.000537,-0.004134,0.004749,0.006184,-0.016971,-0.007772
covid_impact,0.282972,-0.010783,-0.194172,-0.059049,0.809885,0.061969,-0.306895,0.214357,1.0,0.20051,...,0.005908,-0.009828,0.000689,0.013683,-0.00362,-0.00438,0.004615,0.013753,0.01273,0.010615
covid_support,0.336208,0.048484,0.13353,0.177828,0.317232,0.013207,-0.348804,0.699339,0.20051,1.0,...,-0.00156,-0.036986,0.004531,-0.028089,-0.001688,0.009376,0.005625,0.004712,-0.011007,0.0005


In [32]:
cor_spear_merged_z = merged_z.corr(method='spearman')

In [33]:
cor_spear_merged_z

Unnamed: 0,remote_jobs,overworked_population,minimum_vacations_offered_(days),vacations_taken_(days),unemployment,multiple_jobholders,inflation,paid_parental_leave_(days),covid_impact,covid_support,...,previous_heart_problems,medication_use,stress_level,sedentary_hours_per_day,income,bmi,triglycerides,physical_activity_days_per_week,sleep_hours_per_day,heart_attack_risk
remote_jobs,1.0,-0.528809,0.045324,-0.167047,0.322284,0.556464,-0.128558,0.322441,0.45439,0.166321,...,0.005454,-0.021953,0.006719,0.00143,-0.000783,0.014383,-0.002585,-0.015523,0.024163,0.012237
overworked_population,-0.528809,1.0,0.16879,0.259821,-0.054348,-0.842996,-0.505315,0.073826,-0.153029,0.004822,...,-0.025254,0.020227,-0.005874,-0.010552,0.004273,-0.0105,0.013722,0.01613,-0.020895,-0.009381
minimum_vacations_offered_(days),0.045324,0.16879,1.0,0.559315,-0.233924,-0.15851,-0.122573,0.292299,-0.310405,0.189889,...,-0.010624,0.009476,-0.026944,-0.008296,-0.009751,-0.001404,0.005793,0.009417,-0.011913,-0.004947
vacations_taken_(days),-0.167047,0.259821,0.559315,1.0,-0.493757,-0.333912,-0.111248,0.556893,-0.50994,0.224368,...,-0.01439,0.002847,-0.029139,-0.040756,-0.004194,-0.006573,-0.002835,0.012395,-0.020795,-0.007425
unemployment,0.322284,-0.054348,-0.233924,-0.493757,1.0,0.220396,-0.412075,0.05441,0.867998,0.161346,...,0.007816,-0.011969,0.008633,0.016684,0.00093,0.000674,0.007715,0.003741,0.012468,0.007423
multiple_jobholders,0.556464,-0.842996,-0.15851,-0.333912,0.220396,1.0,0.362235,0.04171,0.292444,0.11095,...,0.020595,-0.025984,0.009949,0.010313,0.005377,0.001829,-0.004432,-0.011045,0.021275,0.009095
inflation,-0.128558,-0.505315,-0.122573,-0.111248,-0.412075,0.362235,1.0,-0.574631,-0.22694,-0.543616,...,0.021997,0.008783,-0.003408,0.0223,-0.001376,-0.000366,-0.020664,-0.003647,0.01378,-0.001182
paid_parental_leave_(days),0.322441,0.073826,0.292299,0.556893,0.05441,0.04171,-0.574631,1.0,-0.026753,0.696663,...,-0.008027,-0.024837,-0.008896,-0.038656,0.001502,-0.002364,0.007048,0.002661,-0.015152,-0.005398
covid_impact,0.45439,-0.153029,-0.310405,-0.50994,0.867998,0.292444,-0.22694,-0.026753,1.0,0.018105,...,0.010952,-0.01102,0.007207,0.02241,-0.00336,0.001066,0.001484,0.006734,0.019864,0.012012
covid_support,0.166321,0.004822,0.189889,0.224368,0.161346,0.11095,-0.543616,0.696663,0.018105,1.0,...,-0.004128,-0.036251,-0.00026,-0.030806,-0.006925,0.011473,0.006375,0.000647,-0.013992,0.002924


In [34]:
cor_kend_merged_z = merged_z.corr(method='kendall')

In [35]:
cor_kend_merged_z

Unnamed: 0,remote_jobs,overworked_population,minimum_vacations_offered_(days),vacations_taken_(days),unemployment,multiple_jobholders,inflation,paid_parental_leave_(days),covid_impact,covid_support,...,previous_heart_problems,medication_use,stress_level,sedentary_hours_per_day,income,bmi,triglycerides,physical_activity_days_per_week,sleep_hours_per_day,heart_attack_risk
remote_jobs,1.0,-0.437868,0.015994,-0.152303,0.273321,0.489954,-0.065587,0.196181,0.401635,0.080423,...,0.004783,-0.019254,0.005277,0.001197,-0.000495,0.010363,-0.001858,-0.011826,0.018404,0.010732
overworked_population,-0.437868,1.0,0.163056,0.250977,-0.039599,-0.671034,-0.442897,0.099483,-0.149197,0.038292,...,-0.021228,0.017003,-0.004411,-0.007496,0.002851,-0.007222,0.009482,0.011801,-0.015248,-0.007886
minimum_vacations_offered_(days),0.015994,0.163056,1.0,0.478224,-0.19675,-0.119486,-0.111042,0.245931,-0.268102,0.133897,...,-0.009878,0.008811,-0.021584,-0.00626,-0.007499,-0.001116,0.004367,0.007685,-0.009675,-0.0046
vacations_taken_(days),-0.152303,0.250977,0.478224,1.0,-0.423245,-0.301011,-0.116668,0.484326,-0.453747,0.205132,...,-0.012694,0.002511,-0.022235,-0.029519,-0.003086,-0.004714,-0.00202,0.009556,-0.015925,-0.00655
unemployment,0.273321,-0.039599,-0.19675,-0.423245,1.0,0.178969,-0.296341,0.018017,0.708034,0.09035,...,0.006539,-0.010014,0.006272,0.011498,0.000653,0.000455,0.005304,0.002664,0.009049,0.006211
multiple_jobholders,0.489954,-0.671034,-0.119486,-0.301011,0.178969,1.0,0.352522,-0.03944,0.2662,0.035818,...,0.017937,-0.022631,0.007522,0.007492,0.003909,0.001275,-0.003204,-0.008423,0.01611,0.007921
inflation,-0.065587,-0.442897,-0.111042,-0.116668,-0.296341,0.352522,1.0,-0.547771,-0.110569,-0.389173,...,0.019487,0.00778,-0.00259,0.016228,-0.000959,-0.00016,-0.015061,-0.00286,0.010647,-0.001047
paid_parental_leave_(days),0.196181,0.099483,0.245931,0.484326,0.018017,-0.03944,-0.547771,1.0,-0.084948,0.534952,...,-0.00704,-0.021783,-0.006766,-0.027878,0.000989,-0.001762,0.00525,0.002099,-0.011529,-0.004734
covid_impact,0.401635,-0.149197,-0.268102,-0.453747,0.708034,0.2662,-0.110569,-0.084948,1.0,-0.027571,...,0.009127,-0.009183,0.005433,0.015482,-0.002228,0.000725,0.000945,0.004817,0.0143,0.01001
covid_support,0.080423,0.038292,0.133897,0.205132,0.09035,0.035818,-0.389173,0.534952,-0.027571,1.0,...,-0.00362,-0.031794,-0.000407,-0.022204,-0.005181,0.008321,0.004465,0.000586,-0.010693,0.002565


In [22]:
#merged_df.to_csv('df_heart_wlb_glc', index=False) #

In [10]:
#health_2 = pd.read_csv('estat_hlth_hlye.tsv', delimiter= ' ')

In [11]:
#health_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 20 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   freq,unit,sex,indic_he,geo\TIME_PERIOD	2004  891 non-null    object 
 1   	2005                                        891 non-null    object 
 2   	2006                                        891 non-null    object 
 3   	2007                                        891 non-null    object 
 4   	2008                                        891 non-null    object 
 5   	2009                                        891 non-null    object 
 6   	2010                                        891 non-null    object 
 7   	2011                                        891 non-null    object 
 8   	2012                                        891 non-null    object 
 9   	2013                                        891 non-null    object 
 10  	2