In [1]:
import pandas as pd
from tqdm import tqdm
#from typing import re

In [2]:
# Source: the full biobank export CSV that the filtering step reads from.
input_csv = '../../biobank/ukb672220.csv'
# Destination: the reduced CSV written by the filtering step.
output_csv = 'filtered_output.csv'

In [3]:
# nrows=0 parses only the header line, so the column count is available
# without loading any data rows.
df = pd.read_csv(input_csv, nrows=0)
df.shape[1]

18433

In [4]:
# Display the column labels of `df` to see how fields are named.
df.columns

Index(['eid', '3-0.0', '3-1.0', '3-2.0', '3-3.0', '4-0.0', '4-1.0', '4-2.0',
       '4-3.0', '5-0.0',
       ...
       '132596-0.0', '132597-0.0', '132598-0.0', '132599-0.0', '132600-0.0',
       '132601-0.0', '132602-0.0', '132603-0.0', '132604-0.0', '132605-0.0'],
      dtype='object', length=18433)

In [4]:
# Columns kept from the full export (labels taken from the original note):
#   '131366-0.0' - I63 first date
#   '21022-0.0'  - age at recruitment
#   '34-0.0'     - year of birth
#   '31-0.0'     - sex (gender)
desired_cols = ['eid', '131366-0.0', '21022-0.0', '34-0.0', '31-0.0']
# Rows with a missing value in this column are dropped.
col_shoudnt_be_NaN = '131366-0.0'

# Stream the CSV in chunks of `chunksize` rows so the whole file is never held
# in memory; each chunk is reduced to the rows that have the required value.
chunksize = 1000
print("Starting")

filtered_chunks = [
    part[part[col_shoudnt_be_NaN].notna()]
    for part in tqdm(
        pd.read_csv(
            input_csv,
            usecols=desired_cols,
            chunksize=chunksize,
            low_memory=False,
        ),
        desc='Processing',
        unit=' chunk',
    )
]

# Stitch the surviving rows together and persist them.
filtered_df = pd.concat(filtered_chunks, ignore_index=True)
filtered_df.to_csv(output_csv, index=False)

# The result is on disk; release the in-memory copies.
del filtered_df, filtered_chunks

Starting


Processing: 503 chunk [18:32,  2.21s/ chunk]


In [5]:
# Reload the filtered rows from `output_csv` and display them.
df = pd.read_csv(output_csv)
df

Unnamed: 0,eid,31-0.0,34-0.0,21022-0.0,131366-0.0
0,1000157,1,1945.0,63.0,2006-07-11
1,1000658,0,1943.0,66.0,2011-11-24
2,1002403,1,1943.0,64.0,1998-02-14
3,1002542,1,1942.0,65.0,2018-04-20
4,1002633,0,1946.0,63.0,2010-08-22
...,...,...,...,...,...
10826,6021252,1,1943.0,65.0,2016-10-28
10827,6021719,1,1952.0,55.0,2003-09-17
10828,6021899,1,1957.0,52.0,2012-02-27
10829,6023669,1,1957.0,52.0,2010-12-05


In [6]:
def calculate_age_at_diag_cerebral_infraction(row):
    """Return the age at diagnosis as a difference of calendar years.

    Only the year of the diagnosis date is used (month and day are ignored),
    so the result is either the exact age on that date or one year more.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            '34-0.0' (year of birth) and '131366-0.0' (diagnosis date in a
            form ``pd.to_datetime`` can parse).

    Returns:
        int: Diagnosis year minus birth year.
    """
    year_of_birth = int(row['34-0.0'])
    diagnosis_date = pd.to_datetime(row['131366-0.0'])
    return diagnosis_date.year - year_of_birth

def check_if_diag_cerebral_infraction_before_recruit(row):
    """Tell whether the diagnosis age is strictly below the recruitment age.

    The diagnosis age comes from calculate_age_at_diag_cerebral_infraction
    (a year difference), so a diagnosis at the same whole-year age as
    recruitment is reported as not before recruitment.

    Args:
        row: Mapping-like record providing '21022-0.0' (age at recruitment)
            plus the fields calculate_age_at_diag_cerebral_infraction reads.

    Returns:
        Truthy when the diagnosis age is less than the recruitment age.
    """
    recruitment_age = row['21022-0.0']
    diagnosis_age = calculate_age_at_diag_cerebral_infraction(row)
    return diagnosis_age < recruitment_age


In [7]:
# Derived per-row columns, added to `df` in this order:
#   age_dci                - age (age_) at diagnosis (d) of cerebral (c) infraction (i)
#   diag_ci_before_recruit - whether that age is below the age at recruitment
derived_columns = {
    'age_dci': calculate_age_at_diag_cerebral_infraction,
    'diag_ci_before_recruit': check_if_diag_cerebral_infraction_before_recruit,
}
for column_name, row_function in derived_columns.items():
    df[column_name] = df.apply(row_function, axis=1)
df

Unnamed: 0,eid,31-0.0,34-0.0,21022-0.0,131366-0.0,age_dci,diag_ci_before_recruit
0,1000157,1,1945.0,63.0,2006-07-11,61,True
1,1000658,0,1943.0,66.0,2011-11-24,68,False
2,1002403,1,1943.0,64.0,1998-02-14,55,True
3,1002542,1,1942.0,65.0,2018-04-20,76,False
4,1002633,0,1946.0,63.0,2010-08-22,64,False
...,...,...,...,...,...,...,...
10826,6021252,1,1943.0,65.0,2016-10-28,73,False
10827,6021719,1,1952.0,55.0,2003-09-17,51,True
10828,6021899,1,1957.0,52.0,2012-02-27,55,False
10829,6023669,1,1957.0,52.0,2010-12-05,53,False


In [8]:
# Persist the frame, including the derived columns, without the row index.
df.to_csv('Cerebral Infraction Data.csv', index=False)

In [2]:
# Load the saved per-patient table from disk and display it.
df = pd.read_csv('Cerebral Infraction Data.csv')
df

Unnamed: 0,eid,31-0.0,34-0.0,21022-0.0,131366-0.0,age_dci,diag_ci_before_recruit
0,1000157,1,1945.0,63.0,2006-07-11,61,True
1,1000658,0,1943.0,66.0,2011-11-24,68,False
2,1002403,1,1943.0,64.0,1998-02-14,55,True
3,1002542,1,1942.0,65.0,2018-04-20,76,False
4,1002633,0,1946.0,63.0,2010-08-22,64,False
...,...,...,...,...,...,...,...
10826,6021252,1,1943.0,65.0,2016-10-28,73,False
10827,6021719,1,1952.0,55.0,2003-09-17,51,True
10828,6021899,1,1957.0,52.0,2012-02-27,55,False
10829,6023669,1,1957.0,52.0,2010-12-05,53,False


In [3]:
# Histogram of `age_dci` (age at diagnosis) per age bin.
#
# Bin rules, unchanged from the original if/elif chain:
#   * age < 10          -> '0-10'
#   * 10 <= age <= 15   -> '10-15'
#   * every later bin   -> (previous upper bound, own upper bound]
#   * age > 90          -> 'Over90'
# A missing (NaN) age matches no bin and is not counted.
#
# Fix: the 15 < age <= 20 branch used to increment the key '10-20', which does
# not exist in the dictionary and raised KeyError for any age in that range.
# It now increments the declared '15-20' bin.
age_bin_upper_bounds = {
    '10-15': 15,
    '15-20': 20,
    '21-25': 25,
    '26-30': 30,
    '31-35': 35,
    '36-40': 40,
    '41-45': 45,
    '46-50': 50,
    '51-55': 55,
    '56-60': 60,
    '61-65': 65,
    '66-70': 70,
    '71-75': 75,
    '76-80': 80,
    '81-85': 85,
    '86-90': 90,
}

# Same keys, in the same order, as the original hand-written dictionary.
ages_at_cerebral_infraction = {
    '0-10': 0,
    **dict.fromkeys(age_bin_upper_bounds, 0),
    'Over90': 0,
}

for age in df['age_dci']:
    if age < 10:
        bin_label = '0-10'
    elif age > 90:
        bin_label = 'Over90'
    else:
        # First bin whose inclusive upper bound covers the age. The bounds are
        # ascending, so this is the narrowest match; None when nothing matches
        # (only possible for NaN).
        bin_label = next(
            (label for label, upper in age_bin_upper_bounds.items() if age <= upper),
            None,
        )
    if bin_label is not None:
        ages_at_cerebral_infraction[bin_label] += 1

ages_at_cerebral_infraction

{'0-10': 0,
 '10-15': 1,
 '15-20': 0,
 '21-25': 2,
 '26-30': 5,
 '31-35': 14,
 '36-40': 42,
 '41-45': 138,
 '46-50': 300,
 '51-55': 618,
 '56-60': 977,
 '61-65': 1481,
 '66-70': 2144,
 '71-75': 2641,
 '76-80': 1998,
 '81-85': 470,
 '86-90': 0,
 'Over90': 0}

In [4]:
# Sex breakdown of the patients in `df`. In column '31-0.0' this notebook
# counts the value 0 as women and the value 1 as men.
nof_all_patients = df.shape[0]
nof_women_patients = len(df[df['31-0.0'] == 0])  # patients coded 0 (women)
nof_men_patients = len(df[df['31-0.0'] == 1])    # patients coded 1 (men)
print("Number of women which diagnosed with cerebral infraction is: ", nof_women_patients)
percentage_women = (nof_women_patients / nof_all_patients) * 100
print(f'Percentage of women: {percentage_women:.2f}%')
print("Number of men which diagnosed with cerebral infraction is: ", nof_men_patients)
percentage_men = (nof_men_patients / nof_all_patients) * 100
# Fix: this line reports the men's share; it was previously labelled "women".
print(f'Percentage of men: {percentage_men:.2f}%')

Number of women which diagnosed with cerebral infraction is:  4270
Percentage of women: 39.42%
Number of men which diagnosed with cerebral infraction is:  6561
Percentage of women: 60.58%


Investigate Cerebral Infarction Death Cause Data:

In [5]:
# Load the tab-separated death-cause table, keeping only the participant id
# and the ICD-10 cause code.
death_cause_df = pd.read_csv(
    r'../../biobank/death_cause.txt',
    sep='\t',
    usecols=['eid', 'cause_icd10'],
)

# Keep records whose cause code contains "I63"; missing codes count as no match.
has_i63_cause = death_cause_df['cause_icd10'].str.contains("I63", na=False)
death_cause_I63_df = death_cause_df[has_i63_cause]
death_cause_I63_df

Unnamed: 0,eid,cause_icd10
87,1003252,I635
115,1004155,I639
445,1019681,I639
716,1029873,I639
1222,1051150,I634
...,...,...
110456,5916506,I639
111607,5967615,I639
112216,5995176,I639
112396,6001688,I639


In [6]:
# Flag patients that have no I63 death-cause record.
# NOTE: despite its name, `is_alive` is True for every eid that is absent from
# death_cause_I63_df, which includes patients whose recorded cause of death is
# not an I63 code; False means "has an I63 death-cause record".
# A vectorised membership test replaces the per-row `eid not in <array>` scan,
# which walked the whole eid array once for every patient.
df['is_alive'] = ~df['eid'].isin(death_cause_I63_df['eid'])

In [8]:
# Count the patients flagged is_alive == False, overall and split by the sex
# column ('31-0.0': 1 is reported as men, 0 as women).
died_from_ci = df['is_alive'] == False
is_man = df['31-0.0'] == 1
is_woman = df['31-0.0'] == 0

print("Total amount of death from cerebral infraction is: ", len(df[died_from_ci]))
print("Amount of Men death from cerebral infraction is: ", len(df[is_man & died_from_ci]))
print("Amount of Women death from cerebral infraction is: ", len(df[is_woman & died_from_ci]))

Total amount of death from cerebral infraction is:  420
Amount of Men death from cerebral infraction is:  259
Amount of Women death from cerebral infraction is:  161


In [9]:
# Keep patients whose diagnosis was not flagged as preceding recruitment, then
# export just their participant ids.
not_before_recruit = df['diag_ci_before_recruit'] == False
relevant_ci_df = df[not_before_recruit]
ci_eid_df = relevant_ci_df['eid'].to_frame()
ci_eid_df.to_csv(r'Cerebral Infraction Diagnosed EIDs.csv', index=False)

In [10]:
# Read the exported eid list from disk and display it.
ci_eid_df = pd.read_csv(r'Cerebral Infraction Diagnosed EIDs.csv')
ci_eid_df

Unnamed: 0,eid
0,1000658
1,1002542
2,1002633
3,1003228
4,1003252
...,...
9415,6019448
9416,6021252
9417,6021899
9418,6023669
