In [1]:
import pandas as pd
import numpy as np
import pydicom 
import os
import matplotlib.pyplot as plt

In [2]:
# Load the three competition tables from the working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [3]:
# There are 14 duplicated (Patient, Weeks) rows; keep=False discards every row
# of a duplicated pair rather than keeping one representative.
train = train.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)

In [4]:
# Split each Patient_Week label on '_' once: piece 0 becomes Patient, piece 1
# becomes Weeks. Weeks is still a string at this point.
patient_week_parts = [label.split('_') for label in submission.Patient_Week]
submission['Patient'] = [parts[0] for parts in patient_week_parts]
submission['Weeks'] = [parts[1] for parts in patient_week_parts]

In [5]:
# Keep only the identifying columns and Confidence, in this order; every other
# column of the submission template is dropped.
kept_columns = ['Patient', 'Weeks', 'Confidence', 'Patient_Week']
submission = submission[kept_columns]

In [6]:
# Attach each patient's test-set columns to the submission rows. The test
# frame's own Weeks column is removed first so it does not clash with the
# Weeks parsed from Patient_Week.
test_without_weeks = test.drop(columns='Weeks')
submission = pd.merge(submission, test_without_weeks, on='Patient')

In [7]:
# Tag every row with the frame it came from so the frames can be separated
# again after being stacked together.
for frame, label in ((train, 'train'), (test, 'test'), (submission, 'submission')):
    frame['Dataset'] = label

In [8]:
# Stack train, test and submission rows into one table with a fresh 0..n-1 index.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat`
# is the supported equivalent, and `ignore_index=True` replaces the former
# reset_index() / drop(columns=['index']) pair.
all_data = pd.concat([train, test, submission], ignore_index=True)

In [9]:
# Distinct patient ids present in the training frame, in order of first appearance.
train_patients = train['Patient'].unique()

In [10]:
# FirstWeek = smallest Weeks value per patient, computed over train/test rows
# only: rows tagged 'submission' are blanked out (NaN) before taking the minimum.
non_submission_weeks = all_data['Weeks'].mask(all_data['Dataset'] == 'submission')
all_data['FirstWeek'] = non_submission_weeks.groupby(all_data['Patient']).transform('min')

In [11]:
# FirstFVC = the FVC recorded on the row whose Weeks equals the patient's FirstWeek.
# If several rows qualify for a patient, the first one encountered wins.
is_first_week_row = all_data['Weeks'] == all_data['FirstWeek']
first_week_rows = all_data.loc[is_first_week_row, ['Patient', 'FVC']]
first_fvc = (
    first_week_rows
    .rename(columns={'FVC': 'FirstFVC'})
    .groupby('Patient')
    .first()
    .reset_index()
)

# Left join so patients without a matching row keep their rows (FirstFVC = NaN).
all_data = all_data.merge(first_fvc, how='left', on='Patient')

In [12]:
# Submission rows carry Weeks as strings (parsed from Patient_Week), so cast the
# whole column to int before doing arithmetic with it.
all_data['Weeks'] = all_data['Weeks'].astype(int)
# Offset of each row from the patient's first train/test week.
all_data['WeeksPassed'] = all_data['Weeks'].sub(all_data['FirstWeek'])

In [13]:
def calculate_height(row):
    """Return a height estimate derived from a row's first FVC, age and sex.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or dict)
        Must provide the keys 'Sex', 'Age' and 'FirstFVC'.

    Returns
    -------
    ``FirstFVC / (27.63 - 0.112 * Age)`` when Sex is 'Male', otherwise
    ``FirstFVC / (21.78 - 0.101 * Age)``.
    """
    # (intercept, age slope) of the divisor; any Sex value other than 'Male'
    # falls through to the second pair.
    intercept, age_slope = (27.63, 0.112) if row['Sex'] == 'Male' else (21.78, 0.101)
    return row['FirstFVC'] / (intercept - age_slope * row['Age'])

# Evaluate calculate_height once per row and store the result as a new column.
estimated_height = all_data.apply(calculate_height, axis='columns')
all_data['Height'] = estimated_height

In [14]:
# One-hot encode the two categorical columns; the indicator columns are named
# after the category values themselves (no prefix).
sex_indicators = pd.get_dummies(all_data['Sex'])
smoking_indicators = pd.get_dummies(all_data['SmokingStatus'])

# Append the indicators side by side, then drop the original categorical columns.
all_data = (
    pd.concat([all_data, sex_indicators, smoking_indicators], axis=1)
    .drop(columns=['Sex', 'SmokingStatus'])
)

In [15]:
def scale_feature(series):
    """Min-max scale ``series`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    series : pandas.Series (or any object with min/max and elementwise arithmetic)
        Values to rescale. NaN entries stay NaN.

    Returns
    -------
    ``(series - min) / (max - min)``. When every value is identical the range is
    zero; instead of dividing 0 by 0 (which would turn the whole column into
    NaN) the result is all zeros.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Constant column: avoid 0/0 and return zeros as floats, matching the
        # float output of the normal path.
        return (series - lowest).astype(float)
    return (series - lowest) / value_range

# Rescale each continuous feature with scale_feature. The min and max come from
# the whole of all_data, i.e. train, test and submission rows together.
for column in ('Percent', 'Age', 'FirstWeek', 'FirstFVC', 'WeeksPassed', 'Height'):
    all_data[column] = scale_feature(all_data[column])

In [16]:
# Columns used as model inputs.
feature_columns = [
    # continuous features, rescaled with scale_feature
    'Percent',
    'Age',
    'FirstWeek',
    'FirstFVC',
    'WeeksPassed',
    'Height',
    # one-hot indicators for Sex
    'Female',
    'Male',
    # one-hot indicators for SmokingStatus
    'Currently smokes',
    'Ex-smoker',
    'Never smoked',
]

In [17]:
# Split the engineered table back into its three parts using the Dataset tag.
# Note: this rebinds train / test / submission to slices of all_data.
train, test, submission = (
    all_data[all_data['Dataset'] == label]
    for label in ('train', 'test', 'submission')
)

In [18]:
# Preview the model inputs for the first five training rows.
train.head()[feature_columns]

Unnamed: 0,Percent,Age,FirstWeek,FirstFVC,WeeksPassed,Height,Female,Male,Currently smokes,Ex-smoker,Never smoked
0,0.236393,0.769231,0.011905,0.241456,0.179012,0.211783,0,1,0,1,0
1,0.215941,0.769231,0.011905,0.241456,0.234568,0.211783,0,1,0,1,0
2,0.18496,0.769231,0.011905,0.241456,0.246914,0.211783,0,1,0,1,0
3,0.201767,0.769231,0.011905,0.241456,0.259259,0.211783,0,1,0,1,0
4,0.18658,0.769231,0.011905,0.241456,0.271605,0.211783,0,1,0,1,0


In [19]:
# Distinct (scaled) FirstWeek values that occur in the training rows.
train['FirstWeek'].unique()

array([0.01190476, 0.1547619 , 0.05952381, 0.13095238, 0.45238095,
       0.20238095, 0.21428571, 0.27380952, 0.02380952, 0.54761905,
       0.28571429, 0.41666667, 0.19047619, 0.07142857, 0.08333333,
       0.04761905, 0.63095238, 0.25      , 0.33333333, 0.58333333,
       0.10714286, 0.23809524, 0.0952381 , 0.35714286, 0.11904762,
       0.        , 0.16666667, 0.26190476, 0.29761905, 0.69047619,
       0.61904762, 0.17857143, 0.46428571, 0.80952381, 0.03571429,
       0.42857143, 0.22619048, 1.        , 0.14285714, 0.51190476,
       0.47619048, 0.52380952, 0.4047619 , 0.30952381, 0.32142857,
       0.36904762, 0.39285714, 0.3452381 , 0.5       , 0.55952381])

In [20]:
# Training patient ids as a plain Python list, in order of first appearance.
patients = list(train['Patient'].unique())

In [21]:
# Print the smallest Weeks value of every training patient, one per line, in
# order of first appearance (sort=False keeps the same order as
# train.Patient.unique()). A single grouped pass replaces one full-frame
# boolean filter per patient.
for first_week in train.groupby('Patient', sort=False)['Weeks'].min():
    print(first_week)

-4
8
0
6
33
0
12
13
18
-3
0
41
12
19
30
11
6
1
2
11
-1
48
16
23
44
4
13
15
3
25
-4
5
0
19
-5
9
17
-1
0
44
3
20
44
11
53
15
9
15
4
2
-4
48
47
10
18
44
34
0
11
63
6
-2
2
31
3
14
16
16
44
8
79
7
7
12
38
35
11
9
39
29
8
4
4
9
0
7
20
33
0
6
3
10
20
0
11
-1
0
13
14
23
9
-3
3
34
8
2
4
14
34
21
1
13
3
4
48
6
8
0
4
21
4
2
5
5
4
22
8
3
3
0
26
12
3
26
39
28
3
6
29
0
-1
28
23
23
0
14
24
4
3
37
35
35
7
12
6
39
25
5
48
0
5
0
11
29
42
0
21
13
7
7
8
6
15
6
17
0


In [22]:
# Empty table with one row per training patient and one column per feature;
# every cell starts out as NaN.
initial_data = pd.DataFrame(columns=feature_columns, index=patients)

In [23]:
# Copy the feature values of each patient's first training row into initial_data.
for ind in initial_data.index:
    # Filter train once per patient instead of once per (patient, feature)
    # cell; only the first matching row is ever read.
    first_row = train[train.Patient == ind].head(1)
    for col in feature_columns:
        initial_data.loc[ind, col] = first_row[col].tolist()[0]

In [24]:
# Display the per-patient table of first-row feature values.
initial_data

Unnamed: 0,Percent,Age,FirstWeek,FirstFVC,WeeksPassed,Height,Female,Male,Currently smokes,Ex-smoker,Never smoked
ID00007637202177411956430,0.236393,0.769231,0.0119048,0.241456,0.179012,0.211783,0,1,0,1,0
ID00009637202177434476278,0.453901,0.512821,0.154762,0.49127,0.179012,0.448506,0,1,0,1,0
ID00010637202177584971671,0.529881,0.282051,0.0595238,0.465825,0.179012,0.388326,0,1,0,1,0
ID00011637202177653955184,0.459572,0.589744,0.130952,0.429235,0.179012,0.3942,0,1,0,1,0
ID00012637202177665765362,0.521844,0.410256,0.452381,0.446322,0.179012,0.386284,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
ID00419637202311204720264,0.332421,0.615385,0.130952,0.3724,0.179012,0.336622,0,1,0,1,0
ID00421637202311550012437,0.427848,0.487179,0.238095,0.320208,0.179012,0.264862,0,1,0,1,0
ID00422637202311677017371,0.384612,0.615385,0.130952,0.169948,0.179012,0.117914,0,1,0,1,0
ID00423637202312137826377,0.405425,0.589744,0.261905,0.423291,0.179012,0.387816,0,1,0,1,0


In [26]:
# Re-read the raw test file: the name `test` was rebound earlier to a slice of
# all_data, so the original columns (Sex, SmokingStatus, unscaled values) are
# only available from disk.
test_original = pd.read_csv("test.csv")

In [27]:
# Display the raw test table as loaded from test.csv.
test_original

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked
