In [1]:
import pandas as pd
import numpy as np
import pydicom 
import os
import matplotlib.pyplot as plt

In [2]:
# Load the three competition tables from the working directory, in order:
# training rows, test rows, and the sample submission template.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [3]:
# Remove every row whose (Patient, Weeks) pair occurs more than once.
# keep=False discards all members of a duplicated pair, not just the repeats
# (the original note counts 14 such duplicates).
train = train.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)

In [4]:
# Patient_Week has the form "<Patient>_<Weeks>" (e.g. "ID..._-12").
# Split on the LAST underscore only, so an underscore inside the patient id
# cannot shift the fields, and parse Weeks as an integer straight away so the
# column is numeric like the Weeks read from the CSV files rather than text.
patient_week_parts = submission['Patient_Week'].str.rsplit('_', n=1)
submission['Patient'] = patient_week_parts.str[0]
submission['Weeks'] = patient_week_parts.str[1].astype(int)

In [5]:
# Keep only the identifying columns and Confidence; every other column of the
# sample submission is discarded at this point.
submission_columns = ['Patient', 'Weeks', 'Confidence', 'Patient_Week']
submission = submission.loc[:, submission_columns]

In [6]:
# Attach the test-set columns to every submission row of the same patient.
# The test frame's own Weeks column is dropped first so it does not collide
# with the Weeks parsed from Patient_Week.
# NOTE(review): presumably test holds one row per patient; otherwise this
# inner join multiplies submission rows - confirm.
test_without_weeks = test.drop(columns=['Weeks'])
submission = submission.merge(test_without_weeks, how='inner', on='Patient')

In [7]:
# Preview the first five merged submission rows.
submission.head(n=5)

Unnamed: 0,Patient,Weeks,Confidence,Patient_Week,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,-12,100,ID00419637202311204720264_-12,3020,70.186855,73,Male,Ex-smoker
1,ID00419637202311204720264,-11,100,ID00419637202311204720264_-11,3020,70.186855,73,Male,Ex-smoker
2,ID00419637202311204720264,-10,100,ID00419637202311204720264_-10,3020,70.186855,73,Male,Ex-smoker
3,ID00419637202311204720264,-9,100,ID00419637202311204720264_-9,3020,70.186855,73,Male,Ex-smoker
4,ID00419637202311204720264,-8,100,ID00419637202311204720264_-8,3020,70.186855,73,Male,Ex-smoker


In [8]:
# Label every row with the table it came from so the frames can be stacked
# for shared feature engineering and separated again afterwards.
for frame, origin in ((train, 'train'), (test, 'test'), (submission, 'submission')):
    frame['Dataset'] = origin

In [9]:
# Stack train, test and submission rows into one frame with a fresh 0..n-1
# index. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same stacked frame, and ignore_index=True replaces
# the separate reset_index() + drop('index') steps.
all_data = pd.concat([train, test, submission], ignore_index=True)

In [10]:
# Distinct patient ids present in the training table, in order of appearance.
train_patients = train['Patient'].unique()

In [11]:
# FirstWeek = the earliest week recorded for each patient, taken over train
# and test rows only: submission rows are blanked out first so the weeks they
# enumerate cannot become the minimum.
from_submission = all_data['Dataset'] == 'submission'
recorded_weeks = all_data['Weeks'].mask(from_submission)
all_data['FirstWeek'] = recorded_weeks.groupby(all_data['Patient']).transform('min')

In [12]:
# For each patient, take the FVC from a row whose week equals that patient's
# FirstWeek (the first non-null value when several rows match) and broadcast
# it to all of the patient's rows as FirstFVC.
at_first_week = all_data['Weeks'] == all_data['FirstWeek']
first_fvc = (
    all_data.loc[at_first_week, ['Patient', 'FVC']]
    .rename(columns={'FVC': 'FirstFVC'})
    .groupby('Patient', as_index=False)
    .first()
)

# Left join keeps every row of all_data, including patients with no match.
all_data = all_data.merge(first_fvc, how='left', on='Patient')

In [13]:
# Make Weeks an integer column, then measure each row's offset from the
# patient's first recorded week.
all_data['Weeks'] = all_data['Weeks'].astype(int)
all_data['WeeksPassed'] = all_data['Weeks'].sub(all_data['FirstWeek'])

In [14]:
def calculate_height(row):
    """Derive a height-like value from a row's FirstFVC, Age and Sex.

    The result is ``FirstFVC / (intercept - slope * Age)``, with coefficients
    (27.63, 0.112) when ``row['Sex'] == 'Male'`` and (21.78, 0.101) for any
    other value of Sex.

    NOTE(review): this looks like the inversion of a linear reference
    equation of the form FVC = height * (a - b * age); the source of the
    coefficients and the units are not stated here - confirm.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'Sex', 'FirstFVC' and 'Age'.

    Returns
    -------
    The computed ratio for this row.
    """
    if row['Sex'] == 'Male':
        intercept, age_slope = 27.63, 0.112
    else:
        intercept, age_slope = 21.78, 0.101
    return row['FirstFVC'] / (intercept - age_slope * row['Age'])

# Evaluate calculate_height once per row and store the result as Height.
all_data['Height'] = all_data.apply(calculate_height, axis='columns')

In [15]:
# One-hot encode Sex and SmokingStatus, replacing the original text columns
# with one indicator column per category (appended after the other columns).
# dtype is pinned to uint8, the get_dummies default before pandas 2.0, so the
# indicators stay numeric 0/1 instead of becoming booleans on newer pandas.
sex_indicators = pd.get_dummies(all_data['Sex'], dtype=np.uint8)
smoking_indicators = pd.get_dummies(all_data['SmokingStatus'], dtype=np.uint8)

all_data = pd.concat(
    [
        all_data.drop(columns=['Sex', 'SmokingStatus']),
        sex_indicators,
        smoking_indicators,
    ],
    axis=1,
)

In [16]:
def scale_feature(series):
    """Min-max scale a numeric series onto the [0, 1] interval.

    Each value is mapped to ``(value - min) / (max - min)``, where min and max
    are taken over ``series`` itself. Missing values stay missing.

    A constant series has zero range; instead of dividing 0 by 0 (which would
    silently turn the whole column into NaN), the range is treated as 1 so
    every value scales to 0.0.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        The rescaled values, aligned with the input index.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Constant column: avoid 0/0 -> NaN for every row.
        value_range = 1
    return (series - lowest) / value_range

# Min-max scale each numeric model input in place. The min and max come from
# the combined frame, so train, test and submission rows share one scale.
for column in ('Percent', 'Age', 'FirstWeek', 'FirstFVC', 'WeeksPassed', 'Height'):
    all_data[column] = scale_feature(all_data[column])

In [17]:
# Columns fed to the model, in a fixed order.
feature_columns = [
    # min-max scaled numeric inputs
    'Percent',
    'Age',
    'FirstWeek',
    'FirstFVC',
    'WeeksPassed',
    'Height',
    # one-hot indicators for Sex
    'Female',
    'Male',
    # one-hot indicators for SmokingStatus
    'Currently smokes',
    'Ex-smoker',
    'Never smoked',
]

In [18]:
# Split the engineered frame back into its three source tables using the
# Dataset tag; each result keeps the row labels it had in all_data.
dataset_tag = all_data['Dataset']
train = all_data[dataset_tag == 'train']
test = all_data[dataset_tag == 'test']
submission = all_data[dataset_tag == 'submission']

In [19]:
# Preview the model inputs for the first five training rows.
train.loc[:, feature_columns].head()

Unnamed: 0,Percent,Age,FirstWeek,FirstFVC,WeeksPassed,Height,Female,Male,Currently smokes,Ex-smoker,Never smoked
0,0.236393,0.769231,0.011905,0.241456,0.179012,0.211783,0,1,0,1,0
1,0.215941,0.769231,0.011905,0.241456,0.234568,0.211783,0,1,0,1,0
2,0.18496,0.769231,0.011905,0.241456,0.246914,0.211783,0,1,0,1,0
3,0.201767,0.769231,0.011905,0.241456,0.259259,0.211783,0,1,0,1,0
4,0.18658,0.769231,0.011905,0.241456,0.271605,0.211783,0,1,0,1,0
