In [1]:
import pandas as pd
import numpy as np
import pydicom 
import os
import matplotlib.pyplot as plt

In [2]:
# Load the training, test and sample-submission tables from the current
# working directory (all three paths are relative).
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# keep=False discards *every* row whose (Patient, Weeks) pair occurs more than
# once, not just the extra copies (the original note counts 14 duplicates).
train = train.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)

In [4]:
# Distinct patient identifiers, in order of first appearance in `train`.
patients = train['Patient'].unique()

In [5]:
# Earliest recorded week for each patient, broadcast onto every row of that patient.
train['FirstWeek'] = train.groupby('Patient')['Weeks'].transform('min')

In [6]:
# Preview the first rows to check the new FirstWeek column.
train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FirstWeek
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,-4
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,-4
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,-4
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,-4
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,-4


In [7]:
# Baseline FVC per patient: the FVC value on the row whose week equals the
# patient's FirstWeek, reduced to one row per patient.
first_fvc = (
    train
    .loc[lambda frame: frame['Weeks'] == frame['FirstWeek'], ['Patient', 'FVC']]
    .groupby('Patient', as_index=False)
    .first()
    .rename(columns={'FVC': 'FirstFVC'})
)

# Left join so that every row of a patient carries that patient's baseline value.
train = train.merge(first_fvc, how='left', on='Patient')

In [8]:
# Preview the first rows to check the merged FirstFVC column.
train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FirstWeek,FirstFVC
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,-4,2315
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,-4,2315
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,-4,2315
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,-4,2315
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,-4,2315


In [9]:
# Summarise column dtypes and non-null counts after the merge.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1535 entries, 0 to 1534
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        1535 non-null   object 
 1   Weeks          1535 non-null   int64  
 2   FVC            1535 non-null   int64  
 3   Percent        1535 non-null   float64
 4   Age            1535 non-null   int64  
 5   Sex            1535 non-null   object 
 6   SmokingStatus  1535 non-null   object 
 7   FirstWeek      1535 non-null   int64  
 8   FirstFVC       1535 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 119.9+ KB


In [10]:
def calculate_height(row):
    """Derive a height estimate from a record's baseline FVC, age and sex.

    The result is ``FirstFVC / (intercept - slope * Age)``, where the
    (intercept, slope) pair is chosen by the ``Sex`` field.

    NOTE(review): the coefficients look like a sex-specific vital-capacity
    regression of the form FVC = height * (a - b * age) -- confirm the source
    and the units of the result.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the keys
            'Sex', 'Age' and 'FirstFVC'.

    Returns:
        The estimated height as a float.
    """
    # Only the exact string 'Male' selects the first pair; every other value
    # (including unexpected ones) falls through to the second pair.
    intercept, age_slope = (27.63, 0.112) if row['Sex'] == 'Male' else (21.78, 0.101)
    return row['FirstFVC'] / (intercept - age_slope * row['Age'])

# Row-wise pass: calculate_height reads Sex, Age and FirstFVC from each record.
train['Height'] = train.apply(calculate_height, axis='columns')

In [11]:
# Replace Sex and SmokingStatus with indicator columns. drop_first=True omits
# the first category of each variable, which then acts as the implicit baseline.
train = pd.concat(
    [train] + [
        pd.get_dummies(train[categorical], drop_first=True)
        for categorical in ('Sex', 'SmokingStatus')
    ],
    axis='columns',
).drop(columns=['Sex', 'SmokingStatus'])

In [12]:
# Preview the first rows to check the encoded indicator columns.
train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,FirstWeek,FirstFVC,Height,Male,Ex-smoker,Never smoked
0,ID00007637202177411956430,-4,2315,58.253649,79,-4,2315,123.256309,1,1,0
1,ID00007637202177411956430,5,2214,55.712129,79,-4,2315,123.256309,1,1,0
2,ID00007637202177411956430,7,2061,51.862104,79,-4,2315,123.256309,1,1,0
3,ID00007637202177411956430,9,2144,53.950679,79,-4,2315,123.256309,1,1,0
4,ID00007637202177411956430,11,2069,52.063412,79,-4,2315,123.256309,1,1,0


In [13]:
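# NOTE(review): disabled min-max scaling experiment. It references `all_data`
# and a `WeeksPassed` column, neither of which is defined in the cells shown here.
# def scale_feature(series):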
#     return (series - series.min()) / (series.max() - series.min())

# all_data['Percent'] = scale_feature(all_data['Percent'])
# all_data['Age'] = scale_feature(all_data['Age'])
# all_data['FirstWeek'] = scale_feature(all_data['FirstWeek'])
# all_data['FirstFVC'] = scale_feature(all_data['FirstFVC'])
# all_data['WeeksPassed'] = scale_feature(all_data['WeeksPassed'])
# all_data['Height'] = scale_feature(all_data['Height'])

In [14]:
# Per-patient columns used as model features (the Patient id is not included).
feature_columns = [
    'Percent',
    'Age',
    'FirstWeek',
    'FirstFVC',
    'Height',
    'Male',
    'Ex-smoker',
    'Never smoked',
]

In [15]:
# Export one row per patient, taken from the row at that patient's earliest
# recorded week, to features.csv with Patient written as the index column.
# The column list is built from `feature_columns` rather than re-typed here, so
# the export cannot drift from the feature definition; the row mask and column
# selection go through a single .loc call instead of chained indexing.
(
    train
    .loc[train['Weeks'] == train['FirstWeek'], ['Patient'] + feature_columns]
    .groupby('Patient')
    .first()
    .to_csv('features.csv')
)

In [16]:
# Read the per-patient feature table back from disk; Patient comes back as an
# ordinary column rather than as the index.
features = pd.read_csv('features.csv')
# Display the reloaded frame.
features

Unnamed: 0,Patient,Percent,Age,FirstWeek,FirstFVC,Height,Male,Ex-smoker,Never smoked
0,ID00007637202177411956430,58.253649,79,-4,2315,123.256309,1,1,0
1,ID00009637202177434476278,85.282878,69,8,3660,183.901115,1,1,0
2,ID00010637202177584971671,94.724672,60,0,3523,168.483979,1,1,0
3,ID00011637202177653955184,85.987590,72,6,3326,169.988756,1,1,0
4,ID00012637202177665765362,93.726006,65,33,3418,167.960688,1,0,1
...,...,...,...,...,...,...,...,...,...
171,ID00419637202311204720264,70.186855,73,6,3020,155.237997,1,1,0
172,ID00421637202311550012437,82.045291,68,15,2739,136.854202,1,1,0
173,ID00422637202311677017371,76.672493,73,6,1930,99.208389,1,1,0
174,ID00423637202312137826377,79.258903,72,17,3294,168.353266,1,1,0


In [17]:
# Summarise dtypes and non-null counts of the reloaded feature table.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Patient       176 non-null    object 
 1   Percent       176 non-null    float64
 2   Age           176 non-null    int64  
 3   FirstWeek     176 non-null    int64  
 4   FirstFVC      176 non-null    int64  
 5   Height        176 non-null    float64
 6   Male          176 non-null    int64  
 7   Ex-smoker     176 non-null    int64  
 8   Never smoked  176 non-null    int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 12.5+ KB
