In [None]:
import os

def scale_input_data(scale_factor):
  """Write a row-scaled copy of each input CSV next to the original.

  For each of ``./input/test.csv`` and ``./input/train.csv`` a sibling file
  ``<name>.scaled.csv`` is written:

  * ``scale_factor == 1.0``: the file is copied unchanged.
  * ``scale_factor < 1.0``: only the first ``int(scale_factor * n)`` rows
    are kept (``n`` is the number of rows in the source file).
  * ``scale_factor > 1.0``: the rows are repeated cyclically, in their
    original order, until ``int(scale_factor * n)`` rows exist.

  Args:
    scale_factor: Non-negative multiplier applied to each file's row count.

  Raises:
    ValueError: If ``scale_factor`` is negative. A negative factor would
      otherwise produce a negative row count, which ``iloc`` interprets as
      "drop rows from the end" rather than as an error.
  """
  # Imported once per call (not once per file). Kept local because this cell
  # runs before the notebook's own import cell.
  import pandas as pd
  import shutil

  if scale_factor < 0:
    raise ValueError(
        'scale_factor must be non-negative, got {!r}'.format(scale_factor))

  file_bases = ['./input/test', './input/train']
  for file_base in file_bases:
    source_path = file_base + '.csv'
    scaled_path = file_base + '.scaled.csv'

    if scale_factor == 1.0:
      # Nothing to scale: a plain copy avoids a parse/serialise round trip.
      shutil.copyfile(source_path, scaled_path)
      continue

    df_to_scale = pd.read_csv(source_path)
    new_num_rows = int(scale_factor * len(df_to_scale))

    if new_num_rows > len(df_to_scale):
      # Tile the frame enough times to cover the target (ceiling division),
      # then trim below. An empty source never reaches this branch because
      # its target row count is 0.
      repeats = -(-new_num_rows // len(df_to_scale))
      df_to_scale = pd.concat([df_to_scale] * repeats, ignore_index=True)

    df_to_scale = df_to_scale.iloc[:new_num_rows]
    df_to_scale.to_csv(scaled_path, index=False)

# Only (re)generate the *.scaled.csv inputs when the caller asked for it via
# the INPUT_SCALE_FACTOR environment variable; its value is parsed as a float.
if os.environ.get('INPUT_SCALE_FACTOR') is not None:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

# About the competition
This competition is arranged by Open Source Imaging Consortium (OSIC) - a non-profit organization.

Pulmonary Fibrosis is an incurable lung disease. It occurs when lung tissue becomes damaged and scarred. This affects the proper functioning of the lungs and, in turn, breathing.

The goal of the competition is to predict the severity of a patient's decline in lung function from the data provided: a CT scan of the patient's lungs and associated details such as gender, smoking status, and FVC. Lung function is determined from the output of a spirometer, which measures the volume of air inhaled and exhaled. The challenge is to use machine learning techniques to make this prediction.

If the predictions are accurate, patients and their families will be able to anticipate a decline in lung function in advance and seek better treatment or an improved health condition.

# 1. Importing the packages

In [1]:
# import pandas as pd
# The import statements are supplied as Python source through the
# IREWR_IMPORTS environment variable and executed here. Later cells call
# `pd.read_csv`, so the executed code is expected to bind `pd`
# (presumably pandas or a drop-in replacement -- TODO confirm).
# NOTE(review): exec() runs whatever the variable contains, and the lookup
# raises KeyError when the variable is unset; run only in a trusted environment.
exec(os.environ['IREWR_IMPORTS'])
import numpy as np
# ALEX: remove plotting
# import matplotlib.pyplot as plt
# import plotly.express as px
# import plotly as plty
# import seaborn as sns
# import plotly.graph_objs as go
# from plotly.offline import iplot
# from plotly.subplots import make_subplots
# import plotly.io as pio
# import os
# %matplotlib inline


In [2]:
# Directory prefix (note the trailing slash) for the input CSV files.
path = './input/'

In [3]:
# Load the ".scaled.csv" variants of the train and test tables. These are the
# files scale_input_data() writes when INPUT_SCALE_FACTOR is set; otherwise
# they must already exist in the input directory.
df_train = pd.read_csv(f'{path}train.scaled.csv')
df_test = pd.read_csv(f'{path}test.scaled.csv')

# 2. Training Data

2.1 Metadata Information

In [4]:
# Column names, dtypes and non-null counts of the training table.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1549 entries, 0 to 1548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        1549 non-null   object 
 1   Weeks          1549 non-null   int64  
 2   FVC            1549 non-null   int64  
 3   Percent        1549 non-null   float64
 4   Age            1549 non-null   int64  
 5   Sex            1549 non-null   object 
 6   SmokingStatus  1549 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 84.8+ KB


In [5]:
# Summary statistics for every column (numeric and non-numeric), transposed
# so that each row of the result describes one column of df_train.
df_train.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Patient,1549.0,176.0,ID00105637202208831864134,10.0,,,,,,,
Weeks,1549.0,,,,31.861846,23.24755,-5.0,12.0,28.0,47.0,133.0
FVC,1549.0,,,,2690.479019,832.770959,827.0,2109.0,2641.0,3171.0,6399.0
Percent,1549.0,,,,77.672654,19.823261,28.877577,62.8327,75.676937,88.621065,153.145378
Age,1549.0,,,,67.188509,7.057395,49.0,63.0,68.0,72.0,88.0
Sex,1549.0,2.0,Male,1224.0,,,,,,,
SmokingStatus,1549.0,3.0,Ex-smoker,1038.0,,,,,,,


In [6]:
# Peek at the first five rows of the training table.
df_train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


In [7]:
# One row per (Patient, Sex) pair, carrying the array of distinct smoking
# statuses recorded for that patient.
df_tmp = (
    df_train
    .groupby(['Patient', 'Sex'])['SmokingStatus']
    .unique()
    .reset_index()
)

In [8]:
# Display the per-patient table (SmokingStatus still holds arrays at this point).
df_tmp

Unnamed: 0,Patient,Sex,SmokingStatus
0,ID00007637202177411956430,Male,[Ex-smoker]
1,ID00009637202177434476278,Male,[Ex-smoker]
2,ID00010637202177584971671,Male,[Ex-smoker]
3,ID00011637202177653955184,Male,[Ex-smoker]
4,ID00012637202177665765362,Male,[Never smoked]
...,...,...,...
171,ID00419637202311204720264,Male,[Ex-smoker]
172,ID00421637202311550012437,Male,[Ex-smoker]
173,ID00422637202311677017371,Male,[Ex-smoker]
174,ID00423637202312137826377,Male,[Ex-smoker]


In [9]:
# `.str[0]` indexes into each value. For SmokingStatus (an array of unique
# statuses per patient, produced by `.unique()`) it extracts the first status;
# for Sex (a plain string) it keeps only the first character: 'Male' -> 'M',
# 'Female' -> 'F'.
# NOTE: this overwrites df_tmp's columns, so the cell is not idempotent --
# re-running it would index again and reduce SmokingStatus to its first letter.
df_tmp['SmokingStatus'] = df_tmp['SmokingStatus'].str[0]
df_tmp['Sex'] = df_tmp['Sex'].str[0]

In [10]:
# Number of patients (not measurements) per smoking status.
df_tmp['SmokingStatus'].value_counts()

SmokingStatus
Ex-smoker           118
Never smoked         49
Currently smokes      9
Name: count, dtype: int64

In [11]:
# Number of patients per sex, using the single-letter codes derived above.
df_tmp['Sex'].value_counts()

Sex
M    139
F     37
Name: count, dtype: int64

In [12]:
# ALEX: remove plotting
# fig, ax = plt.subplots(1,2, figsize = (20,6), sharex=True)
# sns.countplot(x='SmokingStatus',data=df_tmp,ax=ax[0])
# sns.countplot(x='SmokingStatus',hue='Sex', data=df_tmp,ax=ax[1])
# ax[0].title.set_text('Smoking Status')
# ax[1].title.set_text('Smoking Status Vs Sex')
# plt.show()

# What do we have in training dataset (metadata info excluding CT Scan)
* We have 1549 records with no missing values.
* Data for 176 unique patients is available, including their age, gender, smoking status, FVC and week of measurement.
* Patient ages range from 49 to 88; the average age in the dataset is 67.
* We have 139 male and 37 female patients.
* We have 118 ex-smokers, 49 patients who never smoked, and 9 who currently smoke (active).

# 3. Test Data

In [13]:
# Column names, dtypes and non-null counts of the test table.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        5 non-null      object 
 1   Weeks          5 non-null      int64  
 2   FVC            5 non-null      int64  
 3   Percent        5 non-null      float64
 4   Age            5 non-null      int64  
 5   Sex            5 non-null      object 
 6   SmokingStatus  5 non-null      object 
dtypes: float64(1), int64(3), object(3)
memory usage: 408.0+ bytes


In [14]:
# The test table is small enough to display in full.
df_test

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked


The test set contains data for only 5 patients.

# Exploration of Data will continue