In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
train_df = pd.read_csv("C:/Users/u1363606/Downloads/OSIC/train.csv")
test_df = pd.read_csv("C:/Users/u1363606/Downloads/OSIC/test.csv")

In [3]:
print("Training data shape is :",train_df.shape[0])
print("Testing data shape is :",test_df.shape[0])

Training data shape is : 1549
Testing data shape is : 5


# Data processing 

In [4]:
#training set
#Encoding Categorical Features:sex and smokingstatus
df_category = OrdinalEncoder().fit_transform(train_df[['Sex', 'SmokingStatus']]) 
df_category = pd.DataFrame({'Sex': df_category[:, 0], 'SmokingStatus': df_category[:, 1]})

#Numerical features: weeks, percent, and age
df_numerical = StandardScaler().fit_transform(train_df[['Weeks', 'Percent','Age']])  
df_numerical= pd.DataFrame({'Weeks': df_numerical[:, 0], 'Percent': df_numerical[:, 1],'Age':df_numerical[:,2]})


# testing set
#Encoding Categorical Features:sex and smokingstatus
df1_category = OrdinalEncoder().fit_transform(test_df[['Sex', 'SmokingStatus']]) 
df1_category = pd.DataFrame({'Sex': df1_category[:, 0], 'SmokingStatus': df1_category[:, 1]})

#Numerical features: weeks, percent, and age
df1_numerical = StandardScaler().fit_transform(test_df[['Weeks', 'Percent','Age']])  
df1_numerical= pd.DataFrame({'Weeks': df1_numerical[:, 0], 'Percent': df1_numerical[:, 1],'Age':df1_numerical[:,2]})

In [5]:
df = pd.concat([df_category, df_numerical, train_df['FVC']], axis = 1)
df

Unnamed: 0,Sex,SmokingStatus,Weeks,Percent,Age,FVC
0,1.0,1.0,-1.543106,-0.979923,1.674174,2315
1,1.0,1.0,-1.155843,-1.108174,1.674174,2214
2,1.0,1.0,-1.069785,-1.302454,1.674174,2061
3,1.0,1.0,-0.983726,-1.197060,1.674174,2144
4,1.0,1.0,-0.897668,-1.292296,1.674174,2069
...,...,...,...,...,...,...
1544,1.0,2.0,-0.811610,-0.559020,0.823727,2712
1545,1.0,2.0,-0.553435,-0.229413,0.823727,2978
1546,1.0,2.0,-0.037085,-0.316151,0.823727,2908
1547,1.0,2.0,0.479266,-0.233130,0.823727,2975


splitting data into training and testing set

In [6]:
X = df.drop('FVC',axis =1)
y = df['FVC']

In [7]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

# Model

In [8]:
def calcualte_metric_loss(pred_y,true_y,pred_sigma):
        sigma_clipped=np.clip(pred_sigma,a_min=70,a_max=1000)
        delta=np.clip(np.abs(pred_y-true_y),a_min=70,a_max=1000)
        metric=np.divide(-np.sqrt(np.array([2.0]))*delta,sigma_clipped)-np.log(np.sqrt(np.array([2.0]))*sigma_clipped)
        return metric

In [23]:
def calcualte_metric_loss(y_true, y_pred, y_pred_std):
    delta = np.clip(abs(y_true - y_pred), 0, 1000)
    std_clipped = np.clip(y_pred_std, 70, np.inf)
    return np.mean(-(np.sqrt(2)*delta/std_clipped) - np.log(np.sqrt(2)*std_clipped))

Lasso

In [9]:
from sklearn.linear_model import Lasso

In [10]:
la =Lasso()
la.fit(X_train, y_train)
y_pred = la.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("The mean squared error (MSE) on test set: {}".format(mse))

The mean squared error (MSE) on test set: 119385.98019042597


In [11]:
Laso_metric=calcualte_metric_loss(y_pred,y_test,100)
np.mean(Laso_metric)

-8.802408666877405

RandomForestRegressor

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
rf =RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("The mean squared error (MSE) on test set: {}".format(mse))

The mean squared error (MSE) on test set: 44207.8847048387


In [14]:
rf_metric=calcualte_metric_loss(y_pred,y_test,100)
np.mean(rf_metric)

-7.214973607842983

Gradient Boosting Regressor

In [15]:
params = {'n_estimators': 300,'max_depth': 5, 'learning_rate': 0.01}
gb = GradientBoostingRegressor(**params)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("The mean squared error (MSE) on test set: {}".format(mse))

The mean squared error (MSE) on test set: 46711.85622224618


In [16]:
gbr_metric=calcualte_metric_loss(y_pred,y_test,100)
np.mean(gbr_metric)

-7.444640665311055

XGB Regressor

In [17]:
from xgboost import XGBRegressor
params = {'n_estimators': 300, 'max_depth': 5,'learning_rate': 0.01}
xgb = XGBRegressor(**params)
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("The mean squared error (MSE) on test set: {}".format(mse))

The mean squared error (MSE) on test set: 70186.32159737773


In [18]:
xgb_metric=calcualte_metric_loss(y_pred,y_test,100)
np.mean(xgb_metric)

-7.9506582574991125

#final submission

In [19]:
df_test = pd.concat([df1_category, df1_numerical], axis = 1)
df_test

Unnamed: 0,Sex,SmokingStatus,Weeks,Percent,Age
0,0.0,0.0,-0.444053,-1.306936,0.618853
1,0.0,0.0,0.98326,1.360182,-1.9597
2,0.0,0.0,-0.444053,0.151769,0.618853
3,0.0,0.0,1.300441,0.733487,0.103142
4,0.0,1.0,-1.395595,-0.938503,0.618853


In [20]:
y_pred_score = gb.predict(df_test)
y_pred_score

array([1186.34422768, 2982.47918531, 1774.70008256, 1948.60762207,
       1520.43506048])

In [21]:
pred = pd.DataFrame(y_pred_score,columns = ['FVC'])
pred['Confidence'] = pred['FVC'].std()

In [22]:
pred

Unnamed: 0,FVC,Confidence
0,1186.344228,678.540553
1,2982.479185,678.540553
2,1774.700083,678.540553
3,1948.607622,678.540553
4,1520.43506,678.540553
