In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error,mean_absolute_error
from sklearn.model_selection import LeaveOneOut, KFold , train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

### Performing Analysis for FEV1 Data

In [2]:
# Read the FEV1 feature matrix and label vector from their .npy files.
features_raw = np.load("/VS dir/PlayingSpiroData/Spiro-Data/npy_file/FEV1_FEATURES_60.npy")
labels_raw = np.load("/VS dir/PlayingSpiroData/Spiro-Data/npy_file/FEV1_LABELS_60.npy")

# Row labels of the erroneous recordings. One shared list is applied to both
# frames so that features and labels always lose exactly the same rows.
rows_to_drop = [23,55,4,9,52,44,45,33,43,20,1,50]

# The original row labels are kept (no reset_index), so later cells can still
# refer to samples by their position in the .npy files.
X = pd.DataFrame(features_raw).drop(index=rows_to_drop)
y = pd.DataFrame(labels_raw).drop(index=rows_to_drop)

### Initializing Decision tree and identifying feature importance.

In [3]:
# Split the cleaned rows 50/50. The "eval" half is used only to pick features
# here; the "work" half is what X_work / y_work expose to the rest of the notebook.
X_work,X_eval,y_work,y_eval = train_test_split(X,y,test_size=0.5,shuffle=True,random_state=42)

# Show which original row labels ended up in each half.
for caption, half in (("Eval indexes are : ", X_eval), ("work indexes are : ", X_work)):
    print(caption, half.index.sort_values().tolist())

# Decision tree regressor whose feature importances drive the selection.
DT = DecisionTreeRegressor(
    criterion='squared_error',
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=42,
)
DT.fit(X_eval,y_eval)

# One row per input column; keep the positions of the columns whose
# importance is strictly positive.
DT_imp = pd.DataFrame(DT.feature_importances_)
feature_index = DT_imp.index[DT_imp[0] > 0].tolist()

print("Inportant feature obtained",feature_index)


Eval indexes are :  [5, 6, 8, 11, 12, 15, 16, 18, 19, 21, 24, 29, 30, 31, 32, 35, 38, 39, 46, 48, 49, 54, 57, 58]
work indexes are :  [0, 2, 3, 7, 10, 13, 14, 17, 22, 25, 26, 27, 28, 34, 36, 37, 40, 41, 42, 47, 51, 53, 56, 59]
Inportant feature obtained [0, 1, 3, 6, 8, 23, 77, 81, 97, 111]


### Retraining the Random Forest on the selected features, evaluated with leave-one-out cross-validation over the entire work set.

In [105]:
# Keep only the columns selected by the decision tree. The whole work set goes
# through leave-one-out cross-validation (no inner train/test split here).
X_new = X_work.iloc[:,feature_index]
y_new = y_work

RF_new = RandomForestRegressor(
    n_jobs=-1,
    bootstrap=True,
    criterion='squared_error',
    n_estimators=500,
    max_features='sqrt',
    max_depth=100,
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=42,
)

# Leave-one-out yields one fold per row, so the fold count is the row count.
loo = LeaveOneOut()
tot = len(X_new)

y_GT = []      # ground-truth value of each held-out row
y_PT = []      # prediction for each held-out row
abserror=[]    # not used in this cell

print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=")
print("Random Forest Regressor for FEV1")

for prog, (train_index, test_index) in enumerate(loo.split(X_new), start=1):

    # Single-line progress indicator; '\r' makes each update overwrite the last.
    print("Progress : {0}/{1}".format(prog,tot),end = '\r')

    # Refit the forest on every row except the held-out one.
    RF_new.fit(X_new.iloc[train_index], np.ravel(y_new.iloc[train_index]))

    # The held-out fold contains exactly one row: record its truth and prediction.
    held_out_X = X_new.iloc[test_index]
    held_out_y = y_new.iloc[test_index]
    y_GT.append(held_out_y.iloc[0,0])
    y_PT.append(RF_new.predict(held_out_X)[0])


# Pool the per-fold predictions and score them together.
loocv_report = (
    ("\tMean Absolute Percentage Error : ", 100 * mean_absolute_percentage_error(y_GT,y_PT)),
    ("\tMean Aabsolute Error : ", mean_absolute_error(y_GT,y_PT)),
    ("\tMean Square Error : ", mean_squared_error(y_GT,y_PT)),
)

print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")
print("Random forest metrics with reduced feature set")
for caption, value in loocv_report:
    print(caption, value)
print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")

=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=
Random Forest Regressor for FEV1
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Random forest metrics with reduced feature set
	Mean Absolute Percentage Error :  8.562313597380369
	Mean Aabsolute Error :  0.25345118433418395
	Mean Square Error :  0.11717913390428775
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=


In [13]:
# Evaluate the reduced-feature Random Forest on a single train/holdout split
# of the work set.
X_new = X_work.iloc[:,feature_index]
y_new = y_work

# Fraction of the work set used for fitting; the remainder is the holdout.
# The printed heading below is derived from this value so that the reported
# split can never disagree with the split that was actually performed.
train_fraction = 0.8
X_train,X_test,y_train,y_test = train_test_split(X_new,y_new,train_size=train_fraction,shuffle=True,random_state=42)

RF_new = RandomForestRegressor( n_jobs=-1, bootstrap=True, criterion='squared_error', 
                                  n_estimators=500,  max_features='sqrt', max_depth=100,  
                                  min_samples_leaf=1, min_samples_split=5 ,random_state=42)


# Fit on the training part only, then predict the held-out rows.
RF_new.fit(X_train,np.ravel(y_train))
y_hat = RF_new.predict(X_test)


# Nominal split label, e.g. "80-20" for train_fraction = 0.8.
split_label = "{0:.0f}-{1:.0f}".format(100 * train_fraction, 100 * (1 - train_fraction))

print("Random Forest Regressor for FEV1 on the {0} holdout set".format(split_label))
print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")
print("Random forest metrics with reduced feature set")
print("\tMean Absolute Percentage Error : " , 100 * mean_absolute_percentage_error(y_test,y_hat))
print("\tMean Aabsolute Error : " , mean_absolute_error(y_test,y_hat))
print("\tMean Square Error : " , mean_squared_error(y_test,y_hat))
print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n")

Random Forest Regressor for FEV1 on the 70-30 holdout set
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Random forest metrics with reduced feature set
	Mean Absolute Percentage Error :  4.714133809140775
	Mean Aabsolute Error :  0.12617502857142612
	Mean Square Error :  0.04282827381633935
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=



In [15]:
# Score the currently fitted RF_new on every cleaned row, using only the
# selected feature columns.
# NOTE(review): X also contains the work-set rows, so if RF_new was fitted on
# part of X_work these figures are not an out-of-sample estimate.
X_all_selected = X.iloc[:,feature_index]
y_hat = RF_new.predict(X_all_selected)

full_set_report = (
    ("\tMean Absolute Percentage Error : ", 100 * mean_absolute_percentage_error(y,y_hat)),
    ("\tMean Aabsolute Error : ", mean_absolute_error(y,y_hat)),
    ("\tMean Square Error : ", mean_squared_error(y,y_hat)),
)

print("Random Forest Regressor for FEV1 on the entire y set")
print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")
print("Random forest metrics with reduced feature set")
for caption, value in full_set_report:
    print(caption, value)
print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n")

Random Forest Regressor for FEV1 on the entire y set
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Random forest metrics with reduced feature set
	Mean Absolute Percentage Error :  4.760528542193643
	Mean Aabsolute Error :  0.14218146957671943
	Mean Square Error :  0.03381261279138157
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=



### Retraining the Random Forest on the selected features: leave-one-out cross-validation on a 70% training split of the work set, followed by scoring on the remaining 30% holdout.

In [12]:
def _print_rf_metrics(y_true, y_pred):
    """Print MAPE (in percent), MAE and MSE of y_pred against y_true, framed by rule lines."""
    print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")
    print("Random forest metrics with reduced feature set")
    print("\tMean Absolute Percentage Error : " , 100 * mean_absolute_percentage_error(y_true,y_pred))
    print("\tMean Aabsolute Error : " , mean_absolute_error(y_true,y_pred))
    print("\tMean Square Error : " , mean_squared_error(y_true,y_pred))
    print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n")


# Keep only the selected columns of the work set, then set 30% of it aside
# as a holdout that the cross-validation loop never sees.
X_new = X_work.iloc[:,feature_index]
y_new = y_work

X_train,X_test,y_train,y_test = train_test_split(X_new,y_new,test_size=0.3,shuffle=True,random_state=42)

RF_new = RandomForestRegressor(
    n_jobs=-1,
    bootstrap=True,
    criterion='squared_error',
    n_estimators=500,
    max_features='sqrt',
    max_depth=100,
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=42,
)

# Leave-one-out over the training part: one fold per training row.
loo = LeaveOneOut()
tot = len(X_train)

y_GT = []      # ground-truth value of each held-out row
y_PT = []      # prediction for each held-out row
abserror=[]    # not used in this cell

print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=")
print("Random Forest Regressor for FEV1 using LOOCV")

for prog, (train_index, test_index) in enumerate(loo.split(X_train), start=1):

    # Single-line progress indicator; '\r' makes each update overwrite the last.
    print("Progress : {0}/{1}".format(prog,tot),end = '\r')

    # Refit the forest on every training row except the held-out one.
    RF_new.fit(X_train.iloc[train_index], np.ravel(y_train.iloc[train_index]))

    # The held-out fold contains exactly one row: record its truth and prediction.
    held_out_X = X_train.iloc[test_index]
    held_out_y = y_train.iloc[test_index]
    y_GT.append(held_out_y.iloc[0,0])
    y_PT.append(RF_new.predict(held_out_X)[0])


# Cross-validated metrics, pooled over all folds.
_print_rf_metrics(y_GT, y_PT)


# Final check: refit on the full training part and score the untouched holdout.
print("Random Forest Regressor for FEV1 on the holdout set")
RF_new.fit(X_train, np.ravel(y_train))
y_hat = RF_new.predict(X_test)
_print_rf_metrics(y_test, y_hat)

=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=
Random Forest Regressor for FEV1 using LOOCV
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Random forest metrics with reduced feature set
	Mean Absolute Percentage Error :  12.381338887059174
	Mean Aabsolute Error :  0.36560647091450205
	Mean Square Error :  0.17860589558381984
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

Random Forest Regressor for FEV1 on the holdout set
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Random forest metrics with reduced feature set
	Mean Absolute Percentage Error :  5.107618940365664
	Mean Aabsolute Error :  0.1405352602813848
	Mean Square Error :  0.035503002116922366
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

