<a href="https://colab.research.google.com/github/BojanMakivic/Predicting-fitness-level-of-cancer-survivors/blob/master/TF%2C_sklearn_cont.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF, sklearn continuous model

In [None]:
# Python ≥3.5 
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20
import re
import sklearn
# Compare the version numerically: a plain string comparison is lexicographic,
# so e.g. "0.9" would wrongly be accepted as newer than "0.20".
_sk_major, _sk_minor = map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
assert (_sk_major, _sk_minor) >= (0, 20)

# Common imports
import numpy as np
import pandas as pd

# Figures plot
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# Import data file (xlsx)

In [None]:
!pip install openpyxl


In [None]:
import openpyxl  # not referenced directly; pandas uses it as the engine for .xlsx files

# read_excel already returns a DataFrame, so no extra pd.DataFrame(...) wrapping is needed
df = pd.read_excel('../input/cycle-ergometry-test-of-cancer-patients/Data.xlsx')

In [None]:
df.info() # BW and BH stand for body weight and body height, respectively

In [None]:
# Encode sex as a number: women ("W") -> 0, everyone else -> 1.
# The guard keeps the cell idempotent: without it, re-running the cell would
# compare the already-encoded numbers with "W" and turn every value into 1.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = np.where(df['Sex'] == "W", 0, 1)

# Histogram of gender distribution
df['Sex'].hist(align='mid', color='green')
# Number of rows per fitness level
df['Level'].value_counts()

## Writing a function for BMI calculation and adding a new column
https://en.wikipedia.org/wiki/Body_mass_index

In [None]:
# Function
def bmi(bw,bh):
    """Return the body mass index for a weight and a height.

    bw -- body weight (kg, per the BMI definition linked above)
    bh -- body height in centimetres (it is divided by 100 to get metres)

    Works element-wise when bw and bh are array-like (e.g. pandas Series).
    """
    height_m = bh/100
    squared_height = height_m*height_m
    return bw/squared_height

In [None]:
# Adding a new column.
# Columns are selected by name rather than by position, so the result stays
# correct even if the column order of the spreadsheet changes.
# NOTE(review): assumes the weight/height columns are named 'BW' and 'BH' — confirm against df.info().
df['BMI'] = bmi(df['BW'], df['BH'])

In [None]:
# Summary statistics for every column.
# To restrict them to women (encoded as 0 in 'Sex'): df.loc[df['Sex'] == 0].describe()
df.describe()

## Histogram distribution of each attribute

In [None]:
# One 10-bin histogram per column, drawn on a 15x15 inch figure
df.hist(bins=10, figsize=(15,15))
plt.show()

In [None]:
np.random.seed(42) # Fix NumPy's global random state so that repeated runs give the same results
# Remove the columns that are not used by the continuous (Wmax) model.
# drop(..., errors='ignore') keeps the cell re-runnable: `del df[col]` raises a
# KeyError the second time because the column is already gone.
df = df.drop(columns=['Norm', 'Type', 'Level'], errors='ignore')
#del df['Gender']
#del df['BW']
#del df['BMI']
#del df['Wmax'] # continuous

In [None]:
# Inspect the frame that is left after removing the unused columns
df

In [None]:
#df=df.loc[df['Type'] == 'Brustkrebs']
#df=df.loc[df['Gender'] == 'W']
#df

# PREPARING TRAINING AND TEST SET

In [None]:
## CONTINUOUS

from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing; random_state makes the split reproducible
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Keep a copy of the target (Wmax) for each subset ...
train_labels, test_lebels = train_set["Wmax"].copy(), test_set["Wmax"].copy()

# ... and remove it from the feature frames
train_set = train_set.drop(columns="Wmax")
test_set = test_set.drop(columns="Wmax")

In [None]:
# Alias for the training features that are fed to the scaling pipeline
train_num = train_set

In [None]:
# Alias for the test features that are fed to the scaling pipeline
test_num = test_set

# PIPELINE

In [None]:
# Pipeline that min-max scales every feature column (default range [0, 1]).
# It contains no categorical transformer: 'Sex' is already encoded as 0/1 earlier in the notebook.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

pipeline = Pipeline([
        ('min_max_scaler', MinMaxScaler()),])  # step name matches the scaler that is actually used
# Fit the scaler on the training features and scale them
train_prepared = pipeline.fit_transform(train_num)

In [None]:
train_prepared # Every column of train_num has been min-max scaled by the pipeline

In [None]:
# Only transform the test set: the scaler must keep the min/max learned from the
# training data. Calling fit_transform here would refit it on the test set, which
# leaks test information and scales train and test features inconsistently.
test_prepared = pipeline.transform(test_num)

In [None]:
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.compose import ColumnTransformer

#num_attribs = list(train_num) #train_num
#cat_attribs = ["Sex"]

#full_pipeline = ColumnTransformer([
        #("num", pipeline, num_attribs),
        #("cat", OneHotEncoder(), cat_attribs),])

#train_prepared = full_pipeline.fit_transform(train_set) # Training set is ready for use

In [None]:
#train_prepared # Numerical and categorical variables are scaled

In [None]:
#test_prepared = full_pipeline.fit_transform(test_set)
#test_prepared

# SELECTING AND TRAINING THE MODEL

## Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model fitted on the scaled training features
lin_reg = LinearRegression(n_jobs=1)
lin_reg_fit = lin_reg.fit(X=train_prepared, y=train_labels)

In [None]:
# Training-set RMSE of the linear model
from sklearn.metrics import mean_squared_error

watt_predictions = lin_reg.predict(train_prepared)
lin_mse = mean_squared_error(y_true=train_labels, y_pred=watt_predictions)
lin_rmse = np.sqrt(lin_mse)
# The original author reported a prediction error of about 32.5 Watt for this model
print("RMSE is: ",lin_rmse)

In [None]:
# Absolute training error of the linear model, per sample
errors = (watt_predictions - train_labels).abs()

# Print out the mean absolute error (MAE)
print('Mean Absolute Error:', round(errors.mean(), 2), 'Watt.')

In [None]:
# Linear-model predictions for the prepared test features
watt_predictions_test = lin_reg.predict(test_prepared)

In [None]:
from sklearn.metrics import r2_score
# r2_score expects (y_true, y_pred). R² is not symmetric in its arguments, so
# passing the predictions first yields a different (wrong) value.
r2_score(test_lebels, watt_predictions_test)

In [None]:
# Score of the linear model on the training data (R² for scikit-learn regressors)
lin_reg.score(train_prepared, train_labels)

In [None]:
# Mean absolute percentage error (MAPE) in percent, computed with a vectorised
# mean instead of Python's built-in sum over the Series.
# `errors` holds the absolute errors computed in the cell above.
mape = 100 * np.mean(errors / train_labels)
# "Accuracy" is reported as 100 % minus the MAPE
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

### Fine-tune linear regression

In [None]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

# Single-candidate grid for LinearRegression.
# 'normalize' is intentionally not listed: the parameter was removed from
# LinearRegression in newer scikit-learn releases and raises an error there;
# leaving it out is equivalent to the former normalize=False.
param_grid = [
    {'copy_X': [True],
     'fit_intercept': [True],
     'n_jobs': [1],
  }]

model = LinearRegression()
# 1 parameter combination x 10 folds = 10 rounds of training (plus the final refit)
grid_search_lr = GridSearchCV(model, param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search_lr.fit(train_prepared, train_labels)

In [None]:
# Parameter combination selected by the grid search
grid_search_lr.best_params_

In [None]:
# Estimator refitted by the grid search with the selected parameters
grid_search_lr.best_estimator_

In [None]:
# best_score_ is the mean cross-validated negative MSE (scoring='neg_mean_squared_error');
# flip the sign and take the square root to obtain the RMSE.
negative_mse = grid_search_lr.best_score_
rmse = np.sqrt(-negative_mse)
rmse

## Support vector regressor (SVR)

In [None]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor.
# 'degree' and 'coef0' are not passed: the rbf kernel ignores both, and the former
# degree=0.1 is not a valid value (degree must be an integer), which newer
# scikit-learn releases reject with a parameter-validation error.
svm_reg = SVR(kernel="rbf", C=100, epsilon=3, gamma='scale', tol=1,
              cache_size=100, max_iter=-1)
svm_reg_fit = svm_reg.fit(train_prepared, train_labels)

In [None]:
# Training-set RMSE of the SVR model
from sklearn.metrics import mean_squared_error

watt_SVR_predictions = svm_reg.predict(train_prepared)
SVR_mse = mean_squared_error(y_true=train_labels, y_pred=watt_SVR_predictions)
SVR_rmse = np.sqrt(SVR_mse)
# The original author reported a prediction error of about 32.6 Watt for this model
print ("RMSE is: ",SVR_rmse)

In [None]:
# Absolute training error of the SVR model, per sample
errors = (watt_SVR_predictions - train_labels).abs()

# Print out the mean absolute error (MAE)
print('Mean Absolute Error:', round(errors.mean(), 2), 'Watt.')

In [None]:
from sklearn.metrics import r2_score
# SVR predictions for the prepared test features
watt_predictions_test = svm_reg.predict(test_prepared)
# r2_score expects (y_true, y_pred); swapping the arguments changes the result
r2_score(test_lebels, watt_predictions_test)

In [None]:
# Score of the SVR model on the training data (R² for scikit-learn regressors)
svm_reg.score(train_prepared, train_labels)

In [None]:
# Mean absolute percentage error (MAPE) in percent, computed with a vectorised
# mean instead of Python's built-in sum over the Series.
# `errors` holds the absolute errors computed in the cell above.
mape = 100 * np.mean(errors / train_labels)
# "Accuracy" is reported as 100 % minus the MAPE
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

### Fine-tune SVR

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

# Single-candidate grid for the RBF-kernel SVR.
# 'degree' and 'coef0' are not listed: the rbf kernel ignores both, and the former
# degree=0.1 is not a valid value (degree must be an integer).
param_grid = [
    {'C': [100],
     'cache_size': [100],
     'epsilon': [3],
     'gamma': ['scale'],
     'kernel': ['rbf'],
     'max_iter': [-1],
     'shrinking': [True],
     'tol': [1]
  }]

# A fresh SVR is handed straight to the search instead of being bound to `svm_reg`,
# so the fitted, tuned `svm_reg` from the previous section is not overwritten by an
# unfitted default model.
# 1 parameter combination x 10 folds = 10 rounds of training (plus the final refit)
grid_search_svr = GridSearchCV(SVR(), param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search_svr.fit(train_prepared, train_labels)

In [None]:
# Parameter combination selected by the grid search
grid_search_svr.best_params_

In [None]:
# best_score_ is the mean cross-validated negative MSE (scoring='neg_mean_squared_error');
# flip the sign and take the square root to obtain the RMSE.
negative_mse = grid_search_svr.best_score_
rmse = np.sqrt(-negative_mse)
rmse

## Random forest model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 2000 bootstrapped trees with out-of-bag scoring enabled;
# random_state fixes the randomness so the fit is reproducible.
forest_reg = RandomForestRegressor(
    n_estimators=2000,
    max_features=2,
    max_depth=8,
    min_samples_leaf=4,
    min_samples_split=2,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
forest_fit = forest_reg.fit(train_prepared, train_labels)

In [None]:
# Training-set RMSE of the random forest model

watt_RF_predictions = forest_reg.predict(train_prepared)
forest_mse = mean_squared_error(y_true=train_labels, y_pred=watt_RF_predictions)
forest_rmse = np.sqrt(forest_mse)
# The original author reported about 13.1 Watt for the random forest
# (the lowest training error of the models fitted so far)
print("RMSE is: ", forest_rmse)

In [None]:
# Absolute training error of the random forest, per sample
errors = (watt_RF_predictions - train_labels).abs()

# Print out the mean absolute error (MAE)
print('Mean Absolute Error:', round(errors.mean(), 2), 'Watt.')

In [None]:
# Score of the random forest on the training data (R² for scikit-learn regressors)
forest_reg.score(train_prepared, train_labels)

In [None]:
# Mean absolute percentage error (MAPE) in percent, computed with a vectorised
# mean instead of Python's built-in sum over the Series.
# `errors` holds the absolute errors computed in the cell above.
mape = 100 * np.mean(errors / train_labels)
# "Accuracy" is reported as 100 % minus the MAPE
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Show the MAPE in percent (mape is already a single number here, so the mean is a no-op)
np.mean(mape)

### Fine-tune RF

In [None]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

# Single-candidate grid for the random forest
param_grid = [
    {'bootstrap': [True],
     'n_estimators': [2000],
     'max_features': [2],
     'max_depth': [8],
     'min_samples_leaf': [4],
     'min_samples_split': [2]
  }]

# A fresh forest is handed straight to the search instead of being bound to
# `forest_reg`, so the fitted model from the previous section is not replaced by an
# unfitted default one.
# 1 parameter combination x 10 folds = 10 rounds of training (plus the final refit)
grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search_rf.fit(train_prepared, train_labels)

In [None]:
# Parameter combination selected by the grid search
grid_search_rf.best_params_

In [None]:
# Estimator refitted by the grid search with the selected parameters
grid_search_rf.best_estimator_

In [None]:
# best_score_ is the mean cross-validated negative MSE (scoring='neg_mean_squared_error');
# flip the sign and take the square root to obtain the RMSE.
negative_mse = grid_search_rf.best_score_
rmse = np.sqrt(-negative_mse)
rmse

# CROSS-VALIDATION OF MODELS (Training set)

The following code splits the training set into 10 distinct subsets called folds, then trains and evaluates the model 10 times,
picking a different fold for evaluation each time and training on the other 9 folds. The result is an array containing the 10 evaluation scores:

In [None]:
# Function to display the score statistics

def display_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    scores -- an object offering .mean() and .std() (e.g. a NumPy array of
              per-fold cross-validation scores).
    """
    print("Scores:", scores)
    # Each statistic is looked up and computed right before it is printed
    for label, method_name in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, method_name)())

In [None]:
from sklearn.model_selection import cross_val_score

## Linear model

In [None]:
# 10-fold cross-validation of the linear model; scores are negative MSE values
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=train_prepared,
    y=train_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
# Convert the negative MSE of each fold into an RMSE
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

## SVR

In [None]:
# 10-fold cross-validation of the tuned SVR.
# The tuned estimator is taken from the grid search explicitly: the name `svm_reg`
# is re-bound to a default, untuned SVR() in the fine-tuning cell, so using it
# here would silently evaluate the wrong model.
SVR_scores = cross_val_score(grid_search_svr.best_estimator_, train_prepared, train_labels,
                                scoring="neg_mean_squared_error", cv=10)
# Convert the negative MSE of each fold into an RMSE
SVR_rmse_scores = np.sqrt(-SVR_scores)
display_scores(SVR_rmse_scores)

## Random forest

In [None]:
# 10-fold cross-validation of the tuned random forest.
# The already-selected estimator is cross-validated instead of the GridSearchCV
# object: the grid holds a single candidate, so nesting the search inside every
# outer fold only multiplies the number of 2000-tree fits without changing the
# evaluated model.
forest_scores = cross_val_score(grid_search_rf.best_estimator_, train_prepared, train_labels,
                                scoring="neg_mean_squared_error", cv=10)
# Convert the negative MSE of each fold into an RMSE
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
# Return the table of cross-validation RMSE statistics for all three models.
# The tuned estimators are cross-validated directly rather than the GridSearchCV
# wrappers: every grid holds a single candidate, so a nested search would only
# repeat identical fits in each outer fold.

scores = cross_val_score(grid_search_lr.best_estimator_, train_prepared, train_labels, scoring="neg_mean_squared_error", cv=10)
scores_rf = cross_val_score(grid_search_rf.best_estimator_, train_prepared, train_labels, scoring="neg_mean_squared_error", cv=10)
scores_svr = cross_val_score(grid_search_svr.best_estimator_, train_prepared, train_labels, scoring="neg_mean_squared_error", cv=10)

# One column per model; describe() summarises the per-fold RMSE values
scores_df = pd.DataFrame({
    "lin_reg": pd.Series(np.sqrt(-scores)).describe(),
    "SVR": pd.Series(np.sqrt(-scores_svr)).describe(),
    "forest_reg": pd.Series(np.sqrt(-scores_rf)).describe()
    })
scores_df

# FEATURE IMPORTANCE

In [None]:
# Feature importances of the tuned random forest, one value per feature column
feature_importances = grid_search_rf.best_estimator_.feature_importances_
feature_importances

In [None]:
# Feature column names (the test and training frames come from the same split, so they share columns)
feature_names = list(test_num.columns)

In [None]:
import seaborn as sns

# Importances labelled by feature name, largest first
feature_imp = pd.Series(feature_importances,index=feature_names).sort_values(ascending=False)
importance_pct = feature_imp*100

# Horizontal bar chart with the percentage written next to every bar
ax = sns.barplot(x=importance_pct, y=feature_imp.index)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Feature Importance")
for row, pct in enumerate(importance_pct):
    ax.text(pct, row, "%.0f %%" % pct) # Rounding the float
plt.show()

# TESTING THE MODEL

In [None]:
# The grid-searched linear regression is evaluated as the final model
final_model = grid_search_lr

X_test, y_test = test_set, test_lebels

# Scale the held-out features with the pipeline, then predict Wmax
X_test_prepared = pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# Test-set RMSE
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
# RMSE of the final model on the test set
final_rmse

In [None]:
#from sklearn.model_selection import cross_val_score
#score_final = cross_val_score(lin_reg,  X_test_prepared, y_test, scoring="neg_mean_squared_error", cv=10)
#final = pd.Series(np.sqrt(-score_final)).describe()
#final

In [None]:
from sklearn import metrics
from sklearn.metrics import r2_score

# Test-set predictions of the final model (despite its name, y_oob is not an out-of-bag estimate)
y_oob = final_model.predict(X_test_prepared)
print("R2",r2_score(y_test, y_oob)) # Amount of the variation in the response variable which is predictable from the input independent variable
# Show which model produced the score; the previous "R2" label printed the model object, not a score
print("Model:",final_model)

In [None]:
from scipy import stats

# 95% confidence interval for the test RMSE:
# a t-interval around the mean squared error, mapped to the RMSE scale via sqrt
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
mse_interval = stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)

In [None]:
# Absolute test error of the final model, per sample
errors = (final_predictions - y_test).abs()

# Print out the mean absolute error (MAE)
print('Mean Absolute Error:', round(errors.mean(), 2), 'Watt.')

In [None]:
# Mean absolute percentage error (MAPE) on the test set, in percent.
# `errors` holds the absolute test errors computed in the cell above.
mape = 100 * np.mean(errors / y_test)
# "Accuracy" is reported as 100 % minus the MAPE
accuracy = 100 - mape
# Print the accuracy itself; the previous version printed the MAPE under the "Accuracy" label
print('Accuracy:', round(accuracy, 2), '%.')

# PREDICTING Wmax

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Collect the person's data. Height and weight accept decimals (e.g. "72.5").
age = int(input("Please enter the AGE: "))
print("Age is: ", age)

bh = float(input("Please enter the BODY HEIGHT: "))
print("Body height is: ", bh)

bw = float(input("Please enter the BODY WEIGHT: "))
print("Body weight is: ", bw)

# BMI is derived from weight and height with the same formula as the bmi() helper,
# so it is always consistent with the other inputs. A separate variable name is
# used so that the bmi() function is not shadowed.
bmi_value = bw / ((bh / 100) * (bh / 100))
print("BMI is: ", round(bmi_value, 1))

g = input("Please enter your gender(W for women or M for man): ").strip().upper()
if g not in ("W", "M"):
    raise ValueError("Gender must be 'W' or 'M', got %r" % g)
print("Gender is: ", g)

# Build one feature row using the same encoding as the training data
# ('Sex': W -> 0, otherwise 1) and put the columns in the training order.
# NOTE(review): assumes the training features are exactly Age, Sex, BW, BH and BMI —
# a KeyError on the next-but-one line means the feature set differs.
person = pd.DataFrame({'Age': [age], 'Sex': [0 if g == "W" else 1],
                       'BW': [bw], 'BH': [bh], 'BMI': [bmi_value]})
person = person[list(train_num.columns)]

# Scale with the fitted `pipeline` (the former `full_pipeline` is never defined in
# this notebook) and take the single prediction out of the returned array.
person_prepared = pipeline.transform(person)
print("Your predicted Wmax is: ", int(final_model.predict(person_prepared)[0]))