In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [None]:
# Mount Google Drive into the runtime's file system at /content/drive.
# NOTE(review): google.colab is presumably only importable on a Colab
# runtime, so this cell ties the notebook to that environment.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import dataset
# NOTE(review): the recorded output of this cell is a FileNotFoundError.
# The path is relative to the kernel's working directory, while the previous
# cell mounts Drive under /content/drive -- point this at the real location
# of bodyfat.csv (TODO confirm where the file lives).
df = pd.read_csv("bodyfat.csv")
# Preview the first rows
df.head()


FileNotFoundError: ignored

In [None]:
# Summary of the frame: columns, dtypes and non-null counts
df.info()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# define a distribution function
def plot_displots(col):
    """Overlay the KDE of the "BodyFat" column with the KDE of ``col``.

    Reads the notebook-level frame ``df``. Both curves are drawn on the
    same 12x5 figure: "BodyFat" in magenta, ``col`` in red.

    Parameters
    ----------
    col : str
        Name of the column of ``df`` to compare against "BodyFat".
    """
    plt.figure(figsize=(12, 5))

    # (column, line colour, legend label) for each curve, in drawing order
    curves = (
        ("BodyFat", "magenta", "Bodyfat"),
        (col, "red", col),
    )
    for column, colour, legend_label in curves:
        sns.kdeplot(df[column], color=colour, label=legend_label)

    plt.legend()
    plt.show()

# Draw the comparison plot once for every column of the frame
cols = list(df.columns)
for column_name in cols:
    print(f"Distribution plots for {column_name} feature is shown below")
    plot_displots(column_name)
    print("." * 100)

In [None]:
# function that plots the distribution
def draw_plots(df, col):
    """Show a histogram, a normal probability plot and a box plot of a column.

    The three panels are drawn side by side on one 12x5 figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the data.
    col : str
        Name of the column of ``df`` to plot.
    """
    fig, (ax_hist, ax_qq, ax_box) = plt.subplots(1, 3, figsize=(12, 5))

    ax_hist.hist(df[col], color="magenta")

    # probplot draws onto whatever object it receives through ``plot``
    stats.probplot(df[col], dist="norm", plot=ax_qq)

    # Pass the values by keyword: the meaning of the first positional
    # argument of sns.boxplot is not the same in every seaborn release.
    sns.boxplot(x=df[col], color="magenta", ax=ax_box)

    plt.show()

# Produce the three-panel distribution figure for every column
cols = list(df.columns)
for column_name in cols:
    print(f"Distribution plots for the feature {column_name} are shown below ")
    draw_plots(df, column_name)
    print("=" * 100)

In [None]:
#Checking for outliers
# Per-column bounds at mean +/- 4 standard deviations; both lists follow
# the order of df.columns.
upper_limit = [df[name].mean() + df[name].std() * 4 for name in df.columns]
lower_limit = [df[name].mean() - df[name].std() * 4 for name in df.columns]

In [None]:
# Collect every row that has at least one value outside its column's limits.
# upper_limit / lower_limit are lists aligned with df.columns.
cols = list(df.columns)
outlier_mask = pd.Series(False, index=df.index)
for col, lower, upper in zip(cols, lower_limit, upper_limit):
    # A value cannot be above the upper AND below the lower limit at the
    # same time, so the two conditions are combined with OR. The masks are
    # accumulated so earlier columns are not lost when later ones are checked.
    outlier_mask |= (df[col] > upper) | (df[col] < lower)

temp = df.loc[outlier_mask]

In [None]:
# Display the rows selected by the outlier filter in the previous cell
temp

In [None]:
# Work on a copy so the frame loaded from disk stays untouched
data = df.copy()

# Predictive variables: every column except the target
X = data.drop(columns=["BodyFat"])

# Target label: the body-fat column
y = data["BodyFat"]

In [None]:
# import ExtraTrees Regressor
from sklearn.ensemble import ExtraTreesRegressor

# Instantiate the regressor with a fixed seed: the trees are built from
# random splits, so without it the importance ranking changes between runs.
er = ExtraTreesRegressor(random_state=42)

# Fit on the features and the target label
er.fit(X, y)

In [None]:
# get the feature importances into a pandas series
# (one value per feature, labelled with the column names of X)
series = pd.Series(er.feature_importances_, index=X.columns)
series


In [None]:
# plot a graph of the top 5 feature importance scores
# (horizontal bars; the trailing semicolon hides the returned object's repr)
series.nlargest(5).plot(kind="barh", color="green");

In [None]:
# import mutual info regression
from sklearn.feature_selection import mutual_info_regression

# mutual_info_regression is a function, not an estimator class: it returns
# one mutual-information score per column of X. The seed is fixed so the
# scores, and the ranking derived from them, are the same on every run.
mr = mutual_info_regression(X, y, random_state=42)

In [None]:
# Label the mutual-information scores with the feature names and plot the
# five largest as horizontal bars
plot_data = pd.Series(mr, index=X.columns)
plot_data.nlargest(5).plot(kind="barh", color="green")

In [None]:
# Display the working copy of the data set
data

In [None]:
# Annotated heat map of the pairwise correlations between all columns
corr_matrix = data.corr()
heat_fig, heat_ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr_matrix, annot=True, cmap="plasma", ax=heat_ax);

In [None]:
# define a correlation threshold function
def correlation(df, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``df`` is computed with
    ``DataFrame.corr()``. For every pair of columns whose absolute
    correlation is strictly greater than ``threshold``, the column that
    comes later in the matrix is added to the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Names of the flagged columns (empty if no pair exceeds the threshold).
    """
    col_cor = set()
    cor_mat = df.corr()

    for i in range(len(cor_mat)):
        # Only cells below the diagonal (j < i) are visited, so each pair is
        # checked once and a column is never compared with itself.
        for j in range(i):
            # One positional lookup for row i, column j. Chaining
            # ``.iloc[i][j]`` would index the row Series with an integer
            # key even though its index holds column names.
            if abs(cor_mat.iloc[i, j]) > threshold:
                col_cor.add(cor_mat.columns[i])

    return col_cor

# Features whose absolute correlation with an earlier feature exceeds 0.85
ans = correlation(X, threshold=0.85)

# Show the flagged column names
ans

In [None]:
# Correlation of every feature with five hand-picked columns.
# NOTE(review): the list is hard-coded; presumably it mirrors the set
# returned by correlation() in the previous cell -- confirm.
X.corr()[['Abdomen', 'Chest', 'Hip', 'Knee', 'Thigh']]

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor of every predictor.
# The design matrix is built from X, so the target "BodyFat" is not treated
# as an explanatory variable, and an intercept column is added so that each
# auxiliary regression is fitted with a constant.
vif_exog = sm.add_constant(X)
info = pd.DataFrame()

# Look each predictor up by name so the intercept column is skipped
info["VIF"] = [
    variance_inflation_factor(vif_exog.values, vif_exog.columns.get_loc(name))
    for name in X.columns
]
info["Columns"] = X.columns
info

In [None]:
# Same table, ordered from the largest VIF to the smallest
info.sort_values(by="VIF", ascending=False)

In [None]:
# Five highest-ranked features by ExtraTrees importance (col_1) and by
# mutual information (col_2)
col_1 = list(series.nlargest(5).index)
col_2 = list(plot_data.nlargest(5).index)

# Show both selections side by side
col_1, col_2

In [None]:
# Keep only the features ranked highest by the ExtraTrees importances.
# Take an explicit copy: a later cell adds columns to to_train, which must
# not happen on a slice of X.
to_train = X[col_1].copy()
to_train.head()

In [None]:
# Import modules to build model
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import metrics
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Split the data set: 80 % for training, 20 % held out for testing.
# The seed is fixed so every model below is scored on the same rows each
# time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    to_train, y, test_size=0.2, random_state=42
)

In [None]:
# (rows, columns) of the training and the test feature sets
X_train.shape, X_test.shape

In [None]:
# Build a decision tree model
# Default hyper-parameters; fitted on the training split only.
reg = DecisionTreeRegressor()
reg.fit(X_train, y_train)

In [None]:
# Plot tree
# Draws the fitted tree `reg` with filled nodes on a 10x7 figure; the
# trailing semicolon hides the list of annotations returned by plot_tree.
plt.figure(figsize=(10, 7))
tree.plot_tree(reg, filled=True);

In [None]:
# prune the decision tree
# Compute the cost-complexity pruning path on the training data and keep
# the candidate alpha values it reports.
path = reg.cost_complexity_pruning_path(X_train, y_train)
ccp_alpha = path.ccp_alphas

In [None]:
# Fit one decision tree per candidate ccp_alpha value.
# alpha_list collects the fitted models (not the alphas), in the same order
# as ccp_alpha; `reg` ends up bound to the last tree fitted.
alpha_list = []
for alpha in ccp_alpha:
    reg = DecisionTreeRegressor(ccp_alpha=alpha)
    reg.fit(X_train, y_train)
    alpha_list.append(reg)

In [None]:
# Score every pruned tree on the training split and on the test split
train_score = [model.score(X_train, y_train) for model in alpha_list]
test_score = [model.score(X_test, y_test) for model in alpha_list]

In [None]:
# Plot the train and test scores against the pruning strength.
# The values come from the regressors' .score() method, i.e. R^2, so the
# axis is labelled accordingly rather than as "accuracy".
plt.xlabel("alpha")
plt.ylabel("R^2 score")
plt.plot(ccp_alpha, train_score, marker="o", label="training",
         color="blue", drawstyle="steps-post")
plt.plot(ccp_alpha, test_score, marker="o", label="testing",
         color="red", drawstyle="steps-post")
plt.legend();

In [None]:
# Baseline decision tree pruned with ccp_alpha=1, scored with R^2 on the
# held-out split
clf = DecisionTreeRegressor(ccp_alpha=1).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"Decision Tree Base Model: {metrics.r2_score(y_test, y_pred)}")

In [None]:
# Random Forest
# Baseline forest of 1000 pruned trees. The seed is fixed so the reported
# score does not change between runs.
rf_base = RandomForestRegressor(n_estimators=1000, ccp_alpha=1, random_state=42)
rf_base.fit(np.array(X_train), y_train)
y_pred_rf = rf_base.predict(np.array(X_test))
print(f"Random Forest Base Model: {metrics.r2_score(y_test, y_pred_rf)} ")

In [None]:
# Baseline gradient-boosting regressor, trained and evaluated on plain
# NumPy arrays
gb_base = GradientBoostingRegressor(learning_rate=0.1, n_estimators=1000).fit(
    np.asarray(X_train), y_train
)
y_pred_gb = gb_base.predict(np.asarray(X_test))
gb_r2 = metrics.r2_score(y_test, y_pred_gb)
print(f"Gradient Boosting Base Model: {gb_r2}")

In [None]:
# Ada Boosting
from sklearn.ensemble import AdaBoostRegressor

# Baseline AdaBoost regressor. The seed is fixed so the reported score
# does not change between runs.
ab_base = AdaBoostRegressor(n_estimators=1000, learning_rate=0.1, random_state=42)
ab_base.fit(np.array(X_train), y_train)
y_pred_ab = ab_base.predict(np.array(X_test))
ab_r2 = metrics.r2_score(y_test, y_pred_ab)
print(f"AdaBoost Base Model: {ab_r2}")

In [None]:
# Baseline support vector regressor with an RBF kernel
svr_base = SVR(kernel='rbf', C=1.0, epsilon=0.2).fit(np.asarray(X_train), y_train)
y_pred_svr = svr_base.predict(np.asarray(X_test))
svr_r2 = metrics.r2_score(y_test, y_pred_svr)
print(f"Support Vector Regressor Base Model: {svr_r2}")

In [None]:
# Artificial Neural Network
from sklearn.neural_network import MLPRegressor

# Baseline multi-layer perceptron with two hidden layers (100 and 50 units)
ann_base = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred_ann = ann_base.predict(X_test)
ann_r2 = metrics.r2_score(y_test, y_pred_ann)
print(f"Artificial Neural Network Base Model: {ann_r2}")

In [None]:
# Search spaces for hyperparameter tuning.
# Each entry maps a model name to an unfitted estimator ("model") and the
# candidate values for its hyperparameters ("params").
params = {
    "RandomForest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [int(x) for x in np.linspace(start=1, stop=1200, num=10)],
            "criterion": ["squared_error", "absolute_error"],
            "max_depth": [int(x) for x in np.linspace(start=1, stop=30, num=5)],
            "min_samples_split": [2, 5, 10, 12],
            "min_samples_leaf": [2, 5, 10, 12],
            # 1.0 means "use every feature"; this replaces the string "auto",
            # which current scikit-learn releases reject.
            "max_features": [1.0, "sqrt"],
            "ccp_alpha": [1, 2, 2.5, 3, 3.5, 4, 5],
        },
    },
    "DecisionTree": {
        "model": DecisionTreeRegressor(),
        "params": {
            "criterion": ["squared_error", "absolute_error"],
            "splitter": ["best", "random"],
            # An integer min_samples_split must be at least 2, so the
            # invalid candidate 1 is left out.
            "min_samples_split": [2, 5, 10, 12],
            # See the RandomForest entry: 1.0 replaces "auto".
            "max_features": [1.0, "sqrt"],
            "ccp_alpha": [1, 2, 2.5, 3, 3.5, 4, 5],
        },
    },
    "SVM": {
        "model": SVR(),
        "params": {
            "C": [0.25, 0.5, 0.75, 1.0],
            "tol": [1e-10, 1e-5, 1e-4, 0.025, 0.50, 0.75],
            "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
            "max_iter": [int(x) for x in np.linspace(start=1, stop=250, num=10)],
        },
    },
    "AdaBoost": {
        "model": AdaBoostRegressor(),
        "params": {
            "n_estimators": [50, 100, 200, 500],
            "learning_rate": [0.01, 0.1, 1.0],
        },
    },
    "GradientBoosting": {
        "model": GradientBoostingRegressor(),
        "params": {
            "n_estimators": [50, 100, 200, 500],
            "learning_rate": [0.01, 0.1, 1.0],
            "max_depth": [3, 4, 5, 6],
        },
    },
    "NeuralNetwork": {
        "model": MLPRegressor(),
        "params": {
            "hidden_layer_sizes": [(50, 25), (100, 50), (200, 100)],
            "activation": ['relu', 'tanh'],
            "alpha": [0.0001, 0.001, 0.01],
            "max_iter": [500, 1000],
        },
    },
}

In [None]:
# Execute RandomizedSearchCV and get best parameters.
# For every entry of `params`, 10 random candidates are evaluated with
# 5-fold cross-validation on the training split and ranked by negative
# mean squared error (higher is better).
scores = []
for model_name, model_params in params.items():
    clf = RandomizedSearchCV(model_params["model"],
                             param_distributions=model_params["params"],
                             cv=5, n_jobs=-1, n_iter=10,
                             scoring="neg_mean_squared_error",
                             # fixed seed: the same candidates are sampled
                             # on every run
                             random_state=42)

    clf.fit(X_train, y_train)
    scores.append({
        "model_name": model_name,
        "best_score": clf.best_score_,
        "best_estimator": clf.best_estimator_
    })

In [None]:
# view scores
# One dict per model: its name, best cross-validated score and best estimator
scores


In [None]:
# Tabulate the search results, one row per model
score_columns = ["model_name", "best_score", "best_estimator"]
scored_df = pd.DataFrame(scores, columns=score_columns)

scored_df

In [None]:
# Best estimator of the fifth search result.
# NOTE(review): index 4 depends on the insertion order of `params`, where
# "GradientBoosting" is the fifth key -- this breaks silently if entries
# are added or reordered.
scores[4]["best_estimator"]


In [None]:
# train the model with best parameters
# Select the tuned gradient-boosting estimator by name rather than by its
# position in `scores`, so the lookup survives reordering of the search space.
gb = next(
    entry["best_estimator"]
    for entry in scores
    if entry["model_name"] == "GradientBoosting"
)
gb.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = gb.predict(X_test)

print(f"Gradient Boosting  hyperparameter Model: {metrics.r2_score(y_test, y_pred)} ")

In [None]:
# Predict body fat for every row of the data set with the baseline
# gradient-boosting model (gb_base) and put the predictions next to the
# actual values. No training happens in this cell.
# NOTE: the rows include those gb_base was trained on, so this comparison
# is more favourable than the held-out score.

# Select the model's input columns by name so the cell can be re-run after
# the two result columns have been added.
feature_frame = to_train[col_1]
to_train_list = np.array(feature_frame)

# One vectorised call instead of one predict() call per row
predicted = gb_base.predict(to_train_list)

# assign() returns a new frame, so no column is written into a slice
to_train = feature_frame.assign(**{"Actual": y, "Predicted Result": predicted})
to_train

In [None]:
# Overlay the density of the actual values with that of the predictions
kde_curves = (
    ("Actual", "Actual Result", "blue"),
    ("Predicted Result", "Predicted Result", "red"),
)
for column, legend_label, colour in kde_curves:
    sns.kdeplot(to_train[column], label=legend_label, color=colour)
plt.legend();

In [None]:
# save the model
import pickle

# Serialise the baseline gradient-boosting model. The context manager
# closes the file even if pickling raises.
with open("body_fat_model.pkl", "wb") as file:
    pickle.dump(gb_base, file)