In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('bodyPerformance.csv')

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.info

In [None]:
#checking the missing values
df.isnull().sum()

In [None]:
print(df.info())

In [None]:
SexColumnDummy = pd.get_dummies(df['gender'])

In [None]:
df = pd.concat((df, SexColumnDummy), axis=1)

In [None]:
df = df.drop(['gender'],axis=1)

In [None]:
# Map the letter grades onto integers (A -> 1 ... D -> 4).
# The explicit cast pins the target to an integer dtype rather than relying on
# replace() implicitly downcasting the object column, a behaviour pandas has
# deprecated.  The cell stays safe to re-run: already-encoded integers pass
# through replace() untouched.
df['class'] = df['class'].replace({"A": 1,
                                   "B": 2,
                                   "C": 3,
                                   "D": 4}).astype('int64')

In [None]:
df.head()

In [None]:
df.corr()['class']

In [None]:
#correlation plot using my new dataframe
plt.figure(figsize=[7,7])
sns.heatmap(df.corr(), annot= True, fmt= '.0%')

In [None]:
df.plot(kind="box",subplots=True,layout=(7,2),figsize=(15,20));

In [None]:
sns.displot(df['sit and bend forward_cm'])

In [None]:
q1 = df['sit and bend forward_cm'].quantile(0.25)
q3 = df['sit and bend forward_cm'].quantile(0.75)
iqr = q3-q1

In [None]:
q1,q3,iqr

In [None]:
upper_limit = q3 + (1.5 * iqr)
lower_limit = q1 - (1.5 * iqr)
lower_limit, upper_limit

In [None]:
#find the outliers
df.loc[(df['sit and bend forward_cm'] > upper_limit) | (df['sit and bend forward_cm'] < lower_limit)]

In [None]:
# trimming - delete the outlier data
# between() is inclusive at both ends, so a value sitting exactly on a fence is
# kept.  That matches the strict > / < comparisons used to list the outliers,
# which keeps the count printed below equal to the number of rows in that listing.
new_df = df.loc[df['sit and bend forward_cm'].between(lower_limit, upper_limit)]
print('before removing outliers:', len(df))
print('After removing outliers:', len(new_df))
print('outliers:', len(df)-len(new_df))

In [None]:
#capping - change the outlier to upper lower limit values
new_df = df.copy()
new_df.loc[(new_df['sit and bend forward_cm']>upper_limit),'sit and bend forward_cm'] = upper_limit
new_df.loc[(new_df['sit and bend forward_cm']<lower_limit),'sit and bend forward_cm'] = lower_limit

In [None]:
sns.boxplot(new_df['sit and bend forward_cm'])

Model Building

In [None]:
X =new_df.drop('class',axis=1)
Y=new_df['class']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3,random_state=0)

In [None]:
X_train.shape,X_test.shape

In [None]:
def model_acc(model):
    """Fit ``model`` on the training split and print its test-split score.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test``.  The estimator is fitted in place and nothing is returned;
    the printed line has the form ``<model repr>--><score>``.
    """
    model.fit(X_train, Y_train)
    test_score = model.score(X_test, Y_test)
    print(f"{model!s}-->{test_score!s}")

In [None]:
# Baseline comparison: each classifier is fitted on the training split and its
# test-split score is printed by model_acc.
from sklearn.ensemble import RandomForestClassifier
# Seeded with the same seed as the train/test split so the score is repeatable
# from run to run.
rf = RandomForestClassifier(random_state=0)
model_acc(rf)

from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()
model_acc(nb_clf)

from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
model_acc(knn_clf)

from sklearn import ensemble
# Gradient boosting is randomised as well, so it gets the same fixed seed.
gb_clf = ensemble.GradientBoostingClassifier(random_state=0)
model_acc(gb_clf)

from sklearn.linear_model import LogisticRegression
# The features are not scaled anywhere in this notebook, so give the solver
# more room than its default of 100 iterations to converge.
lr_clf = LogisticRegression(max_iter=1000)
model_acc(lr_clf)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over forest size and split criterion, using GridSearchCV's
# default cross-validation and scoring.
parameters = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
}
grid_obj = GridSearchCV(rf, parameters)
grid_fit = grid_obj.fit(X_train, Y_train)
# Estimator with the best cross-validated parameter combination.
best_model = grid_fit.best_estimator_
best_model

In [None]:
# Report the held-out test score next to the training score: best_model was
# fitted on the training split, so the training score alone overstates how well
# it generalises.
{'train': best_model.score(X_train, Y_train), 'test': best_model.score(X_test, Y_test)}

In [None]:
import pickle
# Serialise the fitted grid-search object (not only best_model) to disk.
with open('bodyperformance.pkl', mode='wb') as file:
    pickle.dump(grid_fit, file)

In [None]:
X_train.columns

In [None]:
best_model.predict([[28,173.8,67.70,17.1,70.0,127,43.5,27.1,45,217,0,1]])