# LabelEncode Target Class

#### Various Classifiers
- Random Forest Classifier
- Extra Trees Classifier
- Balanced Random Forest Classifier
- Gradient Boosted Tree Classifier



In [9]:
#Import Dependencies

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [10]:
#SOURCE: https://www.tutorialspoint.com/python_data_access/python_postgresql_database_connection.htm
import psycopg2

# Establish the connection. Settings are read from environment variables so
# that no credential is stored in the notebook. The non-secret settings fall
# back to the values this notebook used originally; the password has no
# fallback, so a missing PGPASSWORD raises KeyError instead of silently
# attempting a login with a placeholder.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "Body_Performance"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ["PGPASSWORD"],
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)

# Run a trivial PostgreSQL query to confirm the connection works. The cursor is
# only needed for this check, so it is closed when the `with` block exits.
with conn.cursor() as cursor:
    cursor.execute("select version()")
    # Fetch the single result row
    data = cursor.fetchone()

print("Connection established to: ",data)



Connection established to:  ('PostgreSQL 12.13, compiled by Visual C++ build 1914, 64-bit',)


In [11]:
# Load every row of the body_performance table into a DataFrame and preview it
body_query = 'select * from body_performance'
body_df = pd.read_sql(sql=body_query, con=conn)
body_df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body_fat_pct,diastolic,systolic,gripforce,sit_and_bend_forward_cm,sit_ups_counts,broad_jump_cm,ranking
0,27,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [12]:
# Close the database connection; the query result is already held in body_df
conn.close()


In [14]:
# Encode the categorical gender column as integers
# (in the output below, F maps to 0 and M maps to 1)
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
gender_codes = labelencoder.fit_transform(body_df['gender'])
body_df['gender_num'] = gender_codes

body_df.head(10)


Unnamed: 0,age,gender,height_cm,weight_kg,body_fat_pct,diastolic,systolic,gripforce,sit_and_bend_forward_cm,sit_ups_counts,broad_jump_cm,ranking,gender_num
0,27,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C,1
1,25,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A,1
2,31,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C,1
3,32,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B,1
4,28,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B,1
5,36,F,165.4,55.4,22.0,64.0,119.0,23.8,21.0,27.0,153.0,B,0
6,42,F,164.5,63.7,32.2,72.0,135.0,22.7,0.8,18.0,146.0,D,0
7,33,M,174.9,77.2,36.9,84.0,137.0,45.9,12.3,42.0,234.0,B,1
8,54,M,166.8,67.5,27.6,85.0,165.0,40.4,18.6,34.0,148.0,C,1
9,28,M,185.0,84.6,14.4,81.0,156.0,57.9,12.1,55.0,213.0,B,1


In [15]:
# Encode the 4-class target ("ranking") as integers, kept in a single column
# (see bodyperf_ml_v1.3 for the OneHotEncode variant).
# NOTE(review): this refits the same `labelencoder` that was fitted on gender,
# so after this cell the encoder holds the ranking classes, not the gender ones.
ranking_codes = labelencoder.fit_transform(body_df['ranking'])
body_df['ranking_num'] = ranking_codes

body_df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body_fat_pct,diastolic,systolic,gripforce,sit_and_bend_forward_cm,sit_ups_counts,broad_jump_cm,ranking,gender_num,ranking_num
0,27,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C,1,2
1,25,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A,1,0
2,31,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C,1,2
3,32,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B,1,1
4,28,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B,1,1


In [16]:
# Number of rows per age value, most frequent first
age_count = body_df.value_counts(subset="age")
age_count

age
21    964
22    789
23    668
25    644
26    629
24    617
27    546
28    527
29    407
30    374
60    368
31    338
37    282
32    275
62    265
61    254
39    243
36    239
38    230
63    230
34    229
33    229
35    223
41    217
64    215
40    211
56    197
48    196
43    192
59    192
51    191
50    189
55    185
45    183
57    181
46    181
58    180
42    179
47    172
44    172
49    163
54    152
52    140
53    135
dtype: int64

In [17]:
# Remove the original text columns now that their encoded versions exist
body_df = body_df.drop(columns=["gender", "ranking"])

body_df.head()

Unnamed: 0,age,height_cm,weight_kg,body_fat_pct,diastolic,systolic,gripforce,sit_and_bend_forward_cm,sit_ups_counts,broad_jump_cm,gender_num,ranking_num
0,27,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,1,2
1,25,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,1,0
2,31,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,1,2
3,32,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,1,1
4,28,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,1,1


In [18]:
# Separate the encoded target (y) from the feature columns (X)
target_column = "ranking_num"
X = body_df.drop(columns=target_column)
y = body_df[target_column]

print(y.shape)
X.head()

(13393,)


Unnamed: 0,age,height_cm,weight_kg,body_fat_pct,diastolic,systolic,gripforce,sit_and_bend_forward_cm,sit_ups_counts,broad_jump_cm,gender_num
0,27,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,1
1,25,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,1
2,31,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,1
3,32,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,1
4,28,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,1


In [19]:
# Preview the first ten encoded target values
y.head(10)

0    2
1    0
2    2
3    1
4    1
5    1
6    3
7    1
8    2
9    1
Name: ranking_num, dtype: int32

In [20]:
# Split into train/test sets (75% train), then standardize the features with
# StandardScaler. Open question: should MinMaxScaler (0-1) be used instead?
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=1
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for scaled_split in (X_train_scaled, X_test_scaled):
    print(scaled_split.shape)

(10044, 11)
(3349, 11)


### Random Forest Classifier

In [21]:
# Fit a 500-tree random forest and report its score on the train and test splits
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=500, random_state=1)
rf_model.fit(X_train_scaled, y_train)

rf_train_score = rf_model.score(X_train_scaled, y_train)
rf_test_score = rf_model.score(X_test_scaled, y_test)
print(f'Training Score: {rf_train_score}')
print(f'Testing Score: {rf_test_score}')

Training Score: 1.0
Testing Score: 0.7438041206330248


In [22]:
# Rank the features by the random forest's importance score, highest first.
# The importances are read once into `feature_importances` and that variable
# is what gets paired with the column names (it was previously assigned but unused).
feature_importances = rf_model.feature_importances_
sorted(zip(feature_importances, X.columns), reverse=True)

[(0.25568721293280927, 'sit_and_bend_forward_cm'),
 (0.1332609615864456, 'sit_ups_counts'),
 (0.09736701275413889, 'body_fat_pct'),
 (0.08913121752984346, 'age'),
 (0.08810826156992316, 'weight_kg'),
 (0.08163774145935258, 'gripforce'),
 (0.07531938338188414, 'broad_jump_cm'),
 (0.06514319085002566, 'height_cm'),
 (0.05205952211176057, 'systolic'),
 (0.049233357446716496, 'diastolic'),
 (0.013052138377100229, 'gender_num')]

In [23]:
# Accuracy of the random forest on the test split (the metric helpers imported
# here are also used by the cells that follow)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

predictions = rf_model.predict(X_test_scaled)
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7438041206330248


In [24]:
# Confusion matrix for the random forest, labelled by actual (rows) and
# predicted (columns) class number
actual_labels = ["Actual 0", "Actual 1", "Actual 2", "Actual 3"]
predicted_labels = ["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3"]

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,708,101,7,2
Actual 1,179,552,90,24
Actual 2,74,165,540,56
Actual 3,10,55,95,691


In [25]:
# Per-class precision / recall / f1 for the random forest predictions
rf_report = classification_report(y_test, predictions)

print("Classification Report")
print(rf_report)

Classification Report
              precision    recall  f1-score   support

           0       0.73      0.87      0.79       818
           1       0.63      0.65      0.64       845
           2       0.74      0.65      0.69       835
           3       0.89      0.81      0.85       851

    accuracy                           0.74      3349
   macro avg       0.75      0.74      0.74      3349
weighted avg       0.75      0.74      0.74      3349



### Extra Trees Classifier

In [27]:
# Fit a 500-tree Extra Trees classifier and report its score on both splits
from sklearn.ensemble import ExtraTreesClassifier

et_clf = ExtraTreesClassifier(n_estimators=500, random_state=1)
et_clf.fit(X_train_scaled, y_train)

et_train_score = et_clf.score(X_train_scaled, y_train)
et_test_score = et_clf.score(X_test_scaled, y_test)
print(f'Training Score: {et_train_score}')
print(f'Testing Score: {et_test_score}')

Training Score: 1.0
Testing Score: 0.7285756942370857


In [28]:
# Test-split accuracy and per-class report for the Extra Trees classifier
predictions2 = et_clf.predict(X_test_scaled)
acc_score2 = accuracy_score(y_true=y_test, y_pred=predictions2)
et_report = classification_report(y_test, predictions2)

print(f"Accuracy Score : {acc_score2}")
print("Classification Report")
print(et_report)

Accuracy Score : 0.7285756942370857
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.85      0.78       818
           1       0.61      0.60      0.60       845
           2       0.71      0.65      0.68       835
           3       0.89      0.82      0.85       851

    accuracy                           0.73      3349
   macro avg       0.73      0.73      0.73      3349
weighted avg       0.73      0.73      0.73      3349



In [38]:
# Confusion matrix for the Extra Trees predictions, labelled by actual (rows)
# and predicted (columns) class number
actual_labels = ["Actual 0", "Actual 1", "Actual 2", "Actual 3"]
predicted_labels = ["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3"]

et_cm = confusion_matrix(y_test, predictions2)
et_cmdf = pd.DataFrame(et_cm, index=actual_labels, columns=predicted_labels)
display(et_cmdf)

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,697,111,7,3
Actual 1,196,503,122,24
Actual 2,77,156,542,60
Actual 3,9,51,93,698


### Balanced Random Forest Classifier

In [40]:
# Try the resampling-based balanced random forest from imblearn (500 trees):
# fit, predict on the test split, then report the score on both splits
from imblearn.ensemble import BalancedRandomForestClassifier

brfc_model = BalancedRandomForestClassifier(n_estimators=500, random_state=1)
brfc_model.fit(X_train_scaled, y_train)
y_pred3 = brfc_model.predict(X_test_scaled)

brfc_train_score = brfc_model.score(X_train_scaled, y_train)
brfc_test_score = brfc_model.score(X_test_scaled, y_test)
print(f'Training Score: {brfc_train_score}')
print(f'Testing Score: {brfc_test_score}')

Training Score: 1.0
Testing Score: 0.7476858763810093


In [41]:
# Test-split accuracy and per-class report for the balanced random forest
acc_score3 = accuracy_score(y_true=y_test, y_pred=y_pred3)
brfc_report = classification_report(y_test, y_pred3)

print(f"Accuracy Score : {acc_score3}")
print("Classification Report")
print(brfc_report)

Accuracy Score : 0.7476858763810093
Classification Report
              precision    recall  f1-score   support

           0       0.73      0.87      0.79       818
           1       0.64      0.65      0.64       845
           2       0.74      0.66      0.70       835
           3       0.90      0.82      0.85       851

    accuracy                           0.75      3349
   macro avg       0.75      0.75      0.75      3349
weighted avg       0.75      0.75      0.75      3349



In [37]:
# Confusion matrix for the balanced random forest predictions (y_pred3).
# Stored under its own names: this cell previously reused `et_cm` / `et_cmdf`,
# which overwrote the Extra Trees confusion matrix with these results.
brfc_cm = confusion_matrix(y_test, y_pred3)
brfc_cmdf = pd.DataFrame(
    brfc_cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3"],
    columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3"])
display(brfc_cmdf)

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,710,98,6,4
Actual 1,181,547,93,24
Actual 2,69,161,552,53
Actual 3,10,50,96,695


### Gradient Boosting Classifier

In [33]:
# Fit a gradient boosting classifier with 500 boosting stages, predict on the
# test split, then report the score on both splits
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(n_estimators=500, random_state=1)
gb_clf.fit(X_train_scaled, y_train)

y_pred4 = gb_clf.predict(X_test_scaled)

gb_train_score = gb_clf.score(X_train_scaled, y_train)
gb_test_score = gb_clf.score(X_test_scaled, y_test)
print(f'Training Score: {gb_train_score}')
print(f'Testing Score: {gb_test_score}')


Training Score: 0.879231381919554
Testing Score: 0.732756046581069


In [34]:
# Test-split accuracy and per-class report for the gradient boosting classifier
acc_score4 = accuracy_score(y_true=y_test, y_pred=y_pred4)
gb_report = classification_report(y_test, y_pred4)

print(f"Accuracy Score : {acc_score4}")
print("Classification Report")
print(gb_report)

Accuracy Score : 0.732756046581069
Classification Report
              precision    recall  f1-score   support

           0       0.74      0.86      0.79       818
           1       0.61      0.61      0.61       845
           2       0.69      0.65      0.67       835
           3       0.90      0.82      0.86       851

    accuracy                           0.73      3349
   macro avg       0.74      0.73      0.73      3349
weighted avg       0.74      0.73      0.73      3349



In [36]:
# Confusion matrix for the gradient boosting predictions, labelled by actual
# (rows) and predicted (columns) class number
actual_labels = ["Actual 0", "Actual 1", "Actual 2", "Actual 3"]
predicted_labels = ["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3"]

gb_cm = confusion_matrix(y_test, y_pred4)
gb_cmdf = pd.DataFrame(gb_cm, index=actual_labels, columns=predicted_labels)
display(gb_cmdf)

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,701,101,13,3
Actual 1,176,514,132,23
Actual 2,66,174,543,52
Actual 3,10,51,94,696
