# RandomForestClassifier OneHotEncode Target Values

"y" is 4 columns binary

Tried 500 estimators
Then 200

In [50]:
#Import Dependencies

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf

In [51]:
#SOURCE: https://www.tutorialspoint.com/python_data_access/python_postgresql_database_connection.htm
import psycopg2

#establishing the connection
conn = psycopg2.connect(
   database="Body_Performance", user='postgres', password='Cheese6132', host='localhost', port= '5432'
)
#Creating a cursor object using the cursor() method
cursor = conn.cursor()

#Executing an MYSQL function using the execute() method
cursor.execute("select version()")

# Fetch a single row using fetchone() method.
data = cursor.fetchone()
print("Connection established to: ",data)



Connection established to:  ('PostgreSQL 12.13, compiled by Visual C++ build 1914, 64-bit',)


In [52]:
body_df = pd.read_sql('select * from body_performance', con=conn)
body_df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body_fat_pct,diastolic,systolic,gripforce,sit_and_bend_forward_cm,sit_ups_counts,broad_jump_cm,ranking
0,27,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [53]:
#Closing the connection
conn.close()


In [54]:
# OneHotEncoder or Get_dummies on Gender
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

encoded = pd.DataFrame(enc.fit_transform(body_df.gender.values.reshape(-1,1)))

encoded.columns = enc.get_feature_names_out(['gender'])
encoded.head()


Unnamed: 0,gender_F,gender_M
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [56]:
#Using oneHotEncoder for class into 4 separate

encode2 = pd.DataFrame(enc.fit_transform(body_df.ranking.values.reshape(-1,1)))

encode2.columns = enc.get_feature_names_out(['ranking'])
encode2.head()

Unnamed: 0,ranking_A,ranking_B,ranking_C,ranking_D
0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0


In [57]:
#Drop columns and merge encoded df
#Ignoring encode2 for now
body_df = body_df.drop(["gender", "ranking"], axis=1)

body_df2 = body_df.merge(encoded, left_index=True, right_index=True)
body_df2.head()


Unnamed: 0,age,height_cm,weight_kg,body_fat_pct,diastolic,systolic,gripforce,sit_and_bend_forward_cm,sit_ups_counts,broad_jump_cm,gender_F,gender_M
0,27,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,0.0,1.0
1,25,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,0.0,1.0
2,31,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,0.0,1.0
3,32,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,0.0,1.0
4,28,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,0.0,1.0


In [58]:
#Set X, y 
X = body_df2.copy()
y = encode2

print(X.shape, y.shape)

(13393, 12) (13393, 4)


In [65]:
#Split train/test data and scale using Standard Scaler at this time (mean of 0).  #Should we use MinMax(0-1)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train_scaled.shape)
print(X_test_scaled.shape)

(10044, 12)
(3349, 12)


In [66]:
#Trying RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=1, n_estimators=2000).fit(X_train_scaled, y_train)
y_pred = rf_clf.predict(X_test_scaled)

print(f'Training Score: {rf_clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {rf_clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.5816661690056734


In [67]:
feature_importances = rf_clf.feature_importances_
sorted(zip(rf_clf.feature_importances_, X.columns), reverse=True)

[(0.25265756891818464, 'sit_and_bend_forward_cm'),
 (0.13266872375975014, 'sit_ups_counts'),
 (0.0986801365441132, 'body_fat_pct'),
 (0.08810792835678868, 'age'),
 (0.0865026430498234, 'weight_kg'),
 (0.08145090683636741, 'gripforce'),
 (0.07581261570502851, 'broad_jump_cm'),
 (0.06405597953553088, 'height_cm'),
 (0.05283705904800945, 'systolic'),
 (0.0500826066030843, 'diastolic'),
 (0.008660151580930866, 'gender_M'),
 (0.008483680062388586, 'gender_F')]

In [68]:
#Accuracy score and confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

predictions = rf_clf.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.5816661690056734


In [69]:
# printing confusion matrix but error:      Shape of passed values is (4, 4), indices imply (2, 2) Needs to be resolved
cm = confusion_matrix(y_test.values.argmax(axis=1), predictions.argmax(axis=1))
#cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3"],
    columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3"])
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,803,15,0,0
Actual 1,508,292,36,9
Actual 2,341,55,410,29
Actual 3,136,11,50,654


In [70]:
#Classification report for random forest

print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       818
           1       0.78      0.35      0.48       845
           2       0.83      0.49      0.62       835
           3       0.95      0.77      0.85       851

   micro avg       0.85      0.58      0.69      3349
   macro avg       0.84      0.58      0.68      3349
weighted avg       0.84      0.58      0.68      3349
 samples avg       0.58      0.58      0.58      3349



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Need to test feature importances. THis is one way but plot is not showing up how I'd like. Need to adjust.

features = sorted(zip(X.columns, rf_model.feature_importances_), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(10,200)
plt.margins(y=0.001)

ax.barh(y=cols, width=width)

plt.show()

In [None]:
# Binning of Age? if so will need to also be encoded/scaled.

# Open in Google Collab or here and import tensor flow to run as Neural Network

# Try MinMax Scaler  #Should I scale entire 

# Try with multiply Y columns as output. Can I do this with trees or only neural network?