<b>Dependencies:</b> <br>
    import pandas as pd <br><br>
    from sklearn.ensemble import RandomForestClassifier <br>
    from sklearn.model_selection import train_test_split <br>
    from sklearn.metrics import accuracy_score <br>
    from sklearn.metrics import precision_score <br>
    from sklearn.metrics import recall_score <br>
    from sklearn.metrics import f1_score <br>


In [1]:
# RANDOM FOREST FEATURE SELECTION 80-20

import pandas as pd

# Models
from sklearn.ensemble import RandomForestClassifier

# Evaluation methods
# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: False Negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


# --- Data -------------------------------------------------------------------
# Train and test sets are read from separate, already-split CSV files
# (named 80pc / 20pc); only the selected feature columns are used.
diabetes_training = pd.read_csv('../datasets/diabetes_train_data_80pc.csv')
diabetes_test = pd.read_csv('../datasets/diabetes_test_data_20pc.csv')

selected_features = ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']
random_seed = 3  # fixed seed so the forest is built identically on every run

X_train, y_train = diabetes_training[selected_features], diabetes_training.Outcome
X_test, y_test = diabetes_test[selected_features], diabetes_test.Outcome

# --- Model ------------------------------------------------------------------
# fit() returns the fitted estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=random_seed).fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation -------------------------------------------------------------
# Score the held-out predictions with each metric, in the order reported below.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# --- Report -----------------------------------------------------------------
print("Train/Test Split after Feature Selection")
results_train_test = pd.DataFrame(
    {
        'Model': ["RF"],
        'Accuracy': [accuracy],
        'Precision ': [precision],
        'Recall': [recall],
        'f1': [f1],
    }
)
print(results_train_test, "\n\n")



Train/Test Split after Feature Selection
  Model  Accuracy  Precision     Recall        f1
0    RF  0.724138    0.636364  0.538462  0.583333 




In [1]:
# AVERAGE RANDOM FOREST FEATURE SELECTION 80-20

import pandas as pd
import random

# Models
from sklearn.ensemble import RandomForestClassifier

# Evaluation methods
from sklearn.model_selection import train_test_split

# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: False Negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


# Load the full cleaned dataset; a fresh stratified 80/20 split is drawn from it
# on every repetition below.
diabetes_cleaned = pd.read_csv('../datasets/diabetes_cleaned.csv')
selected_features = ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X = diabetes_cleaned[selected_features]
y = diabetes_cleaned.Outcome

# Per-repetition scores; they are averaged once the loop has finished.
accuracy = []
precision = []
recall = []
f1 = []

times_repeated = 100

# The per-repetition seeds come from a dedicated, seeded generator so that
# Restart & Run All reproduces the same sequence of splits and forests (and
# therefore the same averages). A private random.Random instance is used instead
# of random.seed() so the global generator's state is left untouched.
# NOTE: averages printed by earlier runs that used the unseeded global generator
# will not match these exactly.
base_seed = 3
seed_generator = random.Random(base_seed)

for _ in range(times_repeated):
    # One seed drives both the split and the forest for this repetition.
    random_seed = seed_generator.randint(0, 1000)

    model = RandomForestClassifier(random_state=random_seed)

    # Stratify on the target so each split keeps the class proportions of y.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        stratify=y, random_state=random_seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))


# Report the arithmetic mean of each metric over all repetitions.
print("Average Train/Test Split after Feature Selection")
results_train_test = pd.DataFrame({'Model': ["RF"],
                                   'Accuracy': [sum(accuracy)/len(accuracy)],
                                   'Precision ': [sum(precision)/len(precision)],
                                   'Recall': [sum(recall)/len(recall)],
                                   'f1': [sum(f1)/len(f1)],})
print(results_train_test, "\n\n")



Average Train/Test Split after Feature Selection
  Model  Accuracy  Precision   Recall        f1
0    RF  0.760483    0.666258  0.6184  0.639603 




In [3]:
# RANDOM FOREST DATA AUGMENTATION 80-20

import pandas as pd

# Models
from sklearn.ensemble import RandomForestClassifier

# Evaluation methods
# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: False Negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


# --- Data -------------------------------------------------------------------
# Training rows come from the '80pc_100times_10' CSV (the augmented set, per this
# cell's title); the test rows come from the separate 20pc CSV.
diabetes_da_training = pd.read_csv('../datasets/diabetes_train_data_80pc_100times_10.csv')
diabetes_da_test = pd.read_csv('../datasets/diabetes_test_data_20pc.csv')

# Every predictor column is used here (no feature selection).
features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
random_seed = 3  # fixed seed so the forest is built identically on every run

X_train, y_train = diabetes_da_training[features], diabetes_da_training.Outcome
X_test, y_test = diabetes_da_test[features], diabetes_da_test.Outcome

# --- Model ------------------------------------------------------------------
# fit() returns the fitted estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=random_seed).fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation -------------------------------------------------------------
# Score the held-out predictions with each metric, in the order reported below.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# --- Report -----------------------------------------------------------------
print("Train/Test Split after Data Augmentation")
results_train_test = pd.DataFrame(
    {
        'Model': ["RF"],
        'Accuracy': [accuracy],
        'Precision ': [precision],
        'Recall': [recall],
        'f1': [f1],
    }
)
print(results_train_test, "\n\n")



Train/Test Split after Data Augmentation
  Model  Accuracy  Precision     Recall        f1
0    RF  0.744828    0.674419  0.557692  0.610526 




In [6]:
# RANDOM FOREST FEATURE SELECTION + DATA AUGMENTATION 80-20

import pandas as pd

# Models
from sklearn.ensemble import RandomForestClassifier

# Evaluation methods
# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: False Negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


# --- Data -------------------------------------------------------------------
# Training rows come from the '80pc_100times_10' CSV (the augmented set, per this
# cell's title); the test rows come from the separate 20pc CSV.
diabetes_da_training = pd.read_csv('../datasets/diabetes_train_data_80pc_100times_10.csv')
diabetes_da_test = pd.read_csv('../datasets/diabetes_test_data_20pc.csv')

# Only the selected feature subset is fed to the model.
selected_features = ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']
random_seed = 3  # fixed seed so the forest is built identically on every run

X_train, y_train = diabetes_da_training[selected_features], diabetes_da_training.Outcome
X_test, y_test = diabetes_da_test[selected_features], diabetes_da_test.Outcome

# --- Model ------------------------------------------------------------------
# fit() returns the fitted estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=random_seed).fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation -------------------------------------------------------------
# Score the held-out predictions with each metric, in the order reported below.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# --- Report -----------------------------------------------------------------
print("Train/Test Split after Feature Selection and Data Augmentation")
results_train_test = pd.DataFrame(
    {
        'Model': ["RF"],
        'Accuracy': [accuracy],
        'Precision ': [precision],
        'Recall': [recall],
        'f1': [f1],
    }
)
print(results_train_test, "\n\n")




Train/Test Split after Feature Selection and Data Augmentation
  Model  Accuracy  Precision     Recall        f1
0    RF  0.703448    0.595745  0.538462  0.565657 


