In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz
from graphviz import Graph
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import plotly.express as px
from datetime import date 
from wrangle import new_city_data, clean_city, missing_zero_values_table, train_validate_test_split
import explore

from model import run_model

In [2]:
df = new_city_data()
df = clean_city(df)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11910 entries, 12 to 11923
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   annual_salary_2016         11910 non-null  float64
 1   base_pay_2016              11910 non-null  float64
 2   leave_payout_2016          11910 non-null  float64
 3   other_2016                 11910 non-null  float64
 4   overtime_2016              11910 non-null  float64
 5   gross_earnings_2016        11910 non-null  float64
 6   additional_compensation    11910 non-null  float64
 7   total_compensation         11910 non-null  float64
 8   department                 11910 non-null  object 
 9   gender                     11910 non-null  object 
 10  ethnicity                  11910 non-null  object 
 11  years_employed             11910 non-null  int64  
 12  job_id                     11910 non-null  object 
 13  job_name                   11910 non-null  ob

In [4]:
train, validate, test = train_validate_test_split(df, seed=123)

In [5]:
# Seperate out our X and y values
X_train = train.drop(columns=['gender', 'department', 'ethnicity', 'job_name', 'job_id'])
y_train = train.gender

X_validate = validate.drop(columns=['gender', 'department', 'ethnicity', 'job_name', 'job_id'])
y_validate = validate.gender

X_test = test.drop(columns=['gender', 'department', 'ethnicity', 'job_name', 'job_id'])
y_test = test.gender

-----

<h3>Establish the Baseline</h3>

------

In [6]:
# The most frequenly observed outcome will be our baseline

train.gender.value_counts()

MALE      4355
FEMALE    2314
Name: gender, dtype: int64

In [7]:
baseline_accuracy = (round((train.gender == 'MALE').mean(), 2)* 100)

print(f'Our baseline accuracy is {baseline_accuracy}%')

Our baseline accuracy is 65.0%


-------

<h3>Logistic Regression Model</h3>

--------

In [8]:
# Create the logistic regression
logit = LogisticRegression(random_state=123)

# specify the features we're using
features = ['annual_salary_2016','base_pay_2016', 'leave_payout_2016', 'other_2016', 'overtime_2016', 'additional_compensation', 'total_compensation',
           'years_employed', 'ethnicity_ASIAN', 'ethnicity_BLACK', 'ethnicity_HISPANIC', 'ethnicity_NATIVE AMERICAN', 'ethnicity_NATIVE HAWAIIAN',
           'ethnicity_OTHER', 'ethnicity_WHITE']
# Fit a model using only these specified features
# logit.fit(X_train[["age", "pclass", "fare"]], y_train)
logit.fit(X_train[features], y_train)

# Since we .fit on a subset, we .predict on that same subset of features
y_pred = logit.predict(X_train[features])

print("Baseline is", round(baseline_accuracy, 2))
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train[features], y_train)))

Baseline is 65.0
Accuracy of Logistic Regression classifier on training set: 0.70


In [9]:
# Let's determine logit1's metrics on validate
features = ['annual_salary_2016','base_pay_2016', 'leave_payout_2016', 'other_2016', 'overtime_2016', 'additional_compensation', 'total_compensation',
           'years_employed', 'ethnicity_ASIAN', 'ethnicity_BLACK', 'ethnicity_HISPANIC', 'ethnicity_NATIVE AMERICAN', 'ethnicity_NATIVE HAWAIIAN',
           'ethnicity_OTHER', 'ethnicity_WHITE']
y_pred = logit.predict(X_validate[features])

print('Logit1 model using salary data, ethnicity, and years employed')
print(classification_report(y_validate, y_pred))

Logit1 model using salary data, ethnicity, and years employed
              precision    recall  f1-score   support

      FEMALE       0.58      0.45      0.51       992
        MALE       0.74      0.83      0.78      1867

    accuracy                           0.70      2859
   macro avg       0.66      0.64      0.64      2859
weighted avg       0.68      0.70      0.69      2859



-------

<h3>KNN Model</h3>

--------

In [10]:
# Now let's make the model
knn = KNeighborsClassifier()

In [11]:
features = ['annual_salary_2016','base_pay_2016', 'leave_payout_2016', 'other_2016', 'overtime_2016', 'additional_compensation', 'total_compensation',
           'years_employed', 'ethnicity_ASIAN', 'ethnicity_BLACK', 'ethnicity_HISPANIC', 'ethnicity_NATIVE AMERICAN', 'ethnicity_NATIVE HAWAIIAN',
           'ethnicity_OTHER', 'ethnicity_WHITE']

In [12]:
# Now let's train the model!
knn.fit(X_train[features], y_train)

# Let's check the accuracy
accuracy = knn.score(X_train[features], y_train)
print(f"accuracy is {accuracy:.3}")

accuracy is 0.799


In [13]:
# Evaluate the model
y_pred = knn.predict(X_train[features])

In [14]:
# Let's check our other classification metrics
# y_train is the actual labels for the target variable
# y_pred is the predictions that the model makes based off our X features
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

      FEMALE       0.72      0.69      0.70      2314
        MALE       0.84      0.86      0.85      4355

    accuracy                           0.80      6669
   macro avg       0.78      0.77      0.78      6669
weighted avg       0.80      0.80      0.80      6669



In [15]:
# Let's check the accuracy
accuracy = knn.score(X_train[features], y_train)
print(f"accuracy is {accuracy:.3}")

accuracy is 0.799


features = ['annual_salary_2016','base_pay_2016', 'leave_payout_2016', 'other_2016', 'overtime_2016', 'additional_compensation', 'total_compensation',
           'years_employed', 'ethnicity_ASIAN', 'ethnicity_BLACK', 'ethnicity_HISPANIC', 'ethnicity_NATIVE AMERICAN', 'ethnicity_NATIVE HAWAIIAN',
           'ethnicity_OTHER', 'ethnicity_WHITE']

-------

<h3>Random Forest</h3>

--------

In [16]:
features = ['annual_salary_2016','base_pay_2016', 'leave_payout_2016', 'other_2016', 'overtime_2016', 'additional_compensation', 'total_compensation',
           'years_employed', 'ethnicity_ASIAN', 'ethnicity_BLACK', 'ethnicity_HISPANIC', 'ethnicity_NATIVE AMERICAN', 'ethnicity_NATIVE HAWAIIAN',
           'ethnicity_OTHER', 'ethnicity_WHITE']

In [17]:
def get_metrics_binary(rf):
    '''
    get_metrics_binary takes in a confusion matrix (cnf) for a binary classifier and prints out metrics based on
    values in variables named X_train, y_train, and y_pred.
    
    return: a classification report as a transposed DataFrame
    '''
    accuracy = rf.score(X_train[features], y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
    conf = confusion_matrix(y_train, y_pred)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [18]:
# create the Random Forest Model
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [19]:
rf.fit(X_train[features], y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [20]:
y_pred = rf.predict(X_train[features])

In [21]:
y_pred_proba = rf.predict_proba(X_train[features])

In [22]:
class_report = get_metrics_binary(rf)
class_report


    The accuracy for our model is 0.8481
    The True Positive Rate is 0.879, The False Positive Rate is 0.21,
    The True Negative Rate is 0.79, and the False Negative Rate is 0.121
    


Unnamed: 0,precision,recall,f1-score,support
FEMALE,0.776456,0.789542,0.782944,2314.0
MALE,0.887164,0.879219,0.883174,4355.0
accuracy,0.848103,0.848103,0.848103,0.848103
macro avg,0.83181,0.834381,0.833059,6669.0
weighted avg,0.848751,0.848103,0.848396,6669.0


In [23]:
print('Accuracy: {:.2f}'.format(rf.score(X_validate[features], y_validate)))

y_pred = rf.predict(X_validate[features])



print(classification_report(y_validate, y_pred))


Accuracy: 0.75
              precision    recall  f1-score   support

      FEMALE       0.64      0.61      0.62       992
        MALE       0.80      0.82      0.81      1867

    accuracy                           0.75      2859
   macro avg       0.72      0.71      0.72      2859
weighted avg       0.74      0.75      0.74      2859



-------

<h3>Decision Tree</h3>

--------

In [24]:
# Let's generate a blank, new Decision Tree model
# Be sure to set the max_depth argument
# clf = DecisionTreeClassifier(max_depth=3, random_state=123)

clf = DecisionTreeClassifier(max_depth=7, random_state=123)

# Now let's train our model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf


DecisionTreeClassifier(max_depth=7, random_state=123)

In [25]:
# Visualize the model so it can explain itself!

dot_data = export_graphviz(clf, feature_names= X_train.columns,rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('COSA_salary_decision_tree', view=True, format="pdf")

'COSA_salary_decision_tree.pdf'

In [26]:
# Now we'll make a set of predictions using this trained model
y_pred = clf.predict(X_train)

# Estimate the probabilities for each class
y_pred_proba = clf.predict_proba(X_train)

In [27]:
# Let's evaluate the model
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.77


In [28]:
class_report = classification_report(y_train, y_pred, output_dict=True)
pd.DataFrame(class_report).T

Unnamed: 0,precision,recall,f1-score,support
FEMALE,0.693308,0.622299,0.655887,2314.0
MALE,0.809669,0.853731,0.831117,4355.0
accuracy,0.773429,0.773429,0.773429,0.773429
macro avg,0.751488,0.738015,0.743502,6669.0
weighted avg,0.769294,0.773429,0.770316,6669.0


In [29]:
y_pred = clf.predict(X_validate)

In [30]:
# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

      FEMALE       0.64      0.56      0.60       992
        MALE       0.78      0.83      0.80      1867

    accuracy                           0.74      2859
   macro avg       0.71      0.70      0.70      2859
weighted avg       0.73      0.74      0.73      2859



In [31]:

logreg_train_accuracy = logit.score(X_train[features], y_train)
decisiontree_train_accuracy = clf.score(X_train, y_train)
random_forest_train_accuracy = rf.score(X_train[features], y_train)
knn_train_accuracy = knn.score(X_train[features], y_train)

logreg_validate_accuracy = logit.score(X_validate[features], y_validate)
decisiontree_validate_accuracy = clf.score(X_validate, y_validate)
random_forest_validate_accuracy = rf.score(X_validate[features], y_validate)
knn_validate_accuracy = knn.score(X_validate[features], y_validate)
print('\n')
print(f'The baseline accuracy is {baseline_accuracy}%')
print('\n')

print(f"Logistic Regression Train Accuracy --- {logreg_train_accuracy:.2%}")
print(f"Decision Tree Train Accuracy --- {decisiontree_train_accuracy:.2%}")
print(f"Random Forest Train Accuracy --- {random_forest_train_accuracy:.2%}")
print(f"KNN Accuracy Train Accuracy --- {knn_train_accuracy:.2%}")
print('\n')
print(f"Logistic Regression Validate Accuracy --- {logreg_validate_accuracy:.2%}")
print(f"Decision Tree Validate Accuracy --- {decisiontree_validate_accuracy:.2%}")
print(f"Random Forest Validate Accuracy --- {random_forest_validate_accuracy:.2%}")
print(f"KNN Validate Accuracy --- {knn_validate_accuracy:.2%}")



The baseline accuracy is 65.0%


Logistic Regression Train Accuracy --- 69.74%
Decision Tree Train Accuracy --- 77.34%
Random Forest Train Accuracy --- 84.81%
KNN Accuracy Train Accuracy --- 79.94%


Logistic Regression Validate Accuracy --- 69.67%
Decision Tree Validate Accuracy --- 73.70%
Random Forest Validate Accuracy --- 74.57%
KNN Validate Accuracy --- 71.18%


In [32]:
random_forest_test_accuracy = rf.score(X_test[features], y_test)
print(f"Random Forest Accuracy --- {random_forest_test_accuracy:.2%}")

Random Forest Accuracy --- 75.61%


In [34]:
run_model()

Unnamed: 0_level_0,Actual_Gender,Model_Predictions,Model_Probabilities
REF #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10813,FEMALE,FEMALE,0.317292
1100,MALE,MALE,0.989238
5554,MALE,MALE,0.758551
10933,FEMALE,FEMALE,0.281459
2643,MALE,MALE,0.624311
...,...,...,...
7535,MALE,MALE,0.814952
6509,FEMALE,FEMALE,0.277255
11837,MALE,MALE,0.665670
93,MALE,MALE,0.955197
