# module5_project_delivery

## Summary

### Setup Dependencies 

In [1]:
import numpy as np
import pandas as pd
import pickle
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.metrics import roc_auc_score

In [3]:
# Restore the saved pipeline from disk.
# Unpickling can execute arbitrary code, so only load model files you trust.
with open('final_model.pk1', 'rb') as model_file:
    model = pickle.load(model_file)

  from numpy.core.umath_tests import inner1d


### Confirm your Model was Saved Correctly

In [4]:
model

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.33, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_i...imators=100, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False))])

In [6]:
df = pd.read_csv('data/analytical_base_table.csv')

In [8]:
# Target is the `status` column; every remaining column is a feature.
y = df['status']
X = df.drop(columns=['status'])

# `stratify=y` makes the split keep the same proportion of each `status`
# class in the training part and in the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
    stratify=y,
)

In [10]:
# predict_proba returns one column per class; column 1 is the probability of
# the second class in `model.classes_`, which is the single score per row
# that roc_auc_score expects. Slicing the array avoids a Python-level loop and
# avoids rebinding `prediction` to two different kinds of object.
prediction = model.predict_proba(X_test)[:, 1]

print('AUROC: ', roc_auc_score(y_test, prediction))

(2814, 2)
AUROC:  0.9911901855307785


In [12]:
raw_data = pd.read_csv('data/unseen_raw_data.csv')
print(raw_data.shape)
raw_data.head()

(750, 9)


Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure
0,228,management,,0.735618,2,,high,0.805661,3.0
1,229,product,,1.0,4,,low,0.719961,4.0
2,196,sales,1.0,0.557426,4,,low,0.749835,2.0
3,207,IT,,0.715171,3,,high,0.987447,3.0
4,129,management,,0.484818,2,,low,0.441219,3.0


In [13]:
# Demonstration: the saved pipeline cannot score the raw records directly
# because they still contain string columns. The expected ValueError is caught
# and shown so the notebook still runs top to bottom ("Restart & Run All").
try:
    prediction = model.predict_proba(raw_data)
except ValueError as err:
    print('ValueError:', err)

ValueError: could not convert string to float: 'low'

### Write pre-modeling functions

In [21]:
def clean_data(df):
    """Clean raw employee records before feature engineering.

    Steps, in order: drop duplicate rows, drop rows whose department is
    'temp', fill missing `filed_complaint` / `recently_promoted` with 0,
    rename the 'information_technology' department to 'IT', label missing
    departments as 'Missing', add a `last_evaluation_missing` indicator and
    finally fill missing `last_evaluation` values with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records containing at least the columns `department`,
        `filed_complaint`, `recently_promoted` and `last_evaluation`.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame. The frame passed in is not modified.
    """
    df = df.drop_duplicates()

    # Drop temporary workers. Rows with a missing department are kept
    # (NaN != 'temp') and are labelled 'Missing' below.
    # The explicit copy gives us a frame we own, so the assignments below
    # cannot raise SettingWithCopyWarning or write into the caller's data.
    df = df[df.department != 'temp'].copy()

    # A missing flag means "did not happen"
    df['filed_complaint'] = df.filed_complaint.fillna(0)
    df['recently_promoted'] = df.recently_promoted.fillna(0)

    # Normalise the department label, then mark missing departments.
    # Assign the result back instead of calling `inplace=True` on a selected
    # column: in-place calls on a column selection do not update the parent
    # frame when pandas Copy-on-Write is active.
    df['department'] = (
        df.department
        .replace('information_technology', 'IT')
        .fillna('Missing')
    )

    # The indicator must be built BEFORE the missing values are overwritten
    df['last_evaluation_missing'] = df.last_evaluation.isnull().astype(int)
    df['last_evaluation'] = df.last_evaluation.fillna(0)

    return df

In [22]:
cleaned_data = clean_data(raw_data)

cleaned_data.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure,last_evaluation_missing
0,228,management,0.0,0.735618,2,0.0,high,0.805661,3.0,0
1,229,product,0.0,1.0,4,0.0,low,0.719961,4.0,0
2,196,sales,1.0,0.557426,4,0.0,low,0.749835,2.0,0
3,207,IT,0.0,0.715171,3,0.0,high,0.987447,3.0,0
4,129,management,0.0,0.484818,2,0.0,low,0.441219,3.0,0


In [24]:
def enfineer_features(df):
    """Add indicator features and one-hot encode the categorical columns.

    The (misspelt) function name is kept because the notebook calls it
    under this name.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned records containing `last_evaluation`,
        `last_evaluation_missing`, `satisfaction`, `department` and `salary`.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the extra columns `underperformer`, `unhappy`
        and `overachiever`, and with `department` / `salary` replaced by
        dummy columns. The frame passed in is not modified.
    """
    # Work on a copy: adding columns directly would silently change the
    # caller's DataFrame (e.g. a frame already displayed in an earlier cell).
    df = df.copy()

    # Low evaluation score that is a real score, not the 0 used to fill gaps
    df['underperformer'] = ((df.last_evaluation < 0.6) & (df.last_evaluation_missing == 0)).astype(int)
    df['unhappy'] = (df.satisfaction < 0.2).astype(int)
    df['overachiever'] = ((df.last_evaluation > 0.8) & (df.satisfaction > 0.7)).astype(int)

    # Dummy columns are created only for the categories present in `df`
    return pd.get_dummies(df, columns=['department', 'salary'])

In [25]:
augmented_data = enfineer_features(cleaned_data)

augmented_data.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,satisfaction,tenure,last_evaluation_missing,underperformer,unhappy,...,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support,salary_high,salary_low,salary_medium
0,228,0.0,0.735618,2,0.0,0.805661,3.0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,229,0.0,1.0,4,0.0,0.719961,4.0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,196,1.0,0.557426,4,0.0,0.749835,2.0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
3,207,0.0,0.715171,3,0.0,0.987447,3.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,129,0.0,0.484818,2,0.0,0.441219,3.0,0,1,0,...,0,1,0,0,0,0,0,0,1,0


In [26]:
prediction = model.predict_proba(augmented_data)
print(prediction[:5])

[[1.   0.  ]
 [0.96 0.04]
 [1.   0.  ]
 [1.   0.  ]
 [0.   1.  ]]


### Construct a model class

In [30]:
class EmployeeRetentionModel:
    """Bundle a pickled model with the pre-modeling steps it needs.

    The wrapped model is loaded once from disk. `predict_proba` can then be
    fed raw records, cleaned records, or fully engineered features,
    depending on the `clean` / `augment` flags.
    """

    def __init__(self, model_location):
        """Load the pickled model stored at `model_location`.

        Parameters
        ----------
        model_location : str
            Path of the pickle file to open.
        """
        # NOTE: unpickling can execute arbitrary code - only load model files
        # that come from a trusted source.
        with open(model_location, 'rb') as f:
            self.model = pickle.load(f)

    def predict_proba(self, X_new, clean=True, augment=True):
        """Optionally pre-process `X_new`, then score it with the model.

        Parameters
        ----------
        X_new : pandas.DataFrame
            Records to score.
        clean : bool, default True
            Run `clean_data` first (use for raw records).
        augment : bool, default True
            Run `engineer_features` before scoring.

        Returns
        -------
        tuple
            `(X_new, probabilities)`: the frame that was actually passed to
            the model, and the result of the model's `predict_proba`.
        """
        if clean:
            X_new = self.clean_data(X_new)

        if augment:
            X_new = self.engineer_features(X_new)

        return X_new, self.model.predict_proba(X_new)

    def clean_data(self, df):
        """Return a cleaned copy of the raw records in `df`.

        Drops duplicate rows and 'temp' department rows, fills missing
        flags with 0, normalises department labels, and adds the
        `last_evaluation_missing` indicator before filling missing
        `last_evaluation` values with 0. The input frame is not modified.
        """
        # Drop duplicates
        df = df.drop_duplicates()

        # Drop temporary workers. The explicit copy gives us a frame we own,
        # so the assignments below never write into the caller's data and
        # never raise SettingWithCopyWarning.
        df = df[df.department != 'temp'].copy()

        # Missing filed_complaint / recently_promoted values should be 0
        df['filed_complaint'] = df.filed_complaint.fillna(0)
        df['recently_promoted'] = df.recently_promoted.fillna(0)

        # 'information_technology' should be 'IT'; missing becomes 'Missing'.
        # Assigned back rather than using `inplace=True` on a selected column,
        # which does not update the frame under pandas Copy-on-Write.
        df['department'] = (
            df.department
            .replace('information_technology', 'IT')
            .fillna('Missing')
        )

        # Indicator for missing last_evaluation - must precede the fill below
        df['last_evaluation_missing'] = df.last_evaluation.isnull().astype(int)

        # Fill missing values in last_evaluation with 0
        df['last_evaluation'] = df.last_evaluation.fillna(0)

        # Return cleaned dataframe
        return df

    def engineer_features(self, df):
        """Return a copy of `df` with indicator and dummy features added.

        Adds `underperformer`, `unhappy` and `overachiever`, then one-hot
        encodes `department` and `salary`. The input frame is not modified.
        """
        # Copy first so the caller's frame does not gain extra columns
        df = df.copy()

        # Create indicator features
        df['underperformer'] = ((df.last_evaluation < 0.6) & (df.last_evaluation_missing == 0)).astype(int)
        df['unhappy'] = (df.satisfaction < 0.2).astype(int)
        df['overachiever'] = ((df.last_evaluation > 0.8) & (df.satisfaction > 0.7)).astype(int)

        # Create new dataframe with dummy features (only for categories
        # that are present in `df`)
        df = pd.get_dummies(df, columns=['department', 'salary'])

        # Return augmented DataFrame
        return df

### Using the model through the class

In [31]:
retention_model = EmployeeRetentionModel('final_model.pk1')

In [35]:
# `predict_proba` returns a (features, probabilities) pair. Binding the first
# element to `_` simply discards the returned feature frame (X_new), because
# only the probabilities are needed here.

# 1) raw data: the model cleans and augments it
_, pred1 = retention_model.predict_proba(raw_data, clean=True, augment=True)

# 2) already-cleaned data: only feature engineering is still required
_, pred2 = retention_model.predict_proba(cleaned_data, clean=False, augment=True)

# 3) cleaned and augmented data: passed to the model as is
_, pred3 = retention_model.predict_proba(augmented_data, clean=False, augment=False)

for stage_pred in (pred1, pred2, pred3):
    print(stage_pred)

[[1.   0.  ]
 [0.96 0.04]
 [1.   0.  ]
 ...
 [0.97 0.03]
 [0.99 0.01]
 [1.   0.  ]]
[[1.   0.  ]
 [0.96 0.04]
 [1.   0.  ]
 ...
 [0.97 0.03]
 [0.99 0.01]
 [1.   0.  ]]
[[1.   0.  ]
 [0.96 0.04]
 [1.   0.  ]
 ...
 [0.97 0.03]
 [0.99 0.01]
 [1.   0.  ]]


In [36]:
# All three entry points must give identical probabilities, so this is expected to be True
all(np.array_equal(left, right) for left, right in ((pred1, pred2), (pred2, pred3)))

True