In [71]:
#import just about everything that I'll need
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

# Load the survey responses; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="survey_cleaner1.csv")

Cleaning data

In [73]:
# Discard the "state" column and rebind df to the narrower frame.
df = df.drop(labels="state", axis="columns")

In [74]:
# Median of the Age column (not referenced again in the cells visible here).
median = df.loc[:, "Age"].median()
# Last expression of the cell, so the notebook displays the column labels.
df.columns

Index(['Unnamed: 0', 'Age', 'Gender', 'Country', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

In [75]:
# Find the columns that contain nulls
df.info()

# self_employed has 18 nulls and work_interfere has 263 nulls.
# Print the checks explicitly: a bare expression is only displayed when it is
# the last line of a cell, so these results were previously discarded.
print(df[["self_employed", "work_interfere"]].isnull().sum())
print(df["work_interfere"].unique())

# Replace NaNs in work_interfere with "Don't Know".
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute: that is a chained in-place update on an intermediate Series,
# which pandas warns about and which does not modify df under Copy-on-Write.
df["work_interfere"] = df["work_interfere"].fillna("Don't Know")

# Replace NaNs in self_employed with "No" because only 18 values are null
df["self_employed"] = df["self_employed"].fillna("No")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 1253 non-null   int64 
 1   Age                        1253 non-null   int64 
 2   Gender                     1253 non-null   object
 3   Country                    1253 non-null   object
 4   self_employed              1235 non-null   object
 5   family_history             1253 non-null   object
 6   treatment                  1253 non-null   object
 7   work_interfere             990 non-null    object
 8   no_employees               1253 non-null   object
 9   remote_work                1253 non-null   object
 10  tech_company               1253 non-null   object
 11  benefits                   1253 non-null   object
 12  care_options               1253 non-null   object
 13  wellness_program           1253 non-null   object
 14  seek_hel

End of cleaning. 

**Modeling begins**

In [85]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Encode the non-numeric columns as integer codes with LabelEncoder.
# Numeric columns are skipped so that Age keeps its real values for the
# scaling step below instead of being replaced by rank codes first.
# The fitted encoders are kept per column so the original categories can be
# recovered later via label_encoders[col].classes_.
label_encoders = {}
for col in df.select_dtypes(exclude="number").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Scale "Age" to the [0, 1] range so it is closer to the encoded columns.
# ravel() flattens the (n, 1) array returned by the scaler to one dimension.
# NOTE(review): the scaler is fitted on all rows before the train/test split;
# fit it on the training rows only if a scale-sensitive model is added later.
scaler = MinMaxScaler()
df['Age'] = scaler.fit_transform(df[['Age']]).ravel()

# Split data into features and label (X, y)
y = df.treatment # label/dependent variable
# "Unnamed: 0" is presumably the row index written out by an earlier to_csv;
# it is excluded so the tree cannot split on a row counter - TODO confirm.
feature_columns = [col for col in df.columns if col not in ("treatment", "Unnamed: 0")]
X = df[feature_columns] # features/independent variables

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=56) # 70% training and 30% test

# Decision Tree classifier object; the fixed seed makes the fitted tree
# reproducible across kernel restarts
clf = DecisionTreeClassifier(random_state=56)

# Train Decision Tree classifier
clf = clf.fit(X_train, y_train)

# Try model on test dataset
y_predict = clf.predict(X_test)

Evaluating Model - how often was it correct

The following function evaluates:
* Classification accuracy: percentage of correct predictions
* Null accuracy: accuracy that could be achieved by always predicting the most frequent class
* Percentage of ones (treatment=1, did get treatment)
* Percentage of zeros (treatment=0, did not get treatment)
* Confusion matrix: Table that describes the performance of a classification model
    * True Positives (TP): we correctly predicted that the target variable occurred (treatment) i.e. the person did seek treatment for mental health
    * True Negatives (TN): we correctly predicted that the target variable DID NOT occur i.e. the person did NOT seek treatment
    * False Positives (FP): we predicted that they got treatment, when in fact they did not
    * False Negatives (FN): we predicted that they did not get treatment, when in fact they did
* False Positive Rate
* Precision: when a positive value is predicted, how often is the prediction correct? (TP / (TP + FP))
* AUC: the percentage of the ROC plot that is underneath the curve
    * 0.9-1 = excellent (A)
    * 0.8-0.9 = good (B)
    * 0.7-0.8 = okay (C)
    * 0.6-0.7 = poor (D)
    * 0.5-0.6 = fail (F)


In [None]:
def _rates_at_threshold(fpr, tpr, thresholds, threshold):
    """Return ``(sensitivity, specificity)`` for a probability cut-off.

    Uses the last ROC point whose threshold is strictly greater than
    ``threshold``. If no ROC threshold exceeds the cut-off, the first ROC
    point is used instead of raising ``IndexError``.
    """
    above = thresholds > threshold
    idx = np.flatnonzero(above)[-1] if above.any() else 0
    return tpr[idx], 1 - fpr[idx]


def evaluate_model(model, y_test, y_pred, X_test, plot=False, X=None, y=None, cv=5, threshold=0.5):
    """Print and plot evaluation metrics for a fitted binary classifier.

    Reports accuracy, null accuracy, the class balance, a confusion matrix
    (also drawn as a heatmap), false positive rate, precision, AUC, and the
    sensitivity/specificity at ``threshold``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    y_test : pandas Series of true 0/1 labels for the test rows.
    y_pred : array of predicted 0/1 labels for the same rows.
    X_test : feature rows matching ``y_test``; used for ``predict_proba``.
    plot : when true, also draw the ROC curve.
    X, y : full feature matrix and labels for the cross-validated AUC. The
        cross-validation step is skipped when either is omitted.
    cv : number of cross-validation folds.
    threshold : probability cut-off for the sensitivity/specificity report
        and the thresholded confusion matrix.

    Returns
    -------
    The classification accuracy of ``y_pred`` against ``y_test``.
    """
    # Function-scope imports: the notebook binds `plt` to the top-level
    # `matplotlib` package, which has no title()/show(), and it never imports
    # cross_val_score.
    import matplotlib.pyplot as plt
    from sklearn import metrics
    from sklearn.model_selection import cross_val_score

    # Classification accuracy: percentage of correct predictions
    class_accuracy = metrics.accuracy_score(y_test, y_pred)
    print('Accuracy:', class_accuracy)

    # Class distribution of the testing set (using a pandas Series method)
    print('Class distribution:\n', y_test.value_counts())

    # Null accuracy: accuracy achieved by always predicting the most frequent class
    positive_share = y_test.mean()
    print('Null accuracy:', max(positive_share, 1 - positive_share))

    # Share of ones, i.e. people who got mental health treatment
    print('Percentage of people who got treatment:', positive_share)

    # Share of zeros, i.e. people who DID NOT get mental health treatment
    print('Percentage of people who did not get treatment:', 1 - positive_share)

    # Side-by-side preview of the first 25 true and predicted labels only;
    # the slice is just to keep the printout short.
    print('True:', y_test.values[0:25])
    print('Predicted:', y_pred[0:25])

    # Confusion matrix. labels=[0, 1] forces a 2x2 matrix even when a class
    # is absent, so the four-way unpacking below cannot fail.
    confusion = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = confusion.ravel()

    # Visualize the confusion matrix
    fig, ax = plt.subplots()
    sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

    # Metrics computed from the confusion matrix

    # Classification Accuracy: how often was the classifier correct, overall?
    print('Classification Accuracy:', class_accuracy)

    # Classification Error: how often was the classifier incorrect, overall?
    print('Classification Error:', 1 - class_accuracy)

    # False Positive Rate: share of actual negatives predicted as positive
    negatives = tn + fp
    fp_rate = fp / negatives if negatives else float('nan')
    print('False Positive Rate:', fp_rate)

    # Precision: when a positive value is predicted, how often is the prediction correct?
    print('Precision:', metrics.precision_score(y_test, y_pred))

    ###########################################
    # ROC Curves and Area Under the Curve (AUC)
    ###########################################

    # AUC computed from the hard 0/1 predictions (a single operating point)
    print('AUC Score', metrics.roc_auc_score(y_test, y_pred))

    # Cross-validated AUC needs the full data set, passed in explicitly
    # rather than read from notebook globals.
    if X is not None and y is not None:
        print('Cross-validated AUC:', cross_val_score(model, X, y, cv=cv, scoring='roc_auc').mean())
    else:
        print('Cross-validated AUC: skipped (pass X and y to compute it)')

    # Predicted PROBABILITIES for treatment = 1
    y_prob = model.predict_proba(X_test)[:, 1]

    # AUC from the probabilities: the area under the ROC curve.
    # Higher value = better classifier.
    roc_auc = metrics.roc_auc_score(y_test, y_prob)
    print('AUC Score (from predicted probabilities):', roc_auc)

    # First argument is the true values, second is the predicted
    # probabilities; passing y_pred would give a wrong curve without an error.
    # roc_curve returns: false positive rate, true positive rate, thresholds
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)

    if plot:
        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, color="green", label="ROC curve (area= %0.2f)" % roc_auc)
        ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
        ax.set_xlabel('False Positive Rate (1 - Specificity)')
        ax.set_ylabel('True Positive Rate (Sensitivity)')
        ax.legend(loc="lower right")
        plt.show()

    # Sensitivity: when the actual value is positive, how often is the prediction correct?
    # Specificity: when the actual value is negative, how often is the prediction correct?
    sensitivity, specificity = _rates_at_threshold(fpr, tpr, thresholds, threshold)
    print('Sensitivity for ' + str(threshold) + ':', sensitivity)
    print('Specificity for ' + str(threshold) + ':', specificity)

    # Confusion matrix after applying the cut-off to the probabilities
    predict_mine = np.where(y_prob > threshold, 1, 0)
    print(metrics.confusion_matrix(y_test, predict_mine, labels=[0, 1]))

    return class_accuracy
    

**TODO:** How do these results intersect with the large group of people I identified with UpSetR? If I split the data into that group and everyone else, how well do the model(s) hold up on each part?