# Models and Evaluation

In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import randint
from sklearn.preprocessing import binarize
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
 
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, precision_recall_curve, classification_report
from sklearn.model_selection import cross_val_score


from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import *

import warnings
warnings.filterwarnings("ignore")

  from numpy.core.umath_tests import inner1d
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ._gradient_boosting import predict_stages
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ._gradient_boosting import predict_stages
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-not

In [2]:
df=pd.read_csv('../datasets/final.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,Are you self-employed?,How many employees does your company or organization have?,Is your employer primarily a tech company/organization?,Is your primary role within your company related to tech/IT?,Does your employer provide mental health benefits as part of healthcare coverage?,Do you know the options for mental health care available under your employer-provided health coverage?,"Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?",Does your employer offer resources to learn more about mental health disorders and options for seeking help?,...,Are you openly identified at work as a person with a mental health issue?,"If they knew you suffered from a mental health disorder, how do you think that your team members/co-workers would react?",Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?,Have you observed or experienced a supportive or well handled response to a mental health issue in your current or previous workplace?,"Overall, how well do you think the tech industry supports employees with mental health issues?",What is your age?,What is your gender?,What country do you live in?,What is your race?,What country do you work in?
0,0,1,0,4,1,1,0,1,0,2,...,0.0,10.0,2,2,1.0,27.0,0,50,3,52
1,1,1,0,4,1,1,1,1,0,0,...,0.0,6.0,3,0,2.0,31.0,3,50,3,52
2,2,1,0,2,1,1,2,0,2,0,...,1.0,5.0,2,2,1.0,36.0,3,51,3,53
3,3,1,0,4,1,0,1,0,0,2,...,0.0,4.0,3,3,2.0,30.0,3,51,3,53
4,4,1,0,2,1,1,1,1,0,0,...,1.0,5.0,1,3,2.0,36.0,0,51,2,53


In [4]:
#Scaling this column to between 0 and 1
df['How many employees does your company or organization have?'] =(df['How many employees does your company or organization have?'] - min(df['How many employees does your company or organization have?'])) / (max(df['How many employees does your company or organization have?']) - min(df['How many employees does your company or organization have?']))

## Features and dependent variables

In [5]:
X = df.drop(columns=['Would you feel comfortable discussing a mental health issue with your coworkers?','Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?'], axis =1)
X = df.drop(columns=['Unnamed: 0'])#,'How many employees does your company or organization have?'],axis=1)#,'Is your employer primarily a tech company/organization?','Is your primary role within your company related to tech/IT?' ])
Y1=df['Would you feel comfortable discussing a mental health issue with your coworkers?']
Y2=df['Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?']
print('y1 :',Y1.value_counts())
print('y2 :',Y2.value_counts())

y1 : 2    405
1    320
0    194
Name: Would you feel comfortable discussing a mental health issue with your coworkers?, dtype: int64
y2 : 1    363
2    310
0    246
Name: Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?, dtype: int64


## Splitting the data into training and test set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y1_train, y1_test = train_test_split(X, Y1, test_size = 0.2, random_state = 25)
X_train, X_test, y2_train, y2_test = train_test_split(X, Y2, test_size = 0.2, random_state = 25)

## Implementing Machine Learning Models


### 1.Logistic Regression


In [7]:
pipe1 = make_pipeline(StandardScaler(), LogisticRegression())
pipe1.fit(X_train, y1_train)  
print('Score for y1:',pipe1.score(X_test, y1_test))
print('Cross validation score for logistic regression to classify y1:',cross_val_score(pipe1, X, Y1, cv=10))
pipe1.fit(X_train, y2_train) 
print('Score for y2:',pipe1.score(X_test, y2_test))
print('Cross validation score for logistic regression to classify y2:',cross_val_score(pipe1, X, Y2, cv=10))

Score for y1: 0.9293478260869565
Cross validation score for logistic regression to classify y1: [0.94623656 0.90322581 0.95698925 0.96774194 1.         0.98901099
 0.98901099 0.98901099 0.97802198 0.89010989]
Score for y2: 0.9782608695652174
Cross validation score for logistic regression to classify y2: [0.96774194 0.98924731 0.98924731 0.98913043 0.94565217 0.98913043
 0.98901099 0.98901099 0.95604396 0.94505495]


### 2.SVM

In [8]:
pipe2 = make_pipeline(StandardScaler(), SVC(kernel = 'linear'))
pipe2.fit(X_train, y1_train)  
print('Score for y1:',pipe2.score(X_test, y1_test))
print('Cross validation score for SVM to classify y1:',cross_val_score(pipe2, X, Y1, cv=10))
pipe2.fit(X_train, y2_train) 
print('Score for y2:',pipe2.score(X_test, y2_test))
print('Cross validation score for SVM to classify y2:',cross_val_score(pipe2, X, Y2, cv=10))


Score for y1: 1.0
Cross validation score for SVM to classify y1: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Score for y2: 1.0
Cross validation score for SVM to classify y2: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


### 3.K Nearest Neighbors classifier

In [9]:
pipe3 = make_pipeline(StandardScaler(),KNeighborsClassifier(n_neighbors=3))
pipe3.fit(X_train, y1_train)  
print('Score for y1:',pipe3.score(X_test, y1_test))
print('Cross validation score for KNN to classify y1:',cross_val_score(pipe3, X, Y1, cv=10))
pipe3.fit(X_train, y2_train) 
print('Score for y2:',pipe3.score(X_test, y2_test))
print('Cross validation score for KNN to classify y2:',cross_val_score(pipe3, X, Y2, cv=10))

Score for y1: 0.6956521739130435
Cross validation score for KNN to classify y1: [0.64516129 0.68817204 0.76344086 0.66666667 0.76086957 0.67032967
 0.76923077 0.71428571 0.73626374 0.59340659]
Score for y2: 0.6684782608695652
Cross validation score for KNN to classify y2: [0.62365591 0.60215054 0.67741935 0.67391304 0.61956522 0.65217391
 0.69230769 0.63736264 0.67032967 0.61538462]


### 4.Decision Tree


In [10]:
pipe4 = make_pipeline(StandardScaler(),DecisionTreeClassifier(random_state=10))
pipe4.fit(X_train, y1_train)  
print('Score for y1:',pipe4.score(X_test, y1_test))
print('Cross validation score for Decision Tree to classify y1:',cross_val_score(pipe4, X, Y1, cv=10))
pipe4.fit(X_train, y2_train) 
print('Score for y2:',pipe4.score(X_test, y2_test))
print('Cross validation score for Decision Tree to classify y2:',cross_val_score(pipe4, X, Y2, cv=10))

Score for y1: 1.0
Cross validation score for Decision Tree to classify y1: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Score for y2: 1.0
Cross validation score for Decision Tree to classify y2: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


### 5.Random Forest Classifier

In [11]:
pipe5 = make_pipeline(StandardScaler(),RandomForestClassifier(max_depth=6,random_state=10))
pipe5.fit(X_train, y1_train)  
print('Score for y1:',pipe5.score(X_test, y1_test))
print('Cross validation score for Random Forest classifier to classify y1:',cross_val_score(pipe5, X, Y1, cv=10))
pipe5.fit(X_train, y2_train) 
print('Score for y2:',pipe5.score(X_test, y2_test))
print('Cross validation score for Random Forest  Classifier to classify y2:',cross_val_score(pipe5, X, Y2, cv=10))

Score for y1: 0.9728260869565217
Cross validation score for Random Forest classifier to classify y1: [0.93548387 0.89247312 0.94623656 0.97849462 0.91304348 0.97802198
 0.97802198 0.98901099 0.97802198 0.89010989]
Score for y2: 0.967391304347826
Cross validation score for Random Forest  Classifier to classify y2: [0.97849462 0.97849462 0.95698925 0.93478261 0.9673913  0.94565217
 1.         0.93406593 0.94505495 0.95604396]


### 6.GridSearch

In [12]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = SVC()
pipe6 = make_pipeline(StandardScaler(),GridSearchCV(svc, parameters))
pipe6.fit(X_train, y1_train)  
print('Score for y1:',pipe6.score(X_test, y1_test))
print('Cross validation score for Grid Search to classify y1:',cross_val_score(pipe6, X, Y1, cv=10))
pipe6.fit(X_train, y2_train) 
print('Score for y2:',pipe6.score(X_test, y2_test))
print('Cross validation score for Grid Search to classify y2:',cross_val_score(pipe6, X, Y2, cv=10))

Score for y1: 1.0
Cross validation score for Grid Search to classify y1: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Score for y2: 1.0
Cross validation score for Grid Search to classify y2: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


### 7.BaggingClassifier

In [13]:
'''from sklearn.model_selection import train_test_split
X_train, X_test, y1_train, y1_test = train_test_split(X, Y1, test_size = 0.2, random_state = 25)
X_train, X_test, y2_train, y2_test = train_test_split(X, Y2, test_size = 0.2, random_state = 25)
clf = BaggingClassifier(base_estimator=SVC(),n_estimators=10, random_state=0).fit(X_train, y1_train)
clf.score(X_test,y1_test)
'''

'from sklearn.model_selection import train_test_split\nX_train, X_test, y1_train, y1_test = train_test_split(X, Y1, test_size = 0.2, random_state = 25)\nX_train, X_test, y2_train, y2_test = train_test_split(X, Y2, test_size = 0.2, random_state = 25)\nclf = BaggingClassifier(base_estimator=SVC(),n_estimators=10, random_state=0).fit(X_train, y1_train)\nclf.score(X_test,y1_test)\n'

In [14]:
pipe7 = make_pipeline(StandardScaler(),BaggingClassifier(base_estimator=SVC(),n_estimators=4))
pipe7.fit(X_train, y1_train)  
print('Score for y1:',pipe7.score(X_test, y1_test))
print('Cross validation score for Bagging Classifier to classify y1:',cross_val_score(pipe7, X, Y1, cv=10))
pipe7.fit(X_train, y2_train) 
print('Score for y2:',pipe7.score(X_test, y2_test))
print('Cross validation score for Bagging Classifier to classify y2:',cross_val_score(pipe7, X, Y2, cv=10))

Score for y1: 0.9510869565217391
Cross validation score for Bagging Classifier to classify y1: [0.94623656 0.94623656 0.97849462 1.         1.         0.98901099
 0.98901099 1.         1.         0.95604396]
Score for y2: 0.967391304347826
Cross validation score for Bagging Classifier to classify y2: [0.97849462 0.98924731 0.97849462 0.97826087 0.9673913  1.
 0.97802198 0.97802198 0.96703297 0.95604396]


### 8.AdaBoostClassifier

In [15]:
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y1_train)
clf.score(X_test,y1_test)

1.0

In [16]:
pipe8 = make_pipeline(StandardScaler(),AdaBoostClassifier(n_estimators=150, random_state=0))
pipe8.fit(X_train, y1_train)  
print('Score for y1:',pipe8.score(X_test, y1_test))
print('Cross validation score for AdaBoost classifier to classify y1:',cross_val_score(pipe8, X, Y1, cv=10))
pipe8.fit(X_train, y2_train) 
print('Score for y2:',pipe8.score(X_test, y2_test))
print('Cross validation score for AdaBoost classifier to classify y2:',cross_val_score(pipe8, X, Y2, cv=10))

Score for y1: 1.0
Cross validation score for AdaBoost classifier to classify y1: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Score for y2: 1.0
Cross validation score for AdaBoost classifier to classify y2: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
