In [0]:
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

In [3]:
# Upload your kaggle.json API information here (you can get this from my GitHub folder, just replace your username and key)

from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"thedbguy","key":"3dca6b34f878a254d7e6213655480010"}'}

In [4]:
!cp kaggle.json ~/.kaggle/
!kaggle datasets download -d stackoverflow/stack-overflow-2018-developer-survey


Downloading stack-overflow-2018-developer-survey.zip to /content
 46% 9.00M/19.6M [00:00<00:00, 48.0MB/s]
100% 19.6M/19.6M [00:00<00:00, 73.8MB/s]


In [6]:
# There are a number of multi-value columns that need to be separated into their own columns. For instance, "LanguagesWorkedWith" might contain the following
# values: JavaScript;Python;HTML;CSS
#
# We need to split these into their own columns so that we can analyze them.
import numpy as np
import pandas as pd
from zipfile import ZipFile

# Load the data
zip_file = ZipFile('/content/stack-overflow-2018-developer-survey.zip')
data=pd.read_csv(zip_file.open('survey_results_public.csv'))
data.head()

# Removed features with very high NA's
categorical_features = ['LanguageWorkedWith', 'LanguageDesireNextYear', 
                        'IDE', 'VersionControl', 'Hobby', 'OpenSource', 'Student', 'FormalEducation', 'UndergradMajor', 
                        'CompanySize', 'DevType', 'YearsCoding',
                        'YearsCodingProf', 'LastNewJob', 'CommunicationTools', 'TimeFullyProductive', 
                        'EducationTypes', 'AgreeDisagree1',	'AgreeDisagree2', 'AgreeDisagree3',
                        'OperatingSystem','NumberMonitors', 'CheckInCode', 'AIDangerous', 'AIFuture', 
                        'EthicsChoice', 'EthicsReport', 'EthicalImplications', 
                        'HypotheticalTools1', 'HypotheticalTools2', 'HypotheticalTools3', 'HypotheticalTools4', 
                        'HypotheticalTools5', 'WakeTime', 'HoursComputer', 'HoursOutside', 'SkipMeals', 'Exercise', 
                        'Age', 'SurveyTooLong', 'SurveyEasy', 'Employment', 'HopeFiveYears', 
                        'Gender']

# Filter Data
data = data[(data['JobSatisfaction'] == 'Extremely satisfied') | (data['JobSatisfaction'] == 'Extremely dissatisfied') | (data['JobSatisfaction'] == 'Moderately dissatisfied')]

# Add Target Variable
targets = ['Extremely satisfied']
data['JobSatisfactionTarget'] = np.where(np.in1d(data['JobSatisfaction'], targets), 1, 0)    

# Encode categorical features so that they're numeric.
for feature in categorical_features: 
    exploded = data[feature].str.get_dummies(sep=';').rename(lambda x: feature + '_' + x, axis='columns') 
    data = pd.concat([data, exploded], axis = 1)
    
numeric_features = ['ConvertedSalary', 'AssessJob1', 'AssessJob2', 'AssessJob3', 'AssessJob4', 'AssessJob5', 'AssessJob6', 'AssessJob7', 'AssessJob8', 
                    'AssessJob9', 'AssessJob10',	'AssessBenefits1',	'AssessBenefits2', 'AssessBenefits3', 'AssessBenefits4', 'AssessBenefits5',	 
                    'AssessBenefits6', 'AssessBenefits7',	'AssessBenefits8', 'AssessBenefits9',	 'AssessBenefits10', 'AssessBenefits11']

encoded_features = []

for column in data:
    for feature in categorical_features:
      if (feature in column):
        encoded_features.append(column)
        break

encoded_features = [x for x in encoded_features if x not in categorical_features]   

# These are the features with the best correlation (positive or negative to target variable)
encoded_features = [
'HopeFiveYears_Doing the same work',
"HopeFiveYears_Working in a different or more specialized technical role than the one I'm in now",
'HypotheticalTools4_Not at all interested',
'HopeFiveYears_Working in a career completely unrelated to software development',
'CommunicationTools_Slack',
'CheckInCode_Multiple times per day',
'EducationTypes_Contributed to open source software',
'AgreeDisagree1_Strongly agree',
'DevType_C-suite executive (CEO, CTO, etc.)',
'HypotheticalTools5_Not at all interested',
'HypotheticalTools4_Extremely interested',
'LastNewJob_Less than a year ago',
'AgreeDisagree1_Disagree',
'OpenSource_No',
'OpenSource_Yes',
'HypotheticalTools5_Extremely interested',
'Employment_Independent contractor, freelancer, or self-employed',
'AgreeDisagree1_Neither Agree nor Disagree',
'EducationTypes_Received on-the-job training in software development',
'OperatingSystem_MacOS'
]

all_features = numeric_features + encoded_features        

# Drop features that are not being used
for column in data.columns:
  if (column in all_features or column == 'JobSatisfactionTarget'):
    continue
  
  data.drop(column, axis=1, inplace=True)
  
# Create Train Test Set with Target Variable Transformation
from sklearn.model_selection import train_test_split

X = data.drop('JobSatisfactionTarget', axis=1)
y = data['JobSatisfactionTarget']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Load Saved Model (stackoverflow.pkl in the GitHub repo)
import io
import pickle

from google.colab import files
uploaded = files.upload()

model = pickle.load(io.BytesIO(uploaded['stackoverflow.pkl']))

Saving stackoverflow.pkl to stackoverflow.pkl


In [13]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Confusion Matrix to see how well we did.
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)

# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)

# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)

[[ 773  982]
 [ 439 2054]]
Accuracy: 0.665490
Precision: 0.676548
Recall: 0.823907
