## Importing:

In [1]:
import pandas as pd
import numpy as np
import gc
pd.set_option('display.max_rows', 200)

In [2]:
col_list = ['GenderSelect', 'Country', 'Age', 'EmploymentStatus', 'StudentStatus', 'LearningDataScience', 
            'CodeWriter', 'CareerSwitcher', 'CurrentEmployerType', 'MLToolNextYearSelect', 'MLMethodNextYearSelect',
            'LanguageRecommendationSelect', 'PublicDatasetsSelect', 'BlogsPodcastsNewslettersSelect', 'LearningDataScienceTime', 'JobSkillImportanceBigData', 
            'JobSkillImportanceDegree', 'JobSkillImportanceStats', 'JobSkillImportanceEnterpriseTools', 
            'JobSkillImportancePython', 'JobSkillImportanceR', 'JobSkillImportanceSQL', 'JobSkillImportanceKaggleRanking',
            'JobSkillImportanceMOOC', 'JobSkillImportanceVisualizations', 'ProveKnowledgeSelect', 'FormalEducation', 
            'MajorSelect', 'Tenure', 'PastJobTitlesSelect', 'FirstTrainingSelect', 'LearningCategorySelftTaught', 
            'LearningCategoryOnlineCourses', 'LearningCategoryWork', 'LearningCategoryUniversity', 'LearningCategoryKaggle', 
            'LearningCategoryOther', 'MLSkillsSelect', 'MLTechniquesSelect', 'JobHuntTime']

df = pd.read_csv('Kaggle_Survey.csv', encoding = 'latin-1', usecols = col_list)

In [3]:
# Selecting data where there is a non-null entry for job-time:
   # Missing not at random: (review)
jobtime_mask = df['JobHuntTime'].notnull()
job_time = df[jobtime_mask]

In [4]:
del df, jobtime_mask, col_list
gc.collect()

5

### Cleaning/Exploratory Analysis:

#### Descriptive Stats:

In [5]:
pd.set_option('display.max_columns', 45)
descr = job_time.describe(include = 'all')

In [6]:
# Looking at nulls (did drop some columns that were all-nulls from our read in line above)
    # Also dropped LearningPlatformSelect since it had a significant amount of unique values 
    # and would have required significant cleaning. 

info_df = pd.DataFrame(job_time.isna().sum())
info_df['unique_vals'] = descr.iloc[1]
info_df['top_val'] = descr.iloc[2]
info_df['top_freq'] = descr.iloc[3]
info_df['dtype'] = job_time.dtypes

# To describe:
    # display info_df dataframe.
    # display descr stats
    # Display ['StudentStatus:CurrentEmployerType'] and describe why you are dropping them
    # Dropping columns with nullcounts above 20% (i.e. 740)
        # Tenur / learning categories
    # Filled Blogs/Media with nothing listed - (for keepings sake)
    # Highest Null count after this was 440 - so we imputed the mode here.
    # highest null count after that is relatively low - so dropping just before modeling should be safe.
    # Display output feature and discuss

In [7]:
job_time.drop(['EmploymentStatus', 'StudentStatus', 
               'LearningDataScience', 'CodeWriter',
               'CareerSwitcher', 'CurrentEmployerType',
               'Tenure', 'LearningCategorySelftTaught',
               'LearningCategoryOnlineCourses',  
               'LearningCategoryWork', 'LearningCategoryUniversity',
               'LearningCategoryKaggle', 'LearningCategoryOther'], axis = 1, inplace = True)

In [42]:
job_time['BlogsPodcastsNewslettersSelect'].fillna('No Info/No Answer Given', inplace = True) 


In [24]:
col_list = ['MLToolNextYearSelect', 'MLMethodNextYearSelect', 
            'LanguageRecommendationSelect', 'PublicDatasetsSelect', 
            'JobSkillImportanceBigData', 'JobSkillImportanceDegree', 
            'JobSkillImportanceStats','JobSkillImportanceEnterpriseTools', 
            'JobSkillImportancePython','JobSkillImportanceR', 
            'JobSkillImportanceSQL', 'JobSkillImportanceKaggleRanking', 
            'JobSkillImportanceMOOC','JobSkillImportanceVisualizations', 
            'ProveKnowledgeSelect','MajorSelect', 'PastJobTitlesSelect',
            'MLSkillsSelect', 'MLTechniquesSelect']

for x in col_list:
    
    alpha = job_time[x].value_counts().index[0]
    job_time[x].replace(np.nan, alpha, inplace = True)

In [25]:
job_time.isna().sum()

GenderSelect                          8
Country                               8
Age                                  32
MLToolNextYearSelect                  0
MLMethodNextYearSelect                0
LanguageRecommendationSelect          0
PublicDatasetsSelect                  0
BlogsPodcastsNewslettersSelect        0
LearningDataScienceTime               2
JobSkillImportanceBigData             0
JobSkillImportanceDegree              0
JobSkillImportanceStats               0
JobSkillImportanceEnterpriseTools     0
JobSkillImportancePython              0
JobSkillImportanceR                   0
JobSkillImportanceSQL                 0
JobSkillImportanceKaggleRanking       0
JobSkillImportanceMOOC                0
JobSkillImportanceVisualizations      0
ProveKnowledgeSelect                  0
FormalEducation                      10
MajorSelect                           0
PastJobTitlesSelect                   0
FirstTrainingSelect                   5
MLSkillsSelect                        0


In [21]:
# Mode:!!!  can do fillna with this.  (no need for a lambda func)
job_time.JobHuntTime.value_counts().values[0]

1500

#### Pre-Dummy Prep:
> Mainly to prevent an overly large feature-set in our results and maintain as much human-readability as possible.  Took the columns with the most unique values and cut them down.

In [26]:
job_time_prepped = pd.DataFrame()

In [27]:
import re

# BlogsPodcastsNewslettersSelect:
job_time_prepped['KDNuggets'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(KDnuggets.*)', flags = re.IGNORECASE)
job_time_prepped['RBloggers'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(R Bloggers.*)', flags = re.IGNORECASE)
job_time_prepped['SirajRaval'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(Siraj Raval.*)', flags = re.IGNORECASE)
job_time_prepped['OReilly'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(O\'Reilly Data.*)', flags = re.IGNORECASE)
job_time_prepped['NoFreeHunch'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(No Free Hunch.*)', flags = re.IGNORECASE)
job_time_prepped['BecomingDS'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(Becoming.*)', flags = re.IGNORECASE)
job_time_prepped['StatisticalModeling'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(Statistical Modeling.*)', flags = re.IGNORECASE)
job_time_prepped['FastML'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(FastML)', flags = re.IGNORECASE)
job_time_prepped['DataMachina'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(Data Machina.*)', flags = re.IGNORECASE)
job_time_prepped['DataSkeptic'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(The Data Skeptic.*)', flags = re.IGNORECASE)
job_time_prepped['LinDigressions'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(Linear Digressions.*)', flags = re.IGNORECASE)
job_time_prepped['DataElixir'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(Data Elixir.*)', flags = re.IGNORECASE)
job_time_prepped['FlowingData'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(FlowingData.*)', flags = re.IGNORECASE)
job_time_prepped['PartiallyDerivative'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(Partially Derivative.*)', flags = re.IGNORECASE)
job_time_prepped['JacksImport'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(Jack\'s Import.*)', flags = re.IGNORECASE)
job_time_prepped['AnalyticsDispatch'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(The Analytics Dispatch.*)', flags = re.IGNORECASE)
job_time_prepped['DataStories'] = job_time['BlogsPodcastsNewslettersSelect'].str.contains(r'(Data Stories.*)', flags = re.IGNORECASE)



  after removing the cwd from sys.path.
  """
  
  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]
  
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [28]:
# PublicDataSetsSelect:
job_time_prepped['Aggregator'] = job_time['PublicDatasetsSelect'].str.contains(r'(Dataset aggregator.*)', flags = re.IGNORECASE)
job_time_prepped['SelfCollectData'] = job_time['PublicDatasetsSelect'].str.contains(r'(I collect.*)', flags = re.IGNORECASE)
job_time_prepped['GoogleSearch'] = job_time['PublicDatasetsSelect'].str.contains(r'(Google Search)', flags = re.IGNORECASE)
job_time_prepped['University'] = job_time['PublicDatasetsSelect'].str.contains(r'(University.*)', flags = re.IGNORECASE)
job_time_prepped['Govt'] = job_time['PublicDatasetsSelect'].str.contains(r'(Government.*)', flags = re.IGNORECASE)
job_time_prepped['OtherDataCollect'] = job_time['PublicDatasetsSelect'].str.contains(r'(Other)', flags = re.IGNORECASE)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys


In [29]:
# MLTechniquesSelect:
job_time_prepped['Bayesian'] = job_time['MLTechniquesSelect'].str.contains(r'(Bayesian Techniques.*)', flags = re.IGNORECASE)
job_time_prepped['DT_GBC'] = job_time['MLTechniquesSelect'].str.contains(r'(Decision Trees - Gradient.*)', flags = re.IGNORECASE)
job_time_prepped['DT_RF'] = job_time['MLTechniquesSelect'].str.contains(r'(Decision Trees - Random.*)', flags = re.IGNORECASE)
job_time_prepped['Ensembles'] = job_time['MLTechniquesSelect'].str.contains(r'(Ensemble Methods)', flags = re.IGNORECASE)
job_time_prepped['LogisticReg'] = job_time['MLTechniquesSelect'].str.contains(r'(Logistic Regression)', flags = re.IGNORECASE)
job_time_prepped['MarkovNetworks'] = job_time['MLTechniquesSelect'].str.contains(r'(Markov Logic Networks)', flags = re.IGNORECASE)
job_time_prepped['Neural_CNN'] = job_time['MLTechniquesSelect'].str.contains(r'(Neural Networks - CNNs)', flags = re.IGNORECASE)
job_time_prepped['Neural_RNN'] = job_time['MLTechniquesSelect'].str.contains(r'(Neural Networks - RNNs)', flags = re.IGNORECASE)
job_time_prepped['SVM'] = job_time['MLTechniquesSelect'].str.contains(r'(Support Vector Machines (SVMs))', flags = re.IGNORECASE)
job_time_prepped['HiddenMarkov'] = job_time['MLTechniquesSelect'].str.contains(r'(Hidden Markov Models HMMs)', flags = re.IGNORECASE)
job_time_prepped['OtherMLTechnique'] = job_time['MLTechniquesSelect'].str.contains(r'(Other)', flags = re.IGNORECASE)
job_time_prepped['Neural_GAN'] = job_time['MLTechniquesSelect'].str.contains(r'(Neural Networks - GANs)', flags = re.IGNORECASE)
job_time_prepped['Evolutionary'] = job_time['MLTechniquesSelect'].str.contains(r'(Evolutionary Approaches)', flags = re.IGNORECASE)



  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]
  


In [30]:
# MLSkillsSelect:
job_time_prepped['CVision'] = job_time['MLSkillsSelect'].str.contains(r'(Computer Vision.*)', flags = re.IGNORECASE)
job_time_prepped['NLP'] = job_time['MLSkillsSelect'].str.contains(r'(Natural Language.*)', flags = re.IGNORECASE)
job_time_prepped['SurAn'] = job_time['MLSkillsSelect'].str.contains(r'(Survival Analysis.*)', flags = re.IGNORECASE)
job_time_prepped['OutDetect'] = job_time['MLSkillsSelect'].str.contains(r'(Outlier detection.*)', flags = re.IGNORECASE)
job_time_prepped['ReccEngines'] = job_time['MLSkillsSelect'].str.contains(r'(Reccomendation Engines.*)', flags = re.IGNORECASE)
job_time_prepped['SupML'] = job_time['MLSkillsSelect'].str.contains(r'(Supervised Machine Learning.*)', flags = re.IGNORECASE)
job_time_prepped['UnsupML'] = job_time['MLSkillsSelect'].str.contains(r'(Unsupervised Machine Learning.*)', flags = re.IGNORECASE)
job_time_prepped['TimeSeries'] = job_time['MLSkillsSelect'].str.contains(r'(Time Series.*)', flags = re.IGNORECASE)
job_time_prepped['ReinforceL'] = job_time['MLSkillsSelect'].str.contains(r'(Reinforcement Learning.*)', flags = re.IGNORECASE)
job_time_prepped['AdvL'] = job_time['MLSkillsSelect'].str.contains(r'(Adversarial Learning.*)', flags = re.IGNORECASE)
job_time_prepped['MachineTrans'] = job_time['MLSkillsSelect'].str.contains(r'(Machine Translation.*)', flags = re.IGNORECASE)
job_time_prepped['SpeechRec'] = job_time['MLSkillsSelect'].str.contains(r'(Speech Recognition.*)', flags = re.IGNORECASE)



  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]


In [31]:
job_time2Y = job_time.drop(['BlogsPodcastsNewslettersSelect', 'PublicDatasetsSelect', 
                           'MLTechniquesSelect', 'MLSkillsSelect'], axis = 1)

In [32]:
job_time2Y = job_time2Y.join(job_time_prepped, on = job_time_prepped.index)

#### Some Visuals:

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
sns.scatterplot(x = 'Age', y = job_time2Y.index, hue = 'GenderSelect', data = job_time2Y)

In [None]:
plt.hist(job_time2Y['Age'], range = [-1, 110])

### Pre-Liminary Models

#### Getting Dummies / Train, Test, Split, etc.

In [33]:
job_time2 = job_time2Y.drop('JobHuntTime', axis = 1)

In [36]:
from sklearn.model_selection import train_test_split

X = job_time2.dropna(how = 'any')
Y = job_time2Y['JobHuntTime'].loc[X.index]

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .25)

In [38]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [39]:
from sklearn.neural_network import MLPClassifier

In [40]:
MLP1 = MLPClassifier(hidden_layer_sizes=[100, 120, 130])

In [41]:
MLP1.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=[100, 120, 130], learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)