# Can I build a predictive model?
### By: Sonia Rowley & Athina Schmidt

In [1]:
# Import Fundamental Pacakges
import numpy as np
import pandas as pd

# Import Packages for Machine Learning Algorithms
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Import Library for Statistical & Regular Functions
from statistics import mode
import re

# This Package imports our classifer
from xgboost import XGBClassifier

In [2]:
# Original data source: https://www.foreignlaborcert.doleta.gov/performancedata.cfm 


In [3]:
# Read in File
df = pd.read_csv('2018.csv', encoding = 'latin1')

In [4]:
df.info()
df. head()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 609036 entries, 0 to 609035
Data columns (total 15 columns):
CASE_NUMBER              609036 non-null object
CASE_STATUS              609036 non-null object
CASE_SUBMITTED           609035 non-null object
DECISION_DATE            609036 non-null object
VISA_CLASS               609036 non-null object
EMPLOYMENT_START_DATE    609029 non-null object
EMPLOYMENT_END_DATE      609027 non-null object
EMPLOYER_NAME            609017 non-null object
EMPLOYER_STATE           608967 non-null object
EMPLOYER_POSTAL_CODE     609020 non-null object
JOB_TITLE                609031 non-null object
SOC_CODE                 609032 non-null object
SOC_NAME                 609032 non-null object
NAICS_CODE               609030 non-null float64
PREVAILING_WAGE          609035 non-null float64
dtypes: float64(2), object(13)
memory usage: 69.7+ MB


Unnamed: 0,NAICS_CODE,PREVAILING_WAGE
count,609030.0,609035.0
mean,443514.3775,88118.67
std,195499.314088,1281586.0
min,23.0,0.0
25%,452910.0,68702.0
50%,541511.0,82326.0
75%,541511.0,99341.0
max,928120.0,1000000000.0


In [5]:
df['CASE_STATUS'].unique()

array(['CERTIFIED', 'DENIED', 'WITHDRAWN', 'CERTIFIED-WITHDRAWN'],
      dtype=object)

In [6]:
# Convert from ultiple classes of target variables to a binary system.
# Certified + Certified-Withdrawn = CERTIFIED

In [7]:
import warnings
warnings.filterwarnings("ignore")
df.CASE_STATUS[df['CASE_STATUS']=='CERTIFIED-WITHDRAWN'] = 'CERTIFIED'

In [8]:
##Drop rows with withdrawn
df.EMPLOYER_NAME.describe()
df = df.drop(df[df.CASE_STATUS == 'WITHDRAWN'].index)

## Storing non null in df w.r.t. case status
df = df[df['CASE_STATUS'].notnull()]
print(df['CASE_STATUS'].value_counts())

CERTIFIED    582638
DENIED         6981
Name: CASE_STATUS, dtype: int64


In [9]:
print(26398/(26398+582638))

0.043343907420907796


In [10]:
# Check count of NAN
count_nan = len(df) - df.count()
print(count_nan)

CASE_NUMBER               0
CASE_STATUS               0
CASE_SUBMITTED            1
DECISION_DATE             0
VISA_CLASS                0
EMPLOYMENT_START_DATE     5
EMPLOYMENT_END_DATE       6
EMPLOYER_NAME            15
EMPLOYER_STATE           59
EMPLOYER_POSTAL_CODE     14
JOB_TITLE                 2
SOC_CODE                  2
SOC_NAME                  2
NAICS_CODE                4
PREVAILING_WAGE           1
dtype: int64


In [11]:
## Filling na in employer name with mode
df['EMPLOYER_NAME'] = df['EMPLOYER_NAME'].fillna(df['EMPLOYER_NAME'].mode()[0])

In [12]:
# Fixing other N/A Values as above
df = df.drop('CASE_SUBMITTED', axis = 1)
df = df.drop('EMPLOYMENT_START_DATE', axis = 1)
df = df.drop('EMPLOYMENT_END_DATE', axis = 1)
df = df.drop('EMPLOYER_POSTAL_CODE', axis = 1)
df = df.drop('NAICS_CODE', axis = 1)


In [13]:
# Fixing 49 missing values for state
df['EMPLOYER_STATE'] = df['EMPLOYER_STATE'].fillna(df['EMPLOYER_STATE'].mode()[0])

In [14]:
assert pd.notnull(df['EMPLOYER_NAME']).all().all()

In [15]:
##to check the percentile in wages (Caps wages at 2nd and 98th percentile to get rid of $0 and positions entered)
print(np.nanpercentile(df.PREVAILING_WAGE,98))
df.PREVAILING_WAGE.median()

160576.0


82410.0

In [16]:
## replacing min and max with 2 and 98 percentile
df.loc[df.PREVAILING_WAGE < 34029, 'PREVAILING_WAGE']= 34029
df.loc[df['PREVAILING_WAGE'] > 138703, 'PREVAILING_WAGE']= 138703
df.PREVAILING_WAGE.fillna(df.PREVAILING_WAGE.mean(), inplace = True)

In [17]:
## Filling na in JOB_TITLE and FULL_TIME_POSITION with mode
df['JOB_TITLE'] = df['JOB_TITLE'].fillna(df['JOB_TITLE'].mode()[0])
df['SOC_NAME'] = df['SOC_NAME'].fillna(df['SOC_NAME'].mode()[0])
df['SOC_CODE'] = df['SOC_CODE'].fillna(df['SOC_CODE'].mode()[0])

In [18]:
df['NEW_EMPLOYER'] = np.nan
df.shape

(589619, 11)

In [19]:
warnings.filterwarnings("ignore")

df['EMPLOYER_NAME'] = df['EMPLOYER_NAME'].str.lower()
df.NEW_EMPLOYER[df['EMPLOYER_NAME'].str.contains('university')] = 'university'
df['NEW_EMPLOYER']= df.NEW_EMPLOYER.replace(np.nan, 'non university', regex=True)

In [20]:
# Creating occupation and mapping the values
warnings.filterwarnings("ignore")

df['OCCUPATION'] = np.nan
df['SOC_NAME'] = df['SOC_NAME'].str.lower()
df.OCCUPATION[df['SOC_NAME'].str.contains('computer','programmer')] = 'computer occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('software','web developer')] = 'computer occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('database')] = 'computer occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('math','statistic')] = 'Mathematical Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('predictive model','stats')] = 'Mathematical Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('teacher','linguist')] = 'Education Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('professor','Teach')] = 'Education Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('school principal')] = 'Education Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('medical','doctor')] = 'Medical Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('physician','dentist')] = 'Medical Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('Health','Physical Therapists')] = 'Medical Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('surgeon','nurse')] = 'Medical Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('psychiatr')] = 'Medical Occupations'
df.OCCUPATION[df['SOC_NAME'].str.contains('chemist','physicist')] = 'Advance Sciences'
df.OCCUPATION[df['SOC_NAME'].str.contains('biology','scientist')] = 'Advance Sciences'
df.OCCUPATION[df['SOC_NAME'].str.contains('biologi','clinical research')] = 'Advance Sciences'
df.OCCUPATION[df['SOC_NAME'].str.contains('public relation','manage')] = 'Management Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('management','operation')] = 'Management Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('chief','plan')] = 'Management Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('executive')] = 'Management Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('advertis','marketing')] = 'Marketing Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('promotion','market research')] = 'Marketing Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('business','business analyst')] = 'Business Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('business systems analyst')] = 'Business Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('accountant','finance')] = 'Financial Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('financial')] = 'Financial Occupation'
df.OCCUPATION[df['SOC_NAME'].str.contains('engineer','architect')] = 'Architecture & Engineering'
df.OCCUPATION[df['SOC_NAME'].str.contains('surveyor','carto')] = 'Architecture & Engineering'
df.OCCUPATION[df['SOC_NAME'].str.contains('technician','drafter')] = 'Architecture & Engineering'
df.OCCUPATION[df['SOC_NAME'].str.contains('information security','information tech')] = 'Architecture & Engineering'
df['OCCUPATION']= df.OCCUPATION.replace(np.nan, 'Others', regex=True)

In [21]:
## Simplifying Worksite State and capturing it in another variable
df['state'] = df['EMPLOYER_STATE']


In [22]:
print(df.head())

          CASE_NUMBER CASE_STATUS DECISION_DATE VISA_CLASS  \
0  I-200-18026-338377   CERTIFIED      2/2/2018       H-1B   
1  I-200-17296-353451   CERTIFIED    10/27/2017       H-1B   
2  I-200-18242-524477   CERTIFIED      9/6/2018       H-1B   
3  I-200-18070-575236   CERTIFIED     3/30/2018       H-1B   
4  I-200-18243-850522   CERTIFIED      9/7/2018       H-1B   

                 EMPLOYER_NAME EMPLOYER_STATE  \
0        microsoft corporation             WA   
1       ernst & young u.s. llp             NJ   
2                 logixhub llc             TX   
3  hexaware technologies, inc.             NJ   
4             ecloud labs,inc.             NJ   

                                      JOB_TITLE SOC_CODE  \
0                             SOFTWARE ENGINEER  15-1132   
1                                    TAX SENIOR  13-2011   
2                        DATABASE ADMINISTRATOR  15-1141   
3                             SOFTWARE ENGINEER  15-1132   
4  MICROSOFT DYNAMICS CRM APPLIC

In [23]:
from sklearn import preprocessing
class_mapping = {'CERTIFIED':0, 'DENIED':1}
df["CASE_STATUS"] = df["CASE_STATUS"].map(class_mapping)

In [24]:
print(df.head())

          CASE_NUMBER  CASE_STATUS DECISION_DATE VISA_CLASS  \
0  I-200-18026-338377            0      2/2/2018       H-1B   
1  I-200-17296-353451            0    10/27/2017       H-1B   
2  I-200-18242-524477            0      9/6/2018       H-1B   
3  I-200-18070-575236            0     3/30/2018       H-1B   
4  I-200-18243-850522            0      9/7/2018       H-1B   

                 EMPLOYER_NAME EMPLOYER_STATE  \
0        microsoft corporation             WA   
1       ernst & young u.s. llp             NJ   
2                 logixhub llc             TX   
3  hexaware technologies, inc.             NJ   
4             ecloud labs,inc.             NJ   

                                      JOB_TITLE SOC_CODE  \
0                             SOFTWARE ENGINEER  15-1132   
1                                    TAX SENIOR  13-2011   
2                        DATABASE ADMINISTRATOR  15-1141   
3                             SOFTWARE ENGINEER  15-1132   
4  MICROSOFT DYNAMICS CRM 

In [25]:
test1 = pd.Series(df['JOB_TITLE'].ravel()).unique()
print(pd.DataFrame(test1))

                                                       0
0                                      SOFTWARE ENGINEER
1                                             TAX SENIOR
2                                 DATABASE ADMINISTRATOR
3           MICROSOFT DYNAMICS CRM APPLICATION DEVELOPER
4                                SENIOR SYSTEM ARCHITECT
5                            SENIOR ORACLE ADF DEVELOPER
6                            SENIOR SYSTEMS ANALYST JC60
7                  ASSOCIATE PRODUCT MANAGER(15-1199.09)
8                                  SENIOR JAVA DEVELOPER
9                                     SOFTWARE DEVELOPER
10                          GLOBAL MMA ADVANCED ENGINEER
11                                  TECHNICAL CONSULTANT
12                                              ENGINEER
13                                   SOLUTIONS ARCHITECT
14     MEMBER TECHNICAL STAFF CONSULTANT-SYSTEMS ENGI...
15                       BUSINESS DEVELOPMENT SPECIALIST
16                             

In [26]:
# dropping these columns
df = df.drop('EMPLOYER_NAME', axis = 1)
df = df.drop('SOC_NAME', axis = 1)
df = df.drop('CASE_NUMBER', axis = 1)

In [27]:
df1 = df.copy()

In [28]:
df1[['CASE_STATUS','NEW_EMPLOYER','OCCUPATION','state']] = df1[['CASE_STATUS','NEW_EMPLOYER','OCCUPATION','state']].apply(lambda x: x.astype('category'))

In [29]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 589619 entries, 0 to 609033
Data columns (total 10 columns):
CASE_STATUS        589619 non-null category
DECISION_DATE      589619 non-null object
VISA_CLASS         589619 non-null object
EMPLOYER_STATE     589619 non-null object
JOB_TITLE          589619 non-null object
SOC_CODE           589619 non-null object
PREVAILING_WAGE    589619 non-null float64
NEW_EMPLOYER       589619 non-null category
OCCUPATION         589619 non-null category
state              589619 non-null category
dtypes: category(4), float64(1), object(5)
memory usage: 33.7+ MB


## Splitting Data in Training and Test Sets

It's a standard practice to split the dataset into a training and testing set. The reason behind this is that you should fit and train your model using the training set, and then finally predict and check your accuracy on the test set.

In [30]:
X = df.drop('CASE_STATUS', axis=1)
y = df.CASE_STATUS

seed = 7
test_size = 0.40
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
X_train.columns

Index(['DECISION_DATE', 'VISA_CLASS', 'EMPLOYER_STATE', 'JOB_TITLE',
       'SOC_CODE', 'PREVAILING_WAGE', 'NEW_EMPLOYER', 'OCCUPATION', 'state'],
      dtype='object')

In [31]:
# YOUR TRAINING SET CAN HAVE NO NULL VALUES! CHECK THIS HERE:
print(X_train.isnull().sum())

DECISION_DATE      0
VISA_CLASS         0
EMPLOYER_STATE     0
JOB_TITLE          0
SOC_CODE           0
PREVAILING_WAGE    0
NEW_EMPLOYER       0
OCCUPATION         0
state              0
dtype: int64


Encode X_train and X_test to get them ready for Xgboost, as it only works on numeric data. The function pd.get_dummies() is used to encode the categorical values to integers. It will create a transpose of all the categorical values and then map 1 wherever the value is present or 0 if it's not present. You should definitely try at your end to to print the X_train_encode below to check the transpose.

In [32]:
X_train_encode = pd.get_dummies(X_train)
X_test_encode = pd.get_dummies(X_test)

In [33]:
y_train.head()

516415    0
360016    0
168822    0
331333    0
130016    0
Name: CASE_STATUS, dtype: int64

In [34]:
print(X_train_encode)

        PREVAILING_WAGE  DECISION_DATE_1/1/2018  DECISION_DATE_1/10/2018  \
516415          48090.0                       0                        0   
360016         110864.0                       0                        0   
168822          60944.0                       0                        0   
331333          34029.0                       0                        0   
130016          83096.0                       0                        0   
104617          52728.0                       0                        0   
299699         127920.0                       0                        0   
455713          90646.0                       0                        0   
349809          90002.0                       0                        0   
265885          57907.0                       0                        0   
388560          69722.0                       0                        0   
416852          78021.0                       0                        0   
603830      

XGBoost is short term for “Extreme Gradient Boosting”, which is a supervised learning problem. Here you use the training data (with multiple features) x(i) to predict a target variable y(i).

It is an implementation of gradient boosted decision trees designed for speed and performance.

In [35]:
train_X = X_train_encode.as_matrix()
train_y = y_train.as_matrix()

In [36]:
import xgboost
gbm=xgboost.XGBClassifier(max_features='sqrt', subsample=0.8, random_state=10)

## Using GridSearchCV() to tune hyperparameters:

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
parameters = [{'n_estimators': [10, 100]},
              {'learning_rate': [0.1, 0.01, 0.5]}]

In [47]:
grid_search = GridSearchCV(estimator = gbm, param_grid = parameters, scoring='accuracy', cv = 3, n_jobs=-1)

In [44]:
grid_search = grid_search.fit(train_X, train_y)

OSError: [Errno 12] Cannot allocate memory

In [None]:
warnings.filterwarnings("ignore")

grid_search.grid_scores_, grid_search.best_params_, grid_search.best_score_

In [None]:
grid_search.best_estimator_