In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% }</style>"))

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('fake_job_postings.csv')
data.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [4]:
for column in data.columns:
    print('{} has {} different elements'.format(column, len(set(data[column]))))

job_id has 17880 different elements
title has 11231 different elements
location has 3106 different elements
department has 1338 different elements
salary_range has 875 different elements
company_profile has 1710 different elements
description has 14802 different elements
requirements has 11969 different elements
benefits has 6206 different elements
telecommuting has 2 different elements
has_company_logo has 2 different elements
has_questions has 2 different elements
employment_type has 6 different elements
required_experience has 8 different elements
required_education has 14 different elements
industry has 132 different elements
function has 38 different elements
fraudulent has 2 different elements


In [5]:
total_fraudulent = (data.fraudulent == 1).sum()
for field in data.columns:
    nan_field = data[data[field].isna()]
    fraudulent_cnt = (nan_field.fraudulent == 1).sum()
    print('{} stats: {} / {}, {}% of all'.format(field, (nan_field.fraudulent == 1).sum(), nan_field.shape[0], int(100 * fraudulent_cnt / total_fraudulent)))

job_id stats: 0 / 0, 0% of all
title stats: 0 / 0, 0% of all
location stats: 19 / 346, 2% of all
department stats: 531 / 11547, 61% of all
salary_range stats: 643 / 15012, 74% of all
company_profile stats: 587 / 3308, 67% of all
description stats: 1 / 1, 0% of all
requirements stats: 154 / 2695, 17% of all
benefits stats: 364 / 7210, 42% of all
telecommuting stats: 0 / 0, 0% of all
has_company_logo stats: 0 / 0, 0% of all
has_questions stats: 0 / 0, 0% of all
employment_type stats: 241 / 3471, 27% of all
required_experience stats: 435 / 7050, 50% of all
required_education stats: 451 / 8105, 52% of all
industry stats: 275 / 4903, 31% of all
function stats: 337 / 6455, 38% of all
fraudulent stats: 0 / 0, 0% of all


In [6]:
test_size = 0.2
y = data['fraudulent']
del data['fraudulent']
X = data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

In [7]:
def add_noise(data, noise_level):
    return data * (1 + noise_level * np.random.randn(*data.shape))

In [8]:
def target_encode(X_train, X_test, y_train, min_samples_leaf=1, smoothing=1, noise_level=0):
    temp = pd.concat([X_train, y_train], axis=1)
    averages = temp.groupby(by=X_train.name)[y_train.name].agg(["mean", "count"])
    mean = y_train.mean()
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    averages[y_train.name] = mean * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    ft_x_train = pd.merge(
        X_train.to_frame(X_train.name),
        averages.reset_index().rename(columns={'index': y_train.name, y_train.name: 'average'}),
        on=X_train.name,
        how='left')['average'].rename(X_train.name + '_mean').fillna(mean)
    ft_x_train.index = X_train.index 
    ft_x_test = pd.merge(
        X_test.to_frame(X_test.name),
        averages.reset_index().rename(columns={'index': y_train.name, y_train.name: 'average'}),
        on=X_test.name,
        how='left')['average'].rename(X_train.name + '_mean').fillna(mean)
    ft_x_test.index = X_test.index
    return add_noise(ft_x_train, noise_level), add_noise(ft_x_test, noise_level)

In [9]:
# Replace categorical features by target mean
categorical_columns = ['department', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']
for column in categorical_columns:
    X_train[column].fillna('nan_category')
    X_test[column].fillna('nan_category')
    X_train[column], X_test[column] = target_encode(X_train[column], X_test[column], y_train, 10, 10, 0.05)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
# replace complicated features with indicator of their existance
complicated_columns = ['location', 'company_profile', 'salary_range', 'description', 'requirements', 'benefits']
for column in complicated_columns:
    X_train[column] = X_train[column].isna()
    X_test[column] = X_test[column].isna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [11]:
delete_columns = ['job_id', 'title']
for column in delete_columns:
    del X_train[column]
    del X_test[column]

In [12]:
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, precision_score, f1_score, confusion_matrix

In [23]:
# Fit random forest
model = RandomForestClassifier(n_estimators=1000, criterion='entropy', max_depth=10, n_jobs=2)

In [None]:
model = model.fit(X_train, y_train)
preds = model.predict(X_test)
print('after: precision = {:<3}'.format(precision_score(y_test, preds)))
print('confusion matrix:')
print(confusion_matrix(y_test, preds))