# Step 1 - TRAIN

In [11]:
import os

import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score

#working with text
from sklearn.feature_extraction.text import TfidfVectorizer

#normalizing data
from sklearn.preprocessing import StandardScaler

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets

In [12]:
# Load the job-postings dataset from a CSV file in the working directory.
df = pd.read_csv("fake_job_postings.csv")
# Preview the first three rows.
df.head(3)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0


In [13]:
# Class balance of the target column.
df['fraudulent'].value_counts()

fraudulent
0    17014
1      866
Name: count, dtype: int64

### Разделим данные на train/test и сохраним обе выборки на диск

In [14]:
# Stratified split: both parts keep the same share of fraudulent postings.
# NOTE: the whole frame (target column included) is passed as X, so the exported
# X_*.csv files also contain the 'fraudulent' column.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['fraudulent'],
    test_size=0.33,
    random_state=42,
    stratify=df['fraudulent'],
)

# to_csv does not create missing directories; make sure the output folder exists
# so the cell also works on a fresh checkout.
os.makedirs("data", exist_ok=True)

# save test
X_test.to_csv("data/X_test.csv", index=False)
y_test.to_csv("data/y_test.csv", index=False)

# save train
X_train.to_csv("data/X_train.csv", index=False)
y_train.to_csv("data/y_train.csv", index=False)

In [15]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that extracts one column from the incoming data so that
    later steps can work on that column alone.

    Parameters
    ----------
    key : label of the column returned by ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        selected = X[self.key]
        return selected


class TextImputer(BaseEstimator, TransformerMixin):
    """
    Fill missing values of a single column with a constant.

    Parameters
    ----------
    key : label of the column whose missing values are filled.
    value : replacement used for missing entries of that column.
    """

    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` in which missing values of column ``key`` are
        replaced by ``value``.

        The caller's frame is left untouched: writing into ``X`` directly would
        silently modify the data passed to ``fit``/``predict`` and can trigger
        pandas' SettingWithCopyWarning when ``X`` is a slice of another frame.
        """
        X_filled = X.copy()
        X_filled[self.key] = X_filled[self.key].fillna(self.value)
        return X_filled

In [16]:
# Names of the text columns and of the label column of the dataset.
features = ['description', 'company_profile', 'benefits', 'requirements']
target = 'fraudulent'

### Соберем кусок, ответственный за feature engineering

In [17]:
def make_text_pipeline(column):
    """
    Build the processing branch for one text column.

    Steps: fill missing values of ``column`` with an empty string, select that
    column, then vectorize it with a default ``TfidfVectorizer``.
    """
    return Pipeline([
        ('imputer', TextImputer(column, '')),
        ('selector', ColumnSelector(key=column)),
        ('tfidf', TfidfVectorizer()),
    ])


# One identical branch per text column (previously four copy-pasted definitions).
description = make_text_pipeline('description')
company_profile = make_text_pipeline('company_profile')
benefits = make_text_pipeline('benefits')
requirements = make_text_pipeline('requirements')

# combine: FeatureUnion concatenates the outputs of all branches
feats = FeatureUnion([('description', description),
                      ('company_profile', company_profile),
                      ('benefits', benefits),
                      ('requirements', requirements)])

### Добавим классификатор

In [18]:
%%time

pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression()),
])

pipeline.fit(X_train, y_train)

CPU times: total: 3.77 s
Wall time: 3.76 s


### Посмотрим, как выглядит pipeline

In [19]:
# Inspect the (name, estimator) steps of the pipeline.
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('description',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='description',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='description')),
                                                  ('tfidf', TfidfVectorizer())])),
                                 ('company_profile',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='company_profile',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='company_profile')),
                                                  ('tfi

### Сохраним модель (пайплайн)

In [20]:
# open() does not create missing directories; make sure the target folder exists
# so the cell also works on a fresh checkout.
os.makedirs("models", exist_ok=True)

# Serialize the fitted pipeline with dill.
with open("models/logreg_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)

# Step 2 - PREDICT

### Проверка работоспособности и качества пайплайна

In [21]:
# Reload the test split from disk; this rebinds X_test / y_test from the split cell.
# Note: read_csv returns a DataFrame, so y_test is a one-column frame here, not a Series.
X_test = pd.read_csv("data/X_test.csv")
y_test = pd.read_csv("data/y_test.csv")

In [22]:
# Preview the first three rows of the reloaded test features.
X_test.head(3)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1511,Marketing Representative,"US, CA, Sacramento",,,Tidewater Finance Co. was established in 1992 ...,Financial Services Company is seeking a full-t...,This position requires the following qualifica...,Our company offers a competitive salary as wel...,0,1,0,Full-time,Associate,Unspecified,Financial Services,Marketing,0
1,17531,Casual job/Immediate start,"AU, ,",,,,Looking for work?We are looking for a self m...,- No experience necessary - Training provided,- GREAT TEAM CULTURE AND IMMEDIATE START!!!,0,0,0,,,,,,1
2,10898,Office Manager - Professional Multi-tasker,"US, ,",,40000-47000,Airenvy’s mission is to provide lucrative yet ...,Who is Airenvy?Hey there! We are seasoned entr...,High-School diploma Extremely organizedProcess...,Competitive Pay. You'll be able to eat steak e...,0,1,1,Full-time,Entry level,High School or equivalent,Internet,Administrative,0


In [24]:
# Restore the pipeline saved in Step 1. dill, like pickle, can execute arbitrary
# code while loading, so only load files you trust.
with open('models/logreg_pipeline.dill', 'rb') as model_file:
    pipeline = dill.load(model_file)

In [25]:
# Display the loaded pipeline.
pipeline

In [26]:
# predict_proba returns one column per class; column 1 is kept as the score.
probabilities = pipeline.predict_proba(X_test)
preds = probabilities[:, 1]

# Persist the scores as a single-column CSV without the index.
pred_df = pd.DataFrame(data={'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [27]:
# Peek at the first ten predicted scores.
preds[:10]

array([0.01947227, 0.32992821, 0.02001141, 0.02618238, 0.00356344,
       0.00249822, 0.02481659, 0.00263951, 0.00316769, 0.00364648])

In [28]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# precision/recall have one more element than thresholds: the final
# (precision=1, recall=0) point has no threshold, so it is left out of the search
# (otherwise thresholds[ix] could raise IndexError).
precision_at_thr = precision[:-1]
recall_at_thr = recall[:-1]
pr_sum = precision_at_thr + recall_at_thr

# Where precision + recall == 0 the F-score is set to 0. A plain division would
# produce NaN there, and np.argmax would pick that NaN as the "largest" score.
fscore = np.divide(
    2 * precision_at_thr * recall_at_thr,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.20669968796447724, F-Score=0.853, Precision=0.875, Recall=0.832
