# Capstone: Fake job posting - Processing and Model 

In [11]:
#import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import statistics as s
from sklearn.metrics import confusion_matrix 

In [23]:
df = pd.read_csv('fake_job_posting_new.csv')

In [24]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,...,requirement_required,requirement_communiction,requirement_team,requirement_management,requirement_degree,requirement_working,requirement_strong,requirement_development,requirement_customer,total_requirements_keyword_count
0,0,1,0,1,0,Other,Internship,,Marketing,0,...,0,1,0,1,0,2,0,0,0,8
1,1,2,0,1,0,Full-time,Not Applicable,,Marketing,0,...,0,1,1,3,0,1,0,0,2,20


In [25]:
#drop unwanted columns
df = df.drop(['Unnamed: 0'], axis=1)

In [26]:
df.head(2)

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,min_salary,...,requirement_required,requirement_communiction,requirement_team,requirement_management,requirement_degree,requirement_working,requirement_strong,requirement_development,requirement_customer,total_requirements_keyword_count
0,1,0,1,0,Other,Internship,,Marketing,0,0,...,0,1,0,1,0,2,0,0,0,8
1,2,0,1,0,Full-time,Not Applicable,,Marketing,0,0,...,0,1,1,3,0,1,0,0,2,20


In [27]:
#categorical data
categorical_cols = ['employment_type','required_experience','required_education','function'] 

#import pandas as pd
df = pd.get_dummies(df, columns = categorical_cols)

In [28]:
df.head(2)

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent,min_salary,max_salary,desc_word_count,clean_desc_word_count,benefits_word_count,...,function_Creative,function_Engineering,function_Finance,function_General Business,function_Human Resources,function_Information Technology,function_Manufacturing,function_Marketing,function_None,function_Others
0,1,0,1,0,0,0,0,124,83,1,...,0,0,0,0,0,0,0,1,0,0
1,2,0,1,0,0,0,0,309,190,225,...,0,0,0,0,0,0,0,1,0,0


In [29]:
df.dtypes

job_id                             int64
telecommuting                      int64
has_company_logo                   int64
has_questions                      int64
fraudulent                         int64
                                   ...  
function_Information Technology    uint8
function_Manufacturing             uint8
function_Marketing                 uint8
function_None                      uint8
function_Others                    uint8
Length: 94, dtype: object

In [30]:
df.astype('int64').dtypes

job_id                             int64
telecommuting                      int64
has_company_logo                   int64
has_questions                      int64
fraudulent                         int64
                                   ...  
function_Information Technology    int64
function_Manufacturing             int64
function_Marketing                 int64
function_None                      int64
function_Others                    int64
Length: 94, dtype: object

### Modelling 

#### setting variables

In [31]:
# setting X and y variables
y = df['fraudulent'].values
X = df.drop('fraudulent', axis=1).values

#### Setting train and test data 

In [32]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(13410, 93) (13410,)
(4470, 93) (4470,)


#### Model 1: Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix

lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
        
# accuracy scores
print( 'Training Set Accuracy Score: ', lr.score(X_train, y_train))
print('Test Set Accuracy Score: ', lr.score(X_test, y_test))
    
# classification metrics
print('Classification Metrics \n')
print(classification_report(y_test, y_pred))

Training Set Accuracy Score:  0.9515287099179717
Test Set Accuracy Score:  0.9514541387024609
Classification Metrics 

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      4254
           1       0.00      0.00      0.00       216

    accuracy                           0.95      4470
   macro avg       0.48      0.50      0.49      4470
weighted avg       0.91      0.95      0.93      4470



#### Model 2: K-Nearest Neighbors (KNN)

In [34]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [35]:
y_pred = knn.predict(X_test)

In [36]:
# accuracy scores
print('Training Set Accuracy Score: ', knn.score(X_train, y_train))
print('Test Set Accuracy Score: ', knn.score(X_test, y_test))
    
# classificatin report
print('Classification Metrics \n')
print(classification_report(y_test, y_pred))

Training Set Accuracy Score:  0.9734526472781506
Test Set Accuracy Score:  0.96331096196868
Classification Metrics 

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4254
           1       0.72      0.40      0.51       216

    accuracy                           0.96      4470
   macro avg       0.84      0.70      0.75      4470
weighted avg       0.96      0.96      0.96      4470



#### Model 3: Decision Tree

In [37]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
    
    
# accuracy scores
print('Training Set Accuracy Score: ', decision_tree.score(X_train, y_train))
print('Test Set Accuracy Score: ', decision_tree.score(X_test, y_test))

# classification report
print('Classification Metrics \n')
print(classification_report(y_test, y_pred))

Training Set Accuracy Score:  1.0
Test Set Accuracy Score:  0.9715883668903803
Classification Metrics 

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4254
           1       0.69      0.74      0.72       216

    accuracy                           0.97      4470
   macro avg       0.84      0.86      0.85      4470
weighted avg       0.97      0.97      0.97      4470



#### Model 4: Support Vector Machine

In [38]:
from sklearn import svm
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)

# accuracy scores
print('Training Set Accuracy Score: ', svm_clf.score(X_train, y_train))
print('Test Set Accuracy Score: ', svm_clf.score(X_test, y_test))

# classification report
print('Classification Metrics \n')
print(classification_report(y_test, y_pred))

Training Set Accuracy Score:  0.9515287099179717
Test Set Accuracy Score:  0.9516778523489933
Classification Metrics 

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      4254
           1       0.00      0.00      0.00       216

    accuracy                           0.95      4470
   macro avg       0.48      0.50      0.49      4470
weighted avg       0.91      0.95      0.93      4470



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Model 5: Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
    
    
# accuracy scores
print('Training Set Accuracy Score: ', rf.score(X_train, y_train))
print('Test Set Accuracy Score: ', rf.score(X_test, y_test))
    
# classification report
print('Classification Metrics \n')
print(classification_report(y_test, y_pred))

Training Set Accuracy Score:  1.0
Test Set Accuracy Score:  0.9823266219239374
Classification Metrics 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4254
           1       0.99      0.64      0.78       216

    accuracy                           0.98      4470
   macro avg       0.99      0.82      0.88      4470
weighted avg       0.98      0.98      0.98      4470



#### Mdoel 6: Gradient Boosting Classfier

In [40]:
from sklearn.ensemble import GradientBoostingClassifier
gradient_booster = GradientBoostingClassifier()
gradient_booster.fit(X_train,y_train)
y_pred = gradient_booster.predict(X_test)


# accuracy scores
print('Training Set Accuracy Score: ', gradient_booster.score(X_train, y_train))
print('Test Set Accuracy Score: ', gradient_booster.score(X_test, y_test))
    
# classification report
print('Classification Metrics \n')
print(classification_report(y_test, y_pred))

Training Set Accuracy Score:  0.9777777777777777
Test Set Accuracy Score:  0.9704697986577181
Classification Metrics 

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      4254
           1       0.95      0.41      0.57       216

    accuracy                           0.97      4470
   macro avg       0.96      0.71      0.78      4470
weighted avg       0.97      0.97      0.96      4470

