# About this Dataset

**[Real or Fake] : Fake Job Description Prediction 
<br>
This dataset contains 18K job descriptions out of which about 800 are fake. The data consists of both textual information and meta-information about the jobs. The dataset can be used to create classification models which can learn the job descriptions which are fraudulent.**


**Download Data From**
https://www.kaggle.com/shivamb/real-or-fake-fake-jobposting-prediction

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import time
import re
import string
import nltk
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text
from keras.utils import np_utils
from keras.models import Sequential
from sklearn import preprocessing
from tensorflow import keras
import warnings
warnings.filterwarnings("ignore")

# Dataset Loader

In [2]:
path1 ="fake_job_postings.csv"
class DataFrame_Loader():

    
    def __init__(self):
        
        
        pass
        
    def load_data_files(self,path1):
        dftrain = pd.read_csv(path1)
        return dftrain

# Data Preprocessing

In [3]:
class DataFrame_Preprocessor():

    
    def __init__(self):
        
        
        pass
        
        
    def Preprocessor(self,data):
        
        data = data[["title", "company_profile", "description", "requirements", "benefits","fraudulent"]]
        data = data.fillna(' ')
        xdf = data[data.columns[0:-1]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
        target = data['fraudulent']
        return xdf, target
    
    def clean_text(self,text):
        '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
        and remove words containing numbers.'''
        text = text.lower()
        text = re.sub('\[.*?\]', '', text)
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub('\w*\d\w*', '', text)
        return text
    
    def removeNumbers(self,text):
        """ Removes integers """
        text = ''.join([i for i in text if not i.isdigit()])         
        return text
    
    def replaceMultiExclamationMark(self,text):
        """ Replaces repetitions of exlamation marks """
        text = re.sub(r"(\!)\1+", ' multiExclamation ', text)
        return text

    def replaceMultiQuestionMark(self,text):
        """ Replaces repetitions of question marks """
        text = re.sub(r"(\?)\1+", ' multiQuestion ', text)
        return text

    def replaceMultiStopMark(self,text):
        """ Replaces repetitions of stop marks """
        text = re.sub(r"(\.)\1+", ' multiStop ', text)
        return text
    
    def replaceContraction(self,text):
        
        contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
                         (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
        patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
        for (pattern, repl) in tqdm(patterns):
            (text, count) = re.subn(pattern, repl, text)
        return text
    
    def __replace(self,word, pos=None):
        
        """ Creates a set of all antonyms 
        for the word and if there is only 
        one antonym, it returns it """
        
        antonyms = set()
        for synset in wordnet.synsets(tqdm(word, pos=pos)):
            for lemma in synset.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None
        
    def __replaceNegations(self,text):
        
        """ Finds "not" and antonym for 
        the next word and if found, replaces 
        not and the next word with the antonym """
        
        i, l = 0, len(text)
        words = []
        while i < l:
            word = text[i]
            if word == 'not' and i+1 < l:
                ant = self.__replace(text[i+1])
                if ant:
                    words.append(ant)
                    i += 2
                    continue
            words.append(word)
            i += 1
        return words
    
    def tokenize1(self,text):
        finalTokens = []
        tokens = nltk.word_tokenize(text)
        tokens = self.__replaceNegations(tokens)
        for w in tqdm(tokens):
            if (w not in stoplist):
                finalTokens.append(w)
        text = " ".join(finalTokens)
        return text
    
    def stem_words(self,text):
        
        stemmer = PorterStemmer()
        return " ".join([stemmer.stem(word) for word in text.split()])
    
    
class Preprocessor_Execution():

    
    def __init__(self):
        
        
        pass
        
        
    def Execute_Preprocessor(self,df):
        
        preprocess = DataFrame_Preprocessor()
        
        xdf, target = preprocess.Preprocessor(df)
        
        text = xdf.apply(lambda x: preprocess.clean_text(x))
        
        text = xdf.apply(lambda x: preprocess.removeNumbers(x))
        
        text = xdf.apply(lambda x: preprocess.replaceMultiExclamationMark(x))
        
        text = xdf.apply(lambda x: preprocess.replaceMultiQuestionMark(x))
        
        text = xdf.apply(lambda x: preprocess.replaceMultiStopMark(x))
        
        text = xdf.apply(lambda x: preprocess.replaceContraction(x))
        
        text = xdf.apply(lambda x: preprocess.tokenize1(x))
        
        text = xdf.apply(lambda x: preprocess.stem_words(x))
        
        return text,target

# Train Test Splitter

In [4]:
class Train_test_Splitter():
    """
    Split Data Into train and test set
    """
    
    def __init__(self):
        
        
        pass
        
        
    def Split(self,text,target):
        
        return train_test_split(text, target, test_size=0.2, random_state=4, stratify=target)

# Tokenization

In [5]:
class Keras_Tokenizer():

    
    def __init__(self,max_features):
        
        self.max_features =6000
        
        
        
        
        
    def __label_encoding(self,y_train):
        """
        Encode the given list of class labels
        :y_train_enc: returns list of encoded classes
        :labels: actual class labels
        """
        lbl_enc = LabelEncoder()

        y_train_enc = lbl_enc.fit_transform(y_train)
        labels = lbl_enc.classes_

        return y_train_enc, labels
    
    
    
    def __word_embedding(self,train, test, max_features, max_len=200):
        
        
        try:
            """ Keras Tokenizer class object """
            tokenizer = text.Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(train)

            train_data = tokenizer.texts_to_sequences(train)
            test_data = tokenizer.texts_to_sequences(test)

            """ Get the max_len """
            vocab_size = len(tokenizer.word_index) + 1

            """ Padd the sequence based on the max-length """
            x_train = sequence.pad_sequences(train_data, maxlen=max_len, padding='post')
            x_test = sequence.pad_sequences(test_data, maxlen=max_len, padding='post')
            """ Return train, test and vocab size """
            return tokenizer, x_train, x_test, vocab_size
        except ValueError as ve:
            raise(ValueError("Error in word embedding {}".format(ve)))
            
            
    def preprocess(self,X_train, X_test):
        
        tokenizer,x_pad_train, x_pad_valid, vocab_size = self.__word_embedding(X_train, X_test, self.max_features)
    
        return tokenizer,x_pad_train, x_pad_valid, vocab_size

# Glove Embedding

In [7]:
class Glove_Vectors_Loader():

    
    def __init__(self,FileNmae,Mode,encoding):
        self.FileName = 'glove.6B.200d.txt'
        self.Mode = 'r'
        self.encoding = 'cp437'
        
        
        
    def load_Glove_Vectors(self):
        embeddings_index={}
        with open(self.FileName ,self.Mode ,encoding =self.encoding ) as f:
            for line in tqdm(f):
                values=line.split()
                word=values[0]
                vectors=np.asarray(values[1:],'float32')
                embeddings_index[word]=vectors
        f.close()
        return embeddings_index
    
    def __sent2vec(self,s):
        words = str(s).lower()
        words = word_tokenize(words)
        words = [w for w in words if not w in stoplist]
        words = [w for w in words if w.isalpha()]
        M = []
        for w in words:
            try:
                M.append(embeddings_index[w])
            except:
                continue
        M = np.array(M)
        v = M.sum(axis=0)
        if type(v) != np.ndarray:
            return np.zeros(200)
        return v / np.sqrt((v ** 2).sum())
    
    def Fit_Transform(self,X_train,X_test):
        
        xtrain_glove = np.array([self.__sent2vec(x) for x in tqdm(X_train)])
        xtest_glove = np.array([self.__sent2vec(x) for x in tqdm(X_test)])
        return xtrain_glove,xtest_glove
    
class Glove_Vectors_Execution():

    
    def __init__(self):
        
        
        pass
        
        
    def Execute_Glove_Vectors(self,X_train,X_test):
        

        GL = Glove_Vectors_Loader('glove.6B.200d.txt','r','cp437')
        
        embeddings_index = GL.load_Glove_Vectors()
        
        xtrain_glove,xtest_glove  = (lambda x,y: GL.Fit_Transform(x,y))(X_train,X_test)
        
        return embeddings_index,xtrain_glove,xtest_glove

# Bidirectional lstm RNN Architecture

In [8]:
class RNN_Bidirectional_lstm_Build_Pack():

    
    def __init__(self,
                 input_length,
                 output_length,
                 vocab_size,
                 optimizer,
                 loss,
                 metrics,
                 batch_size,
                 epochs,
                 verbose):
        
        self.input_length =200
        self.output_length= 200
        self.vocab_size = 95708
        self.optimizer = 'adam'
        self.loss = 'binary_crossentropy'
        self.metrics = ['acc']
        self.batch_size = 256
        self.epochs = 10
        self.verbose = 1
        
        
        

        
    
    def build_rnn(self,vocab_size,output_dim, input_dim):

        model = Sequential([
            keras.layers.Embedding(self.vocab_size,output_dim = self.output_length,
                                  input_length = self.input_length),
            keras.layers.BatchNormalization(),
            keras.layers.Bidirectional(keras.layers.LSTM(256,return_sequences=True)),
            keras.layers.GlobalMaxPool1D(),
            keras.layers.Dense(225),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(150),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(95),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(64),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(34),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(32),
            keras.layers.Dense(output_dim, activation='sigmoid')
        ])

        return model
    
    
    def Compile_and_Fit(self,x_pad_train,y_train,x_pad_valid,y_test,rnn_model):
        
        try:
    
            rnn_model.compile(optimizer=self.optimizer, loss=self.loss, metrics=self.metrics)


            rnn_model.fit(x_pad_train, 
                                    y_train,
                                    batch_size=self.batch_size,
                                   epochs=self.epochs,
                                   verbose= self.verbose)

            score = rnn_model.evaluate(x_pad_valid, y_test, verbose=1)

            print("Loss:%.3f Accuracy: %.3f" % (score[0], score[1]))

            return rnn_model
        
        except ValueError as Model_Error:
            raise(ValueError("Model Compiling Error {}".format(Model_Error)))

# End  Pipeline

In [9]:
class End_Pipeline_Excecution():
    
    def __init__(self):
        
        
        print("End_Pipeline_Excecution Object Created")
        
        
    def Execute_Fulll_Pipeline(self):
        
        load = DataFrame_Loader()
        PE = Preprocessor_Execution()
        TS = Train_test_Splitter()
        KT = Keras_Tokenizer(6000)
        GVE  = Glove_Vectors_Execution()
        Rnn_Model = RNN_Bidirectional_lstm_Build_Pack(200,200,95708,'adam','binary_crossentropy',['acc'],256,10,1)
        df = load.load_data_files(path1)
        print("DataFrame Shape",df.shape)
        text,target = PE.Execute_Preprocessor(df)
        X_train, X_test, y_train, y_test = TS.Split(text,target)
        print("After Train Test Split",X_train.shape, X_test.shape, y_train.shape, y_test.shape)
        tokenizer, x_pad_train, x_pad_valid, vocab_size = KT.preprocess(X_train, X_test)
        print("After Tokenization",tokenizer, x_pad_train.shape, x_pad_valid.shape, vocab_size)
        embeddings_index,xtrain_glove,xtest_glove  = GVE.Execute_Glove_Vectors(x_pad_train,x_pad_valid)
        print("After Glove Embedding",len(embeddings_index),xtrain_glove.shape,xtest_glove.shape)
        rnn_model = Rnn_Model.build_rnn(vocab_size,1,200)
        print(rnn_model.summary())
        rnn_model = Rnn_Model.Compile_and_Fit(x_pad_train,y_train,x_pad_valid,y_test,rnn_model)
        return tokenizer,rnn_model,x_pad_valid,y_test
    
End = End_Pipeline_Excecution()
tokenizer,rnn_model,x_pad_valid,y_test = End.Execute_Fulll_Pipeline()

End_Pipeline_Excecution Object Created
DataFrame Shape (17880, 18)
After Train Test Split (14304,) (3576,) (14304,) (3576,)


1705it [00:00, 16926.25it/s]

After Tokenization <keras_preprocessing.text.Tokenizer object at 0x00000226B06CC790> (14304, 200) (3576, 200) 112655


400000it [00:24, 16617.51it/s]
100%|███████████████████████████████████████████████████████████████████████████| 14304/14304 [00:20<00:00, 711.48it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 3576/3576 [00:05<00:00, 709.16it/s]


After Glove Embedding 400000 (14304, 200) (3576, 200)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 200)          19141600  
_________________________________________________________________
batch_normalization (BatchNo (None, 200, 200)          800       
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 512)          935936    
_________________________________________________________________
global_max_pooling1d (Global (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 225)               115425    
_________________________________________________________________
dropout (Dropout)            (None, 225)               0         
_________________________________________________________________
de

# Prediction

In [10]:
y_preds = rnn_model.predict(x_pad_valid)

print("y_preds Shape ::",y_preds.shape)


for arr in y_preds:
    for i in range(len(arr)):
        if arr[i]>0.5:
            arr[i] = 1
        else:
            arr[i] = 0

            
y_preds = y_preds.astype('int32')

pred_df = pd.DataFrame(y_preds, columns=['pred'])

print(pred_df.shape)
pred_df.head()

print(pred_df.value_counts())

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(metrics.accuracy_score(y_test, pred_df))
        
print(metrics.confusion_matrix(y_test, pred_df))
        
print(metrics.classification_report(y_test, pred_df))

y_preds Shape :: (3576, 1)
(3576, 1)
pred
0       3437
1        139
dtype: int64
0.9832214765100671
[[3390   13]
 [  47  126]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3403
           1       0.91      0.73      0.81       173

    accuracy                           0.98      3576
   macro avg       0.95      0.86      0.90      3576
weighted avg       0.98      0.98      0.98      3576

