In [2]:
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import seaborn as sns
from wordcloud import WordCloud
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score,plot_confusion_matrix,classification_report,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Load the job-postings dataset from a CSV file located next to the notebook.
df = pd.read_csv('./fake_job_data.csv')
# Bare expression: renders the frame as the cell output.
df

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()


In [None]:
# Preview the last rows of the freshly loaded frame.
df.tail()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics of the frame (pandas describes numeric columns by default).
df.describe()

In [None]:
df.isnull().sum()  # Count the missing (null) values in each column of the dataset.

In [None]:
# Remove the identifier / flag / salary columns that the rest of the notebook does not use.
df = df.drop(
    columns=[
        "job_id",
        "telecommuting",
        "has_company_logo",
        "has_questions",
        "salary_range",
        "employment_type",
    ]
)

In [None]:
# Preview the frame after the column drop above.
df.head()


In [None]:
# Replace every missing value with an empty string so the text columns can be concatenated later.
df = df.fillna("")

In [None]:
# Re-count nulls per column after the fill above.
df.isnull().sum()


In [None]:
# Class balance: number of postings for each value of the `fraudulent` label.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, y='fraudulent', ax=ax)
plt.show()

In [None]:
# Number of rows for each value of the `fraudulent` label.
df.groupby('fraudulent')['fraudulent'].count()

In [None]:
# Postings per required-experience level, as a {label: count} dict.
exp = dict(df.required_experience.value_counts())
# Missing values were filled with "" earlier, so a blank label may be present.
# pop() with a default removes it without raising KeyError when it is absent.
exp.pop('', None)
exp

In [None]:
# Bar chart of postings per experience level (built from the `exp` dict).
experience_labels = list(exp.keys())
experience_counts = list(exp.values())
plt.figure(figsize=(10, 5))
plt.title("Jobs by Experience", fontsize=25)
plt.bar(experience_labels, experience_counts)

In [None]:
# Display the current state of the frame.
df

In [None]:
def split(location):
    """Return the part of ``location`` that precedes its first comma.

    When ``location`` contains no comma the whole string is returned.
    """
    head, _, _ = location.partition(',')
    return head
# Derive a `country` column: the text before the first comma of each `location`.
df['country'] = df['location'].apply(split)

In [None]:
# Preview the frame with the new `country` column.
df.head()

In [None]:
# Ten most frequent `country` values as a {country: count} dict.
country = dict(df.country.value_counts()[:10])
# Drop the blank label (rows with no location). pop() with a default avoids the
# KeyError that `del` raises when the blank label is not among the top ten.
country.pop('', None)
country

In [None]:
# Bar chart of the most frequent countries (built from the `country` dict).
country_names = list(country.keys())
posting_counts = list(country.values())
plt.bar(country_names, posting_counts)
plt.title("Country-wise Job Postings")

In [None]:
# Six most frequent `required_education` values as a {label: count} dict.
edu = dict(df.required_education.value_counts()[:6])
# Drop the blank label (missing education). pop() with a default avoids the
# KeyError that `del` raises when the blank label is not among the top six.
edu.pop('', None)
edu

In [None]:
# Bar chart of postings per education level (built from the `edu` dict).
education_labels = list(edu.keys())
education_counts = list(edu.values())
plt.figure(figsize=(15, 7))
plt.title('Jobs posting by education', fontsize=25)
plt.bar(education_labels, education_counts, color="lightblue")
plt.xlabel("Education", fontsize=15)
plt.ylabel('Jobs', fontsize=15)

In [None]:
# The 20 most common titles among genuine (fraudulent == 0) postings.
genuine_titles = df.loc[df['fraudulent'] == 0, 'title']
print(genuine_titles.value_counts().head(20))

In [None]:
# The 20 most common titles among fraudulent (fraudulent == 1) postings.
fraudulent_titles = df.loc[df['fraudulent'] == 1, 'title']
print(fraudulent_titles.value_counts().head(20))

In [None]:
# Concatenate the free-text columns, separated by single spaces, into one `text` column.
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
combined_text = df[text_columns[0]]
for text_column in text_columns[1:]:
    combined_text = combined_text + ' ' + df[text_column]
df['text'] = combined_text

In [None]:

# Snapshot of the frame (including the new `text` column) taken before the
# source columns are deleted from `df` in the next cell.
df2 = df.copy()

In [None]:
# Drop every column that is no longer needed now that `text` has been built.
df = df.drop(
    columns=[
        "title",
        "location",
        "department",
        "company_profile",
        "description",
        "requirements",
        "benefits",
        "required_experience",
        "required_education",
        "industry",
        "function",
        "country",
    ]
)

In [None]:
# Preview the first rows of the reduced frame.
df.head()

In [None]:
# Preview the last rows of the reduced frame.
df.tail()

In [None]:
# (rows, columns) of the reduced frame.
df.shape

In [None]:
# Split the combined text by label: 1 marks fraudulent postings, 0 marks real ones.
is_fraud = df['fraudulent'] == 1
is_real = df['fraudulent'] == 0
fraudjobs_text = df.loc[is_fraud, 'text']
realjobs_text = df.loc[is_real, 'text']

In [None]:
# Word cloud of the vocabulary used in fraudulent postings.
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS
plt.figure(figsize=(20,15))
# Join postings with a space: joining with "" fuses the last word of one
# posting with the first word of the next and distorts the word frequencies.
fraud_corpus = " ".join(fraudjobs_text)
wc = WordCloud(min_font_size = 3,max_words = 2000, width = 1600, height = 800, stopwords=STOPWORDS).generate(fraud_corpus)
plt.imshow(wc,interpolation = "bilinear")  # fraud job keywords

In [None]:
# Word cloud of the vocabulary used in real postings.
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS
plt.figure(figsize=(20,15))
# Join postings with a space: joining with "" fuses the last word of one
# posting with the first word of the next and distorts the word frequencies.
real_corpus = " ".join(realjobs_text)
wc = WordCloud(min_font_size = 3,max_words = 2000, width = 1600, height = 800, stopwords=STOPWORDS).generate(real_corpus)
plt.imshow(wc,interpolation = "bilinear")  # real job keywords

In [None]:
!pip install spacy && python -m spacy download en

In [None]:
# Filters applied by spacy_tokenizer: ASCII punctuation characters and spaCy's English stop words.
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Blank English pipeline used by spacy_tokenizer to split text into tokens.
parser  = English()

# Small English pipeline; requires the en_core_web_sm package to be installed.
nlp  = spacy.load("en_core_web_sm")

def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` and return a list of cleaned token strings.

    The text is parsed with the module-level ``parser``. Each token is
    replaced by its lower-cased, stripped lemma; when the lemma is the
    "-PRON-" placeholder or is empty, the lower-cased surface form is used
    instead. Tokens found in ``stop_words`` or in ``punctuations`` are dropped.

    Args:
        sentence: The text to tokenize.

    Returns:
        list[str]: The remaining normalized tokens, in document order.
    """
    doc = parser(sentence)
    # Use `lemma_` (the string); `lemma` is an integer id and has no .lower().
    # NOTE(review): a blank pipeline may leave `lemma_` empty, hence the
    # fallback to the surface form. Confirm against the installed spaCy version.
    mytokens = [
        word.lemma_.lower().strip()
        if word.lemma_ and word.lemma_ != "-PRON-"
        else word.lower_
        for word in doc
    ]
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]
    return mytokens

class predictors(TransformerMixin):
    """Stateless transformer that applies ``clean_text`` to every input item."""

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self,X,**transform_params):
        """Return a list holding ``clean_text(item)`` for each item of ``X``."""
        return list(map(clean_text, X))

    def get_params(self,deep = True):
        """Report no parameters: always an empty dict."""
        return {}
    
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed, then lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()


In [None]:
# Normalize every posting's text (trim + lower-case) before vectorizing.
df['text'] = df['text'].map(clean_text)

In [None]:
# Turn the posting text into the 100 highest-ranked TF-IDF features.
cv = TfidfVectorizer(max_features=100)
x = cv.fit_transform(df['text'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
# Reuse df's index so the column-wise concat below lines rows up exactly.
df1 = pd.DataFrame(x.toarray(), columns=feature_names, index=df.index)
# Drop `text` on a copy instead of in place: df keeps its column, so this
# cell can be re-run without a KeyError on df['text'].
main_df = pd.concat([df1, df.drop(columns=["text"])], axis=1)

In [None]:
# Display the combined TF-IDF feature / label frame.
main_df


In [None]:
# The label is the last column of main_df; every other column is a feature.
Y = main_df.iloc[:,-1]
X = main_df.iloc[:,:-1]

# Fix the seed so the split (and every score derived from it) is reproducible,
# and stratify on the label so both splits keep the same class proportions.
x_train,x_test,y_train,y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0, stratify=Y
)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier  
from sklearn import svm

In [None]:
# Fit the five candidate classifiers on the training split.
lg_model = LogisticRegression(random_state=0).fit(x_train,y_train)
dtc_model = DecisionTreeClassifier(random_state=0).fit(x_train,y_train)
# Seed the forest like the other models; without random_state its score
# changes from run to run.
rfc_model = RandomForestClassifier(n_jobs=3,oob_score=True,n_estimators=100,criterion="entropy",random_state=0).fit(x_train,y_train)
svm_model = svm.SVC().fit(x_train,y_train)
knn_model = KNeighborsClassifier(n_neighbors=3).fit(x_train,y_train)

In [None]:
# Predict the held-out split with every fitted model, in the same order as before.
lg_pred, dtc_pred, rfc_pred, svm_pred, knn_pred = (
    fitted_model.predict(x_test)
    for fitted_model in (lg_model, dtc_model, rfc_model, svm_model, knn_model)
)

In [None]:
# Logistic regression: accuracy, confusion matrix and per-class report on the test split.
target_names = ["Real-0", "Fake-1"]
lg_score = accuracy_score(y_test, lg_pred)
lg_confusion = confusion_matrix(y_test, lg_pred)
lg_report = classification_report(y_test, lg_pred, target_names=target_names)
print("Logistic Regression Accuracy: ", lg_score * 100)
print("Confusion Matrix: ", lg_confusion)
print(lg_report)

In [None]:
# Decision tree: accuracy, confusion matrix and per-class report on the test split.
target_names = ["Real-0", "Fake-1"]
dtc_score = accuracy_score(y_test, dtc_pred)
dtc_confusion = confusion_matrix(y_test, dtc_pred)
dtc_report = classification_report(y_test, dtc_pred, target_names=target_names)
print("Decision Tree Accuracy: ", dtc_score * 100)
print("Confusion Matrix: ", dtc_confusion)
print(dtc_report)

In [None]:
# Random forest: accuracy, confusion matrix and per-class report on the test split.
target_names = ["Real-0", "Fake-1"]
rfc_score = accuracy_score(y_test, rfc_pred)
rfc_confusion = confusion_matrix(y_test, rfc_pred)
rfc_report = classification_report(y_test, rfc_pred, target_names=target_names)
print("Random Forest Classifier Accuracy: ", rfc_score * 100)
print("Confusion Matrix: ", rfc_confusion)
print(rfc_report)

In [None]:
# SVM: accuracy, confusion matrix and per-class report on the test split.
target_names = ["Real-0", "Fake-1"]
svm_score = accuracy_score(y_test, svm_pred)
svm_confusion = confusion_matrix(y_test, svm_pred)
svm_report = classification_report(y_test, svm_pred, target_names=target_names)
print("Support Vector Machines (SVM) Accuracy: ", svm_score * 100)
print("Confusion Matrix: ", svm_confusion)
print(svm_report)

In [None]:
# KNN: accuracy, confusion matrix and per-class report on the test split.
target_names = ["Real-0", "Fake-1"]
knn_score = accuracy_score(y_test, knn_pred)
knn_confusion = confusion_matrix(y_test, knn_pred)
knn_report = classification_report(y_test, knn_pred, target_names=target_names)
print("K-Nearest Neighbor (KNN) Accuracy: ", knn_score * 100)
print("Confusion Matrix: ", knn_confusion)
print(knn_report)

In [None]:
# Side-by-side accuracy (in percent) of the five models on the test split.
# Fixes the "Accracy" typo in the printed labels.
print("Logistic Regression Accuracy: " , lg_score*100)
print("Decision Tree Accuracy: " , dtc_score*100)
print("Random Forest Accuracy: " , rfc_score*100)
print("Support Vector Machines (SVM) Accuracy: " , svm_score*100)
print("KNN Accuracy: " , knn_score*100)

### The KNN algorithm gives the highest prediction accuracy, at almost 97.42%.

#### So, judged by accuracy, the model trained with the KNN algorithm is the best model for this dataset.