In [None]:
import numpy as np
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

plt.rcParams["figure.figsize"]=(12,8)

In [None]:
data=pd.read_csv("../input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv")

In [None]:
data.head(1)

In [None]:
data.info()

In [None]:
data.describe().T

In [None]:
data.duplicated().sum()

In [None]:
data.isnull().sum()/len(data)*100

**The department, salary_range, company_profile, requirements, benefits, employment_type, required_experience, required_education, industry and function columns have a very high percentage of missing values.**

In [None]:
sns.heatmap(data.isnull());

In [None]:
# Keep the object-typed columns, leaving out location and salary_range.
excluded = ["location", "salary_range"]
text_data = data.select_dtypes(include="object").drop(columns=excluded)

# Labels of the remaining text columns, reused by the following cells.
text_col = text_data.columns

In [None]:
text_col

**The text_col columns contain missing values, so replace them with empty strings.**

In [None]:
data[text_col]=data[text_col].replace(np.nan,"")

**data["text"] contains all the information provided in the job ad.**

In [None]:
data["text"]=""

In [None]:
# Concatenate every text column into one string per row, each piece preceded
# by a single space. The result is accumulated in a fresh Series instead of
# appending onto the existing data["text"], so re-running this cell does not
# duplicate the text.
combined = pd.Series("", index=data.index)
for col in text_data.columns:
    combined = combined + " " + data[col]
data["text"] = combined

In [None]:
data["text"].iloc[0]

**Creating features that indicate whether each type of information was provided or not**

In [None]:
def fea(text):
    """Return 0 when ``text`` equals the empty string, otherwise 1."""
    return 0 if text == "" else 1

In [None]:
# Replace each text column with its 0/1 "was anything provided" flag.
data[text_col] = data[text_col].apply(lambda column: column.map(fea))

In [None]:
data.isnull().sum()/len(data)*100

In [None]:
data.drop(["salary_range","job_id"],axis=1,inplace=True)

**As the remaining missing values are few (about 2%), we drop those rows.**

In [None]:
data.dropna(axis=0,inplace=True)

**Visualization**

In [None]:
# fraudulent == 1 marks a fraudulent (fake) posting and fraudulent == 0 a
# genuine one. The counts were previously assigned the other way round, which
# mislabelled the pie chart built from `fake` and `real`.
fake = int((data["fraudulent"] == 1).sum())
real = int((data["fraudulent"] == 0).sum())

In [None]:
plt.pie([fake,real],labels=["fake","real"]);

In [None]:
data["text_len"]=data["text"].str.len()

In [None]:
# Overlay the text-length distribution of each class. Labels and a legend are
# added so the two histograms can be told apart.
sns.histplot(x=data[data["fraudulent"]==1]["text_len"],kde=True,label="fraudulent = 1")
sns.histplot(x=data[data["fraudulent"]==0]["text_len"],kde=True,color="red",label="fraudulent = 0")
plt.xlabel("text length (characters)")
plt.legend();

**Fake job postings contain less information than real job postings**

In [None]:
sns.heatmap(data[["telecommuting","has_company_logo","has_questions","fraudulent"]].corr(),annot=True);

**We see that has_company_logo is strongly correlated with the fraudulent column**

In [None]:
sns.countplot(x="telecommuting",hue="fraudulent",data=data);

In [None]:
sns.countplot(x="has_company_logo",hue="fraudulent",data=data);

In [None]:
sns.countplot(x="has_questions",hue="fraudulent",data=data);

In [None]:
features=['title','department', 'company_profile', 'requirements','description',
          'benefits', 'employment_type', 'required_experience',
          'required_education', 'industry', 'function','fraudulent']

In [None]:
sns.heatmap(data[features].corr(),annot=True);

**Company profile information is strongly correlated with the job label, while the other columns do not show a significant correlation, so we drop them.**

In [None]:
drop_col=['title','department', 'description', 'requirements',
          'benefits', 'employment_type', 'required_experience',
          'required_education', 'industry', 'function']

data.drop(drop_col,axis=1,inplace=True)

**Location information: using only country code**

In [None]:
def code(string):
    """Return the first comma-separated field of a location string.

    Parameters
    ----------
    string : str
        Location text; the part before the first comma is returned.

    Returns
    -------
    str
        The leading field with surrounding whitespace removed. An empty
        string is returned when ``string`` is not a ``str`` (for example a
        missing value), so the function can be applied to a column that
        still contains gaps without raising.
    """
    if not isinstance(string, str):
        return ""
    # Strip so that values such as " US" still compare equal to "US" later on.
    return string.split(",")[0].strip()

In [None]:
data["country"]=data["location"].apply(code)

In [None]:
p=data.groupby("country")["country"].count().sort_values(ascending=False)

In [None]:
p[:5]

In [None]:
grp=["US","GB","GR","CA","DE"]

In [None]:
sns.countplot(x="country",hue="fraudulent",data=data[data["country"].isin(grp)],order=grp);

**Considering only US job postings**

In [None]:
# Keep only the US rows and remove the two location columns in one chained
# expression. This avoids an in-place drop on a filtered slice of the original
# frame (the pattern behind SettingWithCopyWarning) and the redundant axis=1
# that was passed together with columns=.
data = data.loc[data["country"] == "US"].drop(columns=["country", "location"])

In [None]:
data.info()

In [None]:
data.reset_index(drop=True,inplace=True)

In [None]:
series=data["fraudulent"].value_counts()

In [None]:
plt.pie(series,labels=series.index); 

**Text Cleaning**

**Importing libraries for cleaning textual data**

In [None]:
import re
import nltk

In [None]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

In [None]:
stop=set(stopwords.words("english"))

In [None]:
data["text"].iloc[0]

In [None]:
# Substitution patterns applied in order by clean(); each match becomes a space.
_CLEAN_PATTERNS = (
    re.compile(r"<.*?>"),         # html tags
    re.compile(r"https?://\S+"),  # urls
    re.compile(r"[^\w\s]"),       # punctuation
    re.compile(r"[\d_]+"),        # digits and underscores
)


def clean(text):
    """Normalise one job-ad text into a space-joined string of stemmed tokens.

    The text is lower-cased; HTML tags, URLs, punctuation, digits and
    underscores are replaced by spaces; the result is split on whitespace;
    single-character tokens and tokens found in the module-level ``stop`` set
    are discarded; the remaining tokens are stemmed with the English Snowball
    stemmer.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    text = text.lower()
    for pattern in _CLEAN_PATTERNS:
        text = pattern.sub(" ", text)

    stemmer = SnowballStemmer("english")
    # Single-character tokens are filtered per token. The earlier regex
    # r"\s\w\s" could not match overlapping occurrences, so in a run such as
    # "x a b y" only every other single character was removed, and single
    # characters at the very start or end of the text were never removed.
    kept = [word for word in text.split() if len(word) > 1 and word not in stop]

    return " ".join(stemmer.stem(word) for word in kept)

In [None]:
data["text"]=data["text"].apply(clean)

In [None]:
data["text"].iloc[0]

**Visualizations**

In [None]:
from wordcloud import WordCloud,STOPWORDS
from collections import defaultdict
from nltk import ngrams

In [None]:
def generate(text,ngram):
    """Tokenize ``text`` and return its n-grams of size ``ngram`` as space-joined strings."""
    tokens = word_tokenize(text)
    return [" ".join(gram) for gram in ngrams(tokens, ngram)]

In [None]:
# NOTE(review): this selects the rows where fraudulent == 1, which does not
# match the variable's name. The name is kept because the next cell reads it.
real_job = data.loc[data["fraudulent"] == 1, "text"].to_numpy()

In [None]:
# Join the documents into one string before building the cloud. str() on a
# NumPy array returns the array's printed representation (brackets, quotes and,
# for long arrays, a "..." elision), not the underlying text.
corpus = " ".join(real_job)

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = STOPWORDS).generate(corpus)

fig = plt.figure(figsize = (30,20))
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off');

In [None]:
# Count unigrams over the texts of the rows where fraudulent == 1.
pos_1 = defaultdict(int)
flagged_docs = data.loc[data["fraudulent"] == 1, "text"]
for doc in flagged_docs:
    for gram in generate(doc, 1):
        pos_1[gram] += 1

# Order by count, highest first, and chart the first ten entries.
ranked = sorted(pos_1.items(), key=lambda item: item[1], reverse=True)
pos = pd.DataFrame(ranked)
plt.barh(pos[0][:10], pos[1][:10])

In [None]:
# Count bigrams over the texts of the rows where fraudulent == 1.
pos_2 = defaultdict(int)
flagged_docs = data.loc[data["fraudulent"] == 1, "text"]
for doc in flagged_docs:
    for gram in generate(doc, 2):
        pos_2[gram] += 1

# Order by count, highest first, and chart the first ten entries.
ranked = sorted(pos_2.items(), key=lambda item: item[1], reverse=True)
pos = pd.DataFrame(ranked)
plt.barh(pos[0][:10], pos[1][:10])

In [None]:
# NOTE(review): this selects the rows where fraudulent == 0, which does not
# match the variable's name. The name is kept because the next cell reads it.
fake_job = data.loc[data["fraudulent"] == 0, "text"].to_numpy()

In [None]:
# Join the documents into one string before building the cloud. str() on a
# NumPy array returns the array's printed representation (brackets, quotes and,
# for long arrays, a "..." elision), not the underlying text.
corpus = " ".join(fake_job)

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = STOPWORDS).generate(corpus)

fig = plt.figure(figsize = (30,20))
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off');

In [None]:
# Count unigrams over the texts of the rows where fraudulent == 0.
neg_1 = defaultdict(int)
unflagged_docs = data.loc[data["fraudulent"] == 0, "text"].values
for doc in unflagged_docs:
    for gram in generate(doc, 1):
        neg_1[gram] += 1

# Order by count, highest first, and chart the first ten entries.
ranked = sorted(neg_1.items(), key=lambda item: item[1], reverse=True)
neg = pd.DataFrame(ranked)
plt.barh(neg[0][:10], neg[1][:10])

In [None]:
# Count bigrams over the texts of the rows where fraudulent == 0.
neg_2 = defaultdict(int)
unflagged_docs = data.loc[data["fraudulent"] == 0, "text"].values
for doc in unflagged_docs:
    for gram in generate(doc, 2):
        neg_2[gram] += 1

# Order by count, highest first, and chart the first ten entries.
ranked = sorted(neg_2.items(), key=lambda item: item[1], reverse=True)
neg = pd.DataFrame(ranked)
plt.barh(neg[0][:10], neg[1][:10])

**Preprocessing**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF settings: word unigrams and bigrams, capped at 15000 features.
tfidf_settings = dict(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 2),
    max_features=15000,
    smooth_idf=True,
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# NOTE(review): the vectorizer is fitted on every row of data["text"], before
# the train/test split performed in a later cell, so test rows influence the
# vocabulary and IDF weights. Confirm whether that is intended.
X = vectorizer.fit_transform(data["text"])

**Dimensionality Reduction**

In [None]:
from sklearn.decomposition import PCA

In [None]:
X.shape

In [None]:
# n_components is given as a fraction (0.95) rather than an integer count.
pca = PCA(n_components=0.95)

# X is converted with toarray() before fitting, and the name X is then rebound
# to the PCA output. NOTE(review): because X is overwritten, re-running this
# cell on its own presumably fails (the new X is unlikely to have toarray()).
X = pca.fit_transform(X.toarray())

In [None]:
X.shape

In [None]:
# Cumulative share of variance explained by the first k PCA components.
cumsum = np.cumsum(pca.explained_variance_ratio_)
x = range(1, len(cumsum) + 1)

# x and y are passed by keyword: seaborn's plotting functions take `data` as
# their only positional argument, so a positional x would be read as the data.
ax = sns.scatterplot(x=list(x), y=cumsum)
ax.set(xlabel="number of components", ylabel="cumulative explained variance");

**Baseline Model**

In [None]:
y = data["fraudulent"]

# Hold out 20% of the rows for testing. stratify=y keeps the proportion of each
# fraudulent class the same in both parts; without it a random split of an
# imbalanced target can leave the minority class under-represented in one part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
# Baseline: fit Gaussian naive Bayes on the training split, then predict the test split.
model = GaussianNB()
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
print(f"Accuracy score: {accuracy_score(y_test,y_pred)}")

In [None]:
print(f"classification_report:\n {classification_report(y_test,y_pred)}")

In [None]:
print(f"confusion_matrix:\n {confusion_matrix(y_test,y_pred)}")

In [None]:
cf_matrix=confusion_matrix(y_test,y_pred)

In [None]:
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot = True)

**Xgboost**

**General Approach for Parameter Tuning**

* Choose a relatively high learning rate. Generally a learning rate of 0.1 works but somewhere between 0.05 to 0.3 should work for different problems. Determine the optimum number of trees for this learning rate. 
* Tune tree-specific parameters ( max_depth, min_child_weight, gamma, subsample, colsample_bytree) for decided learning rate and number of trees. 
* Tune regularization parameters (lambda, alpha) for xgboost which can help reduce model complexity and enhance performance.
* Lower the learning rate and decide the optimal parameters 

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

**Hyperparameter tuning**

In [None]:
# Starting XGBoost configuration: learning rate 0.1 and an upper bound of 1000
# trees. The next cell replaces n_estimators with the value found by xgb.cv.
model=XGBClassifier( learning_rate =0.1,n_estimators=1000,max_depth=5,min_child_weight=1,
                     gamma=0,subsample=0.8,colsample_bytree=0.8,objective= 'binary:logistic',
                     nthread=4, eval_metric="logloss", use_label_encoder=False)

In [None]:
# Reuse the classifier's parameters for the native xgboost API and wrap the
# training split in a DMatrix.
xgb_param = model.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)

early_stopping_rounds=50

# 3-fold cross-validation scored with AUC, running at most n_estimators
# boosting rounds and stopping early after 50 rounds.
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=model.get_params()['n_estimators'], 
                  nfold=3,metrics='auc', early_stopping_rounds=early_stopping_rounds)

# The number of rows in cvresult is used as the number of trees.
model.set_params(n_estimators=cvresult.shape[0])

In [None]:
cvresult.shape[0]

In [None]:
esti=cvresult.shape[0]

**Tree estimator Optimization**

In [None]:
# Grid over tree depth and minimum child weight.
param_test1 = {"max_depth":range(3,10,2),
               "min_child_weight":range(1,6,2)}

# scoring="roc_auc" is set explicitly: without it GridSearchCV falls back to
# the estimator's default score (accuracy for classifiers), which is a poor
# selection criterion on an imbalanced target and differs from the AUC metric
# used by xgb.cv to choose the number of trees.
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=esti, gamma=0,
                        subsample=0.8, colsample_bytree=0.8,objective= 'binary:logistic',
                        eval_metric="logloss", nthread=4, seed=27,use_label_encoder=False),
                        param_grid = param_test1, scoring="roc_auc", n_jobs=2, cv=3)

gsearch1.fit(X_train,y_train)

In [None]:
gsearch1.best_params_

In [None]:
best_param1=gsearch1.best_params_

In [None]:
gsearch1.best_score_

In [None]:
# Grid over row and column subsampling fractions (0.5 to 0.8).
param_test2 = {"subsample":[i/10 for i in range(5,9)],
               "colsample_bytree":[i/10 for i in range(5,9)]}

# scoring="roc_auc" is set explicitly: without it GridSearchCV falls back to
# the estimator's default score (accuracy for classifiers), which is a poor
# selection criterion on an imbalanced target and differs from the AUC metric
# used by xgb.cv to choose the number of trees.
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=esti,
                        objective= 'binary:logistic',eval_metric="logloss",
                        nthread=4, seed=27,use_label_encoder=False, **best_param1),
                        param_grid = param_test2, scoring="roc_auc", n_jobs=2, cv=3)

gsearch2.fit(X_train,y_train)

In [None]:
gsearch2.best_params_

In [None]:
best_param2=gsearch2.best_params_

In [None]:
gsearch2.best_score_

In [None]:
best_param2={**best_param1,**best_param2}

In [None]:
# Grid over gamma (1, 3, 5) and the reg_alpha penalty (0.1 to 0.4).
param_test3 = {"gamma":range(1,6,2),
               "reg_alpha":[i/10 for i in range(1,5)]}

# scoring="roc_auc" is set explicitly: without it GridSearchCV falls back to
# the estimator's default score (accuracy for classifiers), which is a poor
# selection criterion on an imbalanced target and differs from the AUC metric
# used by xgb.cv to choose the number of trees.
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=esti,
                        objective= 'binary:logistic',eval_metric="logloss", nthread=4,
                        seed=27,use_label_encoder=False,**best_param2),
                        param_grid = param_test3, scoring="roc_auc", n_jobs=2, cv=3)

gsearch3.fit(X_train,y_train)

In [None]:
gsearch3.best_params_

In [None]:
best_param3=gsearch3.best_params_

In [None]:
gsearch3.best_score_

In [None]:
best_param3={**best_param2,**best_param3}

In [None]:
# Final configuration: the tuned parameters collected in best_param3, with the
# learning rate lowered to 0.01 and the tree budget raised to 5000.
model=XGBClassifier( learning_rate =0.01,n_estimators=5000,objective= 'binary:logistic',
                     nthread=4, eval_metric="logloss", use_label_encoder=False,**best_param3)

In [None]:
# Reuse the classifier's parameters for the native xgboost API and wrap the
# training split in a DMatrix.
xgb_param = model.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)

early_stopping_rounds=50

# 3-fold cross-validation scored with AUC, running at most n_estimators
# boosting rounds and stopping early after 50 rounds.
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=model.get_params()['n_estimators'], 
                  nfold=3,metrics='auc', early_stopping_rounds=early_stopping_rounds)

# The number of rows in cvresult is used as the number of trees, then the
# model is fitted on the training split and used to predict the test split.
model.set_params(n_estimators=cvresult.shape[0])
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [None]:
print(f"Accuracy score: {accuracy_score(y_test,y_pred)}")

In [None]:
print(f"classification_report:\n {classification_report(y_test,y_pred)}")

In [None]:
print(f"confusion_matrix:\n {confusion_matrix(y_test,y_pred)}")

In [None]:
cf_matrix=confusion_matrix(y_test,y_pred)

In [None]:
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot = True)