In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import html
import nltk
from nltk import word_tokenize, pos_tag

# word_tokenize relies on the Punkt tokenizer models
nltk.download('punkt')
# pos_tag (used further down to keep only adjectives) needs the perceptron tagger model
# NOTE(review): recent NLTK releases name these resources 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' - confirm against the installed version
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuhao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the job-postings dataset; the path is relative to the notebook's working directory
df = pd.read_csv('../datasets/emscad_v1.csv')

In [3]:
# List the column names of the loaded frame
df.columns

Index(['title', 'location', 'department', 'salary_range', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'in_balanced_dataset'],
      dtype='object')

In [4]:
# Preview the first five rows
df.head(5)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset
0,Marketing Intern,"US, NY, New York",Marketing,,"<h3>We're Food52, and we've created a groundbr...","<p>Food52, a fast-growing, James Beard Award-w...",<ul>\r\n<li>Experience with content management...,,f,t,f,Other,Internship,,,Marketing,f,f
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"<h3>90 Seconds, the worlds Cloud Video Product...",<p>Organised - Focused - Vibrant - Awesome!<br...,<p><b>What we expect from you:</b></p>\r\n<p>Y...,<h3><b>What you will get from us</b></h3>\r\n<...,f,t,f,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,f,f
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,<h3></h3>\r\n<p>Valor Services provides Workfo...,"<p>Our client, located in Houston, is actively...",<ul>\r\n<li>Implement pre-commissioning and co...,,f,t,f,,,,,,f,f
3,Account Executive - Washington DC,"US, DC, Washington",Sales,,<p>Our passion for improving quality of life t...,<p><b>THE COMPANY: ESRI – Environmental System...,<ul>\r\n<li>\r\n<b>EDUCATION: </b>Bachelor’s o...,<p>Our culture is anything but corporate—we ha...,f,t,f,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,f,f
4,Bill Review Manager,"US, FL, Fort Worth",,,<p>SpotSource Solutions LLC is a Global Human ...,<p><b>JOB TITLE:</b> Itemization Review Manage...,<p><b>QUALIFICATIONS:</b></p>\r\n<ul>\r\n<li>R...,<p>Full Benefits Offered</p>,f,t,t,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,f,f


### Distribution of null values: fraudulent postings vs. the whole dataset

In [5]:
# Number of null values per column, restricted to columns that have at least one,
# listed from most to fewest missing
(
    df.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

salary_range           15012
department             11547
required_education      8105
benefits                7196
required_experience     7050
function                6455
industry                4903
employment_type         3471
company_profile         3308
requirements            2689
location                 346
dtype: int64

In [6]:
# Same null count, but only over the rows labelled fraudulent ("t")
(
    df.loc[df["fraudulent"] == "t"]
    .isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

salary_range           643
company_profile        587
department             531
required_education     451
required_experience    435
benefits               363
function               337
industry               275
employment_type        241
requirements           153
location                19
dtype: int64

In [None]:
# For every column that has missing values: the percentage of those missing
# values that belong to fraudulent postings (fraudulent nulls / all nulls * 100)
total = df.isnull().sum().loc[lambda counts: counts > 0]
# Reindex against `total` instead of filtering by > 0, so that a column whose
# nulls all come from non-fraudulent postings reports 0% rather than NaN
fraud = df.loc[df["fraudulent"]=="t"].isnull().sum().reindex(total.index)
f = (fraud/total*100).sort_values(ascending=False)
f

In [None]:
# Horizontal bar chart of the values in `f`, one bar per column;
# every bar is annotated with its value rounded to two decimals
plt.figure(figsize=(10,5))
for bar_position, percentage in enumerate(f):
    plt.text(percentage, bar_position, f"{round(percentage, 2)}%")
plt.barh(f.index, f)
plt.title("Percentage of null values in fraudulent out of total job postings")
plt.xlabel("Percentage")
plt.ylabel("Columns")
plt.show()


### Word cloud of fraudulent job-posting descriptions

In [None]:
# Description text of every posting labelled fraudulent ("t"),
# selected in a single .loc call (row mask + column label)
fraud_df_desc = df.loc[df["fraudulent"] == "t", "description"]
fraud_df_desc

In [None]:
def remove_html_tags_and_escape_chars(input_text):
    """Strip HTML markup from ``input_text`` and decode HTML character references.

    Parameters
    ----------
    input_text : str
        Raw HTML fragment. Any non-string value (for example the float NaN
        pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The text content, with the text of neighbouring elements separated
        by a single space.
    """
    # Missing cells are not strings and are not valid markup input
    if not isinstance(input_text, str):
        return ''

    # Remove HTML tags. Without a separator, get_text() glues the text of
    # adjacent elements together ("<li>a</li><li>b</li>" -> "ab"), which
    # produces bogus merged words once the text is tokenised.
    text_without_html = BeautifulSoup(input_text, 'html.parser').get_text(separator=' ')

    # Unescape any HTML character references that are still present
    text_without_escape_chars = html.unescape(text_without_html)

    return text_without_escape_chars

# Run the HTML cleaner over every fraudulent description
fraud_df_desc = fraud_df_desc.apply(remove_html_tags_and_escape_chars)

In [None]:
# Inspect the descriptions after HTML removal
fraud_df_desc

In [None]:
def remove_non_alpha(input_text):
    """Return ``input_text`` with every character that is neither a letter
    nor whitespace replaced by a single space."""
    cleaned_chars = []
    for char in input_text:
        keep = char.isalpha() or char.isspace()
        cleaned_chars.append(char if keep else ' ')
    return ''.join(cleaned_chars)
def remove_newlines(text):
    """Replace every line break in ``text`` with a single space.

    Line breaks are replaced rather than deleted so that the last word of one
    line and the first word of the next stay separate words. A Windows
    ``\\r\\n`` pair counts as one line break.
    """
    return text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
# Blank out non-letter characters first, then strip the line breaks
fraud_df_desc = (
    fraud_df_desc
    .apply(remove_non_alpha)
    .apply(remove_newlines)
)


In [None]:
# Lower-case every description
fraud_df_desc = fraud_df_desc.str.lower()

In [None]:
# Tokenise each (lower-cased) description and keep only tokens
# that are at least three characters long
fraud_df_desc = fraud_df_desc.apply(
    lambda text: [token for token in word_tokenize(text.lower()) if len(token) >= 3]
)
fraud_df_desc

In [None]:
# Display the token lists again (same Series as shown by the previous cell)
fraud_df_desc

In [None]:
# Flatten all token lists into one string for the word cloud: the tokens of a
# description are space-separated and each description ends with one space.
# A single join avoids rebuilding the string on every loop iteration.
fraud_words = "".join(" ".join(tokens) + " " for tokens in fraud_df_desc)


In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Fix random_state so the word placement is the same on every run
wordcloud = WordCloud(width=800, height=400, background_color='white', random_state=42).generate(fraud_words)

In [None]:
# Draw the generated word cloud as an image with the axes hidden
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
plt.show()

Use part-of-speech (POS) tagging to filter the words further, keeping only adjectives.

In [None]:
def only_adj(x):
    """Return the words of the token list ``x`` that ``pos_tag`` labels with
    one of the tags ``JJ``, ``JJR`` or ``JJS``, in their original order."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    adjectives = []
    for token, tag in pos_tag(x):
        if tag in adjective_tags:
            adjectives.append(token)
    return adjectives

In [None]:
# Reduce every description to its adjectives, then discard the descriptions
# that end up with no adjectives at all
df_adj = fraud_df_desc.apply(only_adj)
df_adj = df_adj[df_adj.apply(len) > 0]

In [None]:
# Flatten the adjective lists into one string for the word cloud: the words of
# a description are space-separated and each description ends with one space.
# (The bare `df_adj` expression that used to open this cell displayed nothing,
# because only a cell's last expression is rendered.)
fraud_words = "".join(" ".join(adjectives) + " " for adjectives in df_adj)


In [None]:
# Build the word cloud from the current `fraud_words` string; random_state is
# fixed so the word placement is the same on every run
wordcloud = WordCloud(width=800, height=400, background_color='white', random_state=42).generate(fraud_words)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Turn off axis labels
plt.show()

### Number of fraudulent job postings per state

In [None]:
# Location and label of every posting that has a location. Taking an explicit
# copy makes df_loc an independent frame, so assigning columns to it later does
# not raise SettingWithCopyWarning and can never touch `df`.
df_loc = df.loc[df["location"].notnull(), ["location", "fraudulent"]].copy()

In [None]:
# Split each location string on ', ' and keep only the rows
# that yield exactly three parts
df_loc = df_loc.assign(location=df_loc['location'].str.split(', '))
df_loc = df_loc[df_loc['location'].str.len() == 3]

In [None]:
# Spread the list stored in `location` over three new columns; the helper frame
# reuses df_loc's index so the values stay attached to their original rows
df_loc[['Country', 'State', 'City']] = pd.DataFrame(df_loc['location'].tolist(), index=df_loc.index)

In [None]:
# Keep the three location columns and the label; the raw `location` column is dropped
df_loc = df_loc.loc[:, ["Country", "State", "City", "fraudulent"]]

In [None]:
# Treat empty strings as missing, then drop every row that has a missing value.
# Chained instead of inplace=True: df_loc is derived from another frame, and an
# in-place replace on it can raise SettingWithCopyWarning.
df_loc = df_loc.replace('', np.nan).dropna()

In [None]:
# Encode the label as an integer: 1 when the posting is fraudulent ("t"), 0 otherwise.
# Vectorised comparison instead of a per-row lambda; assign() returns a new frame,
# so nothing is written into a possibly-flagged copy.
df_loc = df_loc.assign(fraudulent=df_loc["fraudulent"].eq("t").astype("int64"))

In [None]:
# Number (not percentage) of fraudulent job postings per US state.
# Only rows whose Country is "US" are counted: region codes of other countries
# can coincide with US state codes and would otherwise be added to that
# state's total on the USA map.
fraudulent_counts = (
    df_loc[df_loc["Country"] == "US"]
    .groupby("State")["fraudulent"]
    .sum()
    .reset_index()
)

In [None]:
# Plot a choropleth map with plotly express: each entry of the `State` column
# is matched as a US state and shaded by its value in the `fraudulent` column
# of `fraudulent_counts`.
import plotly.express as px
fig = px.choropleth(fraudulent_counts, locations="State",
                    locationmode="USA-states",
                    color="fraudulent",
                    scope="usa",
                    title="Number of fraudulent job postings per state",
                    color_continuous_scale=px.colors.sequential.OrRd,
                    labels={'fraudulent':'Number of fraudulent job postings'},
                    template="plotly_white",
                    width=1000,
                    height=600)
# Overlay the per-state value as text: a scatter_geo figure is built from the
# same frame and only its first trace (.data[0]) is added to the choropleth
fig.add_trace(px.scatter_geo(fraudulent_counts,
                            locations='State',
                            text='fraudulent',
                            locationmode='USA-states',
                            color="fraudulent",
                            ).data[0])

fig.show()
