## Design
In this exploration, the goal is to refine the input data by combining multiple free-text fields into a single bag-of-words vectorization and adding the categorical attributes as features. The plan is then to compare resampling techniques and different models on the improved data.

In [1]:
# importing modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# loading the raw job-postings dataset from the local CSV file
fake_job_df = pd.read_csv(filepath_or_buffer="fake_job_postings.csv")
# previewing the first five rows
fake_job_df.head(n=5)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [3]:
# vectorizing the data as a bag-of-words over several free-text fields.
# The fields are joined with a single space: plain string concatenation would
# glue the last word of one field to the first word of the next (e.g.
# "...teamWe're..."), producing bogus tokens and losing both real words.
# Missing fields are treated as empty strings so a NaN in one field does not
# wipe out the whole combined text.
combined_text = (
    fake_job_df["description"].fillna("")
    + " "
    + fake_job_df["company_profile"].fillna("")
    + " "
    + fake_job_df["requirements"].fillna("")
)

# keep only the 500 most frequent tokens across the corpus
vectorizer = CountVectorizer(max_features=500)
word_counts = vectorizer.fit_transform(combined_text)

# dense token-count frame, one column per retained token; it reuses the
# source frame's index so later index-based merges line up row for row
X = pd.DataFrame(
    word_counts.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=fake_job_df.index,
)

In [4]:
# starting the feature frame from the ready-to-use 0/1 flag columns plus the
# target label ("fraudulent"), then attaching the token counts in X.
# (The original `df = right=...` was a chained assignment that also bound a
# stray global named `right`; only `df` is intended.)
df = fake_job_df[["has_company_logo", "has_questions", "telecommuting", "fraudulent"]]

# rows are matched on the index, so X must share fake_job_df's row index
df = df.merge(right=X, left_index=True, right_index=True)
df.head()

Unnamed: 0,has_company_logo,has_questions,telecommuting,fraudulent,10,30,ability,able,about,access,...,work,working,world,would,writing,written,year,years,you,your
0,1,0,0,0,0,0,0,0,1,0,...,2,2,0,0,0,0,0,0,0,0
1,1,0,0,0,0,2,1,1,2,0,...,3,3,3,0,0,0,0,0,3,2
2,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,3,2
3,1,0,0,0,0,0,1,0,0,0,...,2,0,4,0,0,1,1,1,3,3
4,1,1,0,0,0,0,0,1,0,0,...,1,1,0,0,0,0,0,1,1,3


In [5]:
# one-hot encoding the multiclass attributes that need no further cleaning
multiclass_columns = ["employment_type", "required_experience", "required_education"]

# bring the raw columns in by index, then expand each one into integer dummy columns
df = df.merge(right=fake_job_df[multiclass_columns], left_index=True, right_index=True)
df = pd.get_dummies(df, columns=multiclass_columns, dtype=int)
df.head()

Unnamed: 0,has_company_logo,has_questions,telecommuting,fraudulent,10,30,ability,able,about,access,...,required_education_Doctorate,required_education_High School or equivalent,required_education_Master's Degree,required_education_Professional,required_education_Some College Coursework Completed,required_education_Some High School Coursework,required_education_Unspecified,required_education_Vocational,required_education_Vocational - Degree,required_education_Vocational - HS Diploma
0,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,2,1,1,2,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# remaining columns are reduced to a presence flag: 1 if the posting filled
# the field in, 0 if it is null
has_columns = ["location", "company_profile", "salary_range", "benefits"]
presence_flags = {
    f"has_{field}": fake_job_df[field].notna().astype(int)
    for field in has_columns
}
df = df.assign(**presence_flags)

df.head()

Unnamed: 0,has_company_logo,has_questions,telecommuting,fraudulent,10,30,ability,able,about,access,...,required_education_Some College Coursework Completed,required_education_Some High School Coursework,required_education_Unspecified,required_education_Vocational,required_education_Vocational - Degree,required_education_Vocational - HS Diploma,has_location,has_company_profile,has_salary_range,has_benefits
0,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
1,1,0,0,0,0,2,1,1,2,0,...,0,0,0,0,0,0,1,1,0,1
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,1
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,1


In [7]:
# saving the final data
# NOTE(review): no `index` argument is passed, so pandas' default applies and
# the row index is written as an extra, unnamed leading column; anything that
# reads this file back should account for it (e.g. `index_col=0`) — confirm
# against the downstream notebooks before changing.
df.to_csv("fake_jobs_processed.csv")

In [8]:
# sanity check of the processed frame: row count, column count, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Columns: 533 entries, has_company_logo to has_benefits
dtypes: int64(533)
memory usage: 72.7 MB
