In [1]:

!pip install pandas numpy matplotlib seaborn nltk scikit-learn wordcloud




In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")


In [5]:

df = pd.read_csv("fake_job_postings.csv")


df.head()


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [11]:
missing_values = df.isna().sum()  
duplicates = df.duplicated().sum()

In [13]:
df['benefits'].fillna('Not specified', inplace=True)


print(df.dtypes)

job_id                  int64
title                  object
location               object
department             object
salary_range           object
company_profile        object
description            object
requirements           object
benefits               object
telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
function               object
fraudulent              int64
dtype: object


In [20]:

columns_to_clean = ['title', 'location', 'department', 'company_profile', 'description', 
                    'requirements', 'benefits', 'employment_type', 'industry', 'function.']


existing_columns = [col for col in columns_to_clean if col in df.columns]

df[existing_columns] = df[existing_columns].apply(lambda col: col.str.strip().str.lower())

df.head()


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,marketing intern,"us, ny, new york",marketing,,"we're food52, and we've created a groundbreaki...","food52, a fast-growing, james beard award-winn...",experience with content management systems a m...,not specified,0,1,0,other,Internship,,,Marketing,0
1,2,customer service - cloud video production,"nz, , auckland",success,,"90 seconds, the worlds cloud video production ...",organised - focused - vibrant - awesome!do you...,what we expect from you:your key responsibilit...,what you will get from usthrough being part of...,0,1,0,full-time,Not Applicable,,marketing and advertising,Customer Service,0
2,3,commissioning machinery assistant (cma),"us, ia, wever",,,valor services provides workforce solutions th...,"our client, located in houston, is actively se...",implement pre-commissioning and commissioning ...,not specified,0,1,0,,,,,,0
3,4,account executive - washington dc,"us, dc, washington",sales,,our passion for improving quality of life thro...,the company: esri – environmental systems rese...,"education: bachelor’s or master’s in gis, busi...",our culture is anything but corporate—we have ...,0,1,0,full-time,Mid-Senior level,Bachelor's Degree,computer software,Sales,0
4,5,bill review manager,"us, fl, fort worth",,,spotsource solutions llc is a global human cap...,job title: itemization review managerlocation:...,qualifications:rn license in the state of texa...,full benefits offered,0,1,1,full-time,Mid-Senior level,Bachelor's Degree,hospital & health care,Health Care Provider,0


In [22]:
df['salary_range_cleaned'] = df['salary_range'].str.replace(r'[^0-9-]', '', regex=True)


df['salary_range_cleaned'] = pd.to_numeric(df['salary_range_cleaned'], errors='coerce')


print(df['salary_range_cleaned'].describe())

count        6.000000
mean      6650.333333
std      16337.933611
min        -25.000000
25%        -20.000000
50%        -19.000000
75%        -15.750000
max      40000.000000
Name: salary_range_cleaned, dtype: float64


In [24]:
df['salary_range'] = df['salary_range'].str.replace(r'[^0-9]', '', regex=True).astype(float)

# Replace NaN values with 0
df['salary_range'].fillna(0, inplace=True)


df['salary_range'].describe()

count    1.788000e+04
mean     4.754252e+14
std      5.994477e+16
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      8.000000e+18
Name: salary_range, dtype: float64

In [26]:
threshold = df['salary_range'].quantile(0.95)

# Filter the DataFrame to keep only values within the threshold
df = df[df['salary_range'] <= threshold]

df['salary_range'].describe()

count    1.699700e+04
mean     2.745578e+08
std      9.557098e+08
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      5.500065e+09
Name: salary_range, dtype: float64

In [28]:
from sklearn.preprocessing import OneHotEncoder

# Convert all character (string) columns to categorical (factor equivalent)
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')

# One-hot encode categorical variables
encoder = OneHotEncoder(drop=None, sparse=False)  # Drop=None keeps all categories
encoded_array = encoder.fit_transform(df.select_dtypes(include=['category']))

# Create a DataFrame with encoded features
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(df.select_dtypes(include=['category']).columns))

# Drop original categorical columns and concatenate encoded features
df_encoded = pd.concat([df.select_dtypes(exclude=['category']).reset_index(drop=True), encoded_df], axis=1)

df_encoded.head()

Unnamed: 0,job_id,salary_range,telecommuting,has_company_logo,has_questions,fraudulent,salary_range_cleaned,title_$950/week truck drivers needed,title_$950/week. truck drivers needed,title_(assistant) accountant,...,function_Purchasing,function_Quality Assurance,function_Research,function_Sales,function_Science,function_Strategy/Planning,function_Supply Chain,function_Training,function_Writing/Editing,function_nan
0,1,0.0,0,1,0,0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0,1,0,0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0,1,0,0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,0.0,0,1,0,0,,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0,1,1,0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
pip install streamlit


Note: you may need to restart the kernel to use updated packages.


In [36]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample training data
X_train = ["This is a fake job", "Legitimate job posting", "Urgent work from home offer!"]
y_train = [1, 0, 1]  # 1 = Fake, 0 = Real

vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)

model = RandomForestClassifier()
model.fit(X_train_vectors, y_train)

# Save model and vectorizer
with open("fake_job_model.pkl", "wb") as f:
    pickle.dump(model, f)
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)


In [38]:

# Sample training data (replace with real dataset)
X_train = [
    "Earn money fast! No experience needed. Apply now!",  # Fake job
    "We are looking for a software engineer with 3+ years of experience.",  # Real job
    "Make thousands of dollars working from home. Limited time offer!",  # Fake job
    "Hiring data analyst for full-time remote work. Apply with your resume.",  # Real job
]
y_train = [1, 0, 1, 0]  # 1 = Fake, 0 = Real

# **Step 1: Convert Text to Features**
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)

# **Step 2: Train the Model**
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_vectors, y_train)

# **Step 3: Save the trained model and vectorizer**
with open("fake_job_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("✅ Model and vectorizer saved successfully!")


✅ Model and vectorizer saved successfully!


In [40]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


# Assuming 'description' contains job details and 'fraudulent' is the target (1 = Fake, 0 = Real)
df = df[['description', 'fraudulent']].dropna()  # Remove missing values

# **Step 3: Split Data into Features (X) and Labels (y)**
X = df["description"]  # Job descriptions
y = df["fraudulent"]   # Labels (1 = Fake, 0 = Real)

# **Step 4: Train-Test Split**
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Step 5: Convert Text to Features**
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vectors = vectorizer.fit_transform(X_train)

# **Step 6: Train the Model**
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_vectors, y_train)

# **Step 7: Save Model & Vectorizer**
with open("fake_job_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("✅ Model and vectorizer trained and saved successfully!")


✅ Model and vectorizer trained and saved successfully!


In [46]:
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score, classification_report


df = pd.read_csv("fake_job_postings.csv")
df = df[['description', 'fraudulent']].dropna()

X = df["description"]
y = df["fraudulent"]

# **Step 2: Load Model & Vectorizer**
with open("fake_job_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# * Transform Text Data
X_vectors = vectorizer.transform(X)

#  Make Predictions
y_pred = model.predict(X_vectors)

# Evaluate Accuracy
accuracy = accuracy_score(y, y_pred)
print(f"✅ Model Accuracy: {accuracy:.2%}")

# **Step 6: Detailed Report**
print("\n🔹 Classification Report:")
print(classification_report(y, y_pred))


✅ Model Accuracy: 99.17%

🔹 Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     17014
           1       1.00      0.83      0.91       865

    accuracy                           0.99     17879
   macro avg       0.99      0.92      0.95     17879
weighted avg       0.99      0.99      0.99     17879



In [50]:
# Test with a Custom Job Description
job_posting = ["Work from home, earn $5000 weekly! No experience needed."]
job_vector = vectorizer.transform(job_posting)
prediction = model.predict(job_vector)[0]

if prediction == 1:
    print("🚨 Fake Job Detected!")
else:
    print("✅ Real Job Posting.")


✅ Real Job Posting.
