In [10]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import re
from IPython.display import display, Markdown
import ipywidgets as widgets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import streamlit as st

In [2]:
# Load the datasets.
# DATA_DIR is the single place to edit when the CSV files live somewhere else;
# its default is the directory the notebook was originally run from.
DATA_DIR = '/home/brandon/Fake Job'

Fake_Job_Postings = pd.read_csv(f'{DATA_DIR}/fake_job_postings.csv')
Job_Postings = pd.read_csv(f'{DATA_DIR}/job_train.csv')

In [4]:
# Preview the first five rows of the fake-job-postings frame.
Fake_Job_Postings.head(n=5)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [5]:
# Preview the first five rows of the job-postings training frame.
Job_Postings.head(n=5)

Unnamed: 0,title,location,description,requirements,telecommuting,has_company_logo,has_questions,fraudulent
0,Architect (Middleware - MQ) - Kuwait,"KW, KU,","On behalf of our client, a well known multinat...",-Working technical knowledge of IT systems and...,0,1,0,0
1,Interviewing Now for Sales Rep Positions -- wi...,"US, TX, Corpus Christi","We are Argenta Field Solutions, a rapidly expa...",#NAME?,0,1,0,0
2,Process Controls Staff Engineer - Foxboro I/A ...,"US, TX, USA Southwest",Experienced Process Controls Staff Engineer is...,At least 10 years of degreed professional expe...,0,0,0,0
3,Experienced Telemarketer Wanted - Digital Solu...,"AU, NSW,",If you have a passion for people and love to s...,"Responsibilities - Prospecting, following up a...",0,1,0,0
4,Senior Network Engineer,"GB, ENG, London",As the successful Senior Network Engineer you ...,Essential skills:•Juniper switching/routing/se...,0,1,0,0


In [6]:
# Load the pretrained BERT tokenizer and a two-label sequence classifier.
# NOTE: this call creates the classification head from scratch (see the
# "newly initialized" warning it prints), so predictions stay uninformative
# until the model has been fine-tuned on a labelled task.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=2)

# Keywords that are looked up in a job description to flag tech-related content.
tech_indicators = [
    'python', 'java', 'javascript', 'aws', 'azure',
    'docker', 'kubernetes', 'react', 'angular', 'vue',
    'node.js', 'sql', 'nosql', 'tensorflow', 'pytorch',
    'scikit-learn', 'pandas', 'machine learning', 'deep learning', 'api',
    'rest', 'graphql', 'devops', 'agile', 'scrum',
    'blockchain', 'cybersecurity', 'iot', 'big data', 'hadoop',
    'spark', 'git', 'jenkins', 'linux', 'bash',
]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def text_cleaning(text):
    """Normalise raw text into lowercase alphanumeric words.

    Removal happens in a fixed order: URLs, then angle-bracket tags, then
    every character that is not an ASCII letter, a digit or whitespace.
    The remainder is lowercased, whitespace runs are collapsed to a single
    space, and leading/trailing whitespace is stripped.
    """
    stripped = text
    # Order matters: URLs and tags must go before their punctuation is removed.
    for pattern in (r'http\S+', r'<.*?>', r'[^a-zA-Z0-9\s]'):
        stripped = re.sub(pattern, '', stripped)
    return re.sub(r'\s+', ' ', stripped.lower()).strip()

def preprocess_text(texts):
    """Clean each string in ``texts`` and tokenize the batch for BERT.

    The cleaned strings are passed to the global ``tokenizer`` with padding
    and truncation enabled (``max_length=512``); PyTorch tensors are returned.
    """
    cleaned_batch = list(map(text_cleaning, texts))
    encoding = tokenizer(
        cleaned_batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    return encoding

def analyze(text):
    """Classify a job description and render the verdict as Markdown.

    The text is cleaned and tokenized by ``preprocess_text`` and scored by
    the global ``model``. The report is emitted with ``display(Markdown(...))``;
    when the call happens inside a ``with results:`` block (as the button
    handler does) the rendering lands in that Output widget. Assigning to
    ``results.value`` showed nothing, because an Output widget has no
    ``value`` trait.

    Args:
        text: Raw job description.

    Returns:
        Tuple ``(genuine_prob, fake_prob, indicators_present)``; both
        probabilities are percentages, the last item lists matched keywords.
    """
    with torch.no_grad():
        inputs = preprocess_text([text])
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        # Assumes label index 1 means "fake" and index 0 "genuine" (matching
        # the datasets' `fraudulent` column) — TODO confirm after fine-tuning.
        fake_prob = probabilities[:, 1].item() * 100
        genuine_prob = probabilities[:, 0].item() * 100

    # Match whole words only: plain substring tests reported 'java' for
    # 'javascript' or 'rest' for 'interest'. Keywords go through the same
    # cleaning as the text so 'node.js' / 'scikit-learn' can match at all.
    padded_text = f" {text_cleaning(text)} "
    indicators_present = [
        word for word in tech_indicators
        if f" {text_cleaning(word)} " in padded_text
    ]

    if fake_prob > 50:
        feedback = (
            "**Warning**: This job description is likely fake.\n\n"
            "- Check salary against industry norms.\n"
            "- Verify company existence.\n"
            "- Look for detailed technical requirements."
        )
    else:
        keywords = ', '.join(indicators_present) if indicators_present else 'none'
        feedback = (
            "**Success**: This job description is likely genuine.  \n"
            f"Detected keywords: {keywords}"
        )

    # Two trailing spaces before "\n" force a Markdown line break.
    report = (
        f"**Probability of being Genuine**: {genuine_prob:.2f}%  \n"
        f"**Probability of being Fake**: {fake_prob:.2f}%\n\n{feedback}"
    )
    display(Markdown(report))
    return genuine_prob, fake_prob, indicators_present

In [8]:
# Interactive controls: a free-text input, a trigger button and an output pane.
text_area = widgets.Textarea(value='', placeholder='Enter a tech job description',
                             description='Text:', disabled=False)
analyze_button = widgets.Button(description="Analyze")
results = widgets.Output()

# Button event
def on_button_clicked(b):
    """Analyze the current text-area content when the button is pressed.

    The output pane is cleared first. Empty or whitespace-only input is not
    sent to the model; a short prompt is printed into the pane instead.

    Args:
        b: The clicked button, as passed in by ipywidgets (unused).
    """
    description = text_area.value.strip()
    with results:
        results.clear_output()
        if not description:
            print("Please enter a job description to analyze.")
            return
        analyze(description)

analyze_button.on_click(on_button_clicked)

In [9]:
# Render the heading, then the input box, the button and the output pane.
app_heading = Markdown("### Tech Job Ad Authenticity Checker")
display(app_heading, text_area, analyze_button, results)

### Tech Job Ad Authenticity Checker

Textarea(value='', description='Text:', placeholder='Enter a tech job description')

Button(description='Analyze', style=ButtonStyle())

Output()

In [14]:
# Streamlit app setup
import asyncio

# Initialize tokenizer and model
# Streamlit re-executes the whole script on every interaction, so the load is
# wrapped in st.cache_resource (when this Streamlit version provides it) to
# keep one tokenizer/model pair alive across reruns instead of reloading BERT.
_cache_resource = getattr(st, 'cache_resource', lambda func: func)


@_cache_resource
def _load_tokenizer_and_model(checkpoint='bert-base-uncased'):
    """Return the ``(tokenizer, model)`` pair loaded from ``checkpoint``.

    NOTE: the classification head is newly initialised by this call (see the
    warning it prints), so predictions stay uninformative until fine-tuning.
    """
    return (
        BertTokenizer.from_pretrained(checkpoint),
        BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2),
    )


tokenizer, model = _load_tokenizer_and_model()

# Tech-specific indicators
tech_indicators = [
    'python', 'java', 'javascript', 'aws', 'azure', 'docker', 'kubernetes',
    'react', 'angular', 'vue', 'node.js', 'sql', 'nosql', 'tensorflow', 'pytorch',
    'scikit-learn', 'pandas', 'machine learning', 'deep learning', 'api', 'rest',
    'graphql', 'devops', 'agile', 'scrum', 'blockchain', 'cybersecurity', 'iot',
    'big data', 'hadoop', 'spark', 'git', 'jenkins', 'linux', 'bash'
]

def text_cleaning(text):
    """Reduce raw text to lowercase alphanumeric words separated by single spaces.

    URLs are dropped first, then angle-bracket tags, then any character that
    is not an ASCII letter, digit or whitespace; finally the text is
    lowercased, whitespace runs are collapsed and the ends are trimmed.
    """
    without_urls = re.sub(r'http\S+', '', text)
    without_tags = re.sub(r'<.*?>', '', without_urls)
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_tags)
    collapsed = re.sub(r'\s+', ' ', alnum_only.lower())
    return collapsed.strip()

def preprocess_text(texts):
    """Run ``text_cleaning`` over ``texts`` and tokenize the result for BERT.

    Returns whatever the global ``tokenizer`` produces for the cleaned batch
    when asked for PyTorch tensors with padding and truncation at 512 tokens.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    cleaned = []
    for raw in texts:
        cleaned.append(text_cleaning(raw))
    return tokenizer(cleaned, **tokenizer_options)

def analyze(text):
    """Score a job description and report which tech keywords it mentions.

    NOTE: this definition shares its name with the widget-oriented ``analyze``
    defined earlier in the notebook; running this cell rebinds the name.

    Args:
        text: Raw job description.

    Returns:
        Tuple ``(genuine_prob, fake_prob, indicators_present)``; both
        probabilities are percentages, the last item lists matched keywords.
    """
    # Inference only: skip autograd bookkeeping, as the earlier analyze() does.
    with torch.no_grad():
        inputs = preprocess_text([text])
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        # Assumes label index 1 means "fake" and index 0 "genuine" (matching
        # the datasets' `fraudulent` column) — TODO confirm after fine-tuning.
        fake_prob = probabilities[:, 1].item() * 100
        genuine_prob = probabilities[:, 0].item() * 100

    # Match whole words only: plain substring tests reported 'java' for
    # 'javascript' or 'git' for 'digital'. Keywords go through the same
    # cleaning as the text so 'node.js' / 'scikit-learn' can match at all.
    padded_text = f" {text_cleaning(text)} "
    indicators_present = [
        word for word in tech_indicators
        if f" {text_cleaning(word)} " in padded_text
    ]
    return genuine_prob, fake_prob, indicators_present

# Helper that runs an event loop; presumably meant as a thread target, but no
# caller is visible in this notebook — TODO confirm it is still needed.
def run_asyncio_loop(loop):
    """Call ``loop.run_forever()``; this blocks the caller until the loop is stopped."""
    loop.run_forever()

# Streamlit app setup
st.title('Tech Job Ad Authenticity Checker')
text = st.text_area("Enter a tech job description:", height=300)

if st.button('Analyze'):
    # Whitespace-only input carries nothing to analyze, so treat it as empty.
    if text and text.strip():
        genuine_prob, fake_prob, indicators_present = analyze(text)
        st.metric("Probability of being Genuine", f"{genuine_prob:.2f}%")
        st.metric("Probability of being Fake", f"{fake_prob:.2f}%")

        if fake_prob > 50:
            st.error("Warning: This job description is likely fake. Key indicators to review:")
            st.write("- Check if salary is unusually high for listed requirements.")
            st.write("- Verify company existence through tech-specific databases.")
            st.write("- Absence of detailed technical requirements might indicate a scam.")
        else:
            st.success("This job description is likely genuine. Key tech terms detected:")
            if indicators_present:
                st.write(f"- {', '.join(indicators_present)}")
            else:
                # Avoid rendering a bare "- " bullet when nothing matched.
                st.write("- None of the known tech keywords were found.")
    else:
        st.warning("Please enter a job description to analyze.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
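# Run the Streamlit app from a terminal. This is a shell command, not Python:
# executing it bare in a cell raises SyntaxError (prefix it with `!` to run it
# from a notebook). The path contains a space, so it must be quoted.

#streamlit run "/home/brandon/Fake Job/fake_job_postings.py"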

SyntaxError: invalid syntax (2160483556.py, line 3)