In [2]:
import pandas as pd
import numpy as np
import joblib
import faiss

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
df1 = pd.read_csv("datasets/fake_job_postings.csv")
df2 = pd.read_csv("datasets/Fake Postings.csv")

In [5]:
print("Dataset 1 shape:", df1.shape)
print("Dataset 2 shape:", df2.shape)

Dataset 1 shape: (17880, 18)
Dataset 2 shape: (10000, 10)


In [6]:
df1.columns = df1.columns.str.lower()
df2.columns = df2.columns.str.lower()

In [7]:
common_cols = list(set(df1.columns).intersection(set(df2.columns)))

In [8]:
df1 = df1[common_cols]
df2 = df2[common_cols]

In [9]:
df = pd.concat([df1, df2], ignore_index=True)

# Remove duplicates
df = df.drop_duplicates()

In [10]:
print("Final merged dataset shape:", df.shape)

Final merged dataset shape: (27566, 10)


In [11]:
print(df.columns)

Index(['requirements', 'benefits', 'company_profile', 'industry',
       'description', 'salary_range', 'fraudulent', 'location', 'title',
       'employment_type'],
      dtype='object')


In [12]:
text_cols = [
    "title",
    "company_profile",
    "description",
    "requirements",
    "benefits",
    "employment_type",
    "industry"
]

In [13]:
df[text_cols] = df[text_cols].fillna("")

In [14]:
df.head()

Unnamed: 0,requirements,benefits,company_profile,industry,description,salary_range,fraudulent,location,title,employment_type
0,Experience with content management systems a m...,,"We're Food52, and we've created a groundbreaki...",,"Food52, a fast-growing, James Beard Award-winn...",,0,"US, NY, New York",Marketing Intern,Other
1,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,"90 Seconds, the worlds Cloud Video Production ...",Marketing and Advertising,Organised - Focused - Vibrant - Awesome!Do you...,,0,"NZ, , Auckland",Customer Service - Cloud Video Production,Full-time
2,Implement pre-commissioning and commissioning ...,,Valor Services provides Workforce Solutions th...,,"Our client, located in Houston, is actively se...",,0,"US, IA, Wever",Commissioning Machinery Assistant (CMA),
3,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,Our passion for improving quality of life thro...,Computer Software,THE COMPANY: ESRI – Environmental Systems Rese...,,0,"US, DC, Washington",Account Executive - Washington DC,Full-time
4,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,SpotSource Solutions LLC is a Global Human Cap...,Hospital & Health Care,JOB TITLE: Itemization Review ManagerLOCATION:...,,0,"US, FL, Fort Worth",Bill Review Manager,Full-time


In [15]:
df["combined_text"] = df[text_cols].agg(" ".join, axis=1)

In [16]:
df.head()

Unnamed: 0,requirements,benefits,company_profile,industry,description,salary_range,fraudulent,location,title,employment_type,combined_text
0,Experience with content management systems a m...,,"We're Food52, and we've created a groundbreaki...",,"Food52, a fast-growing, James Beard Award-winn...",,0,"US, NY, New York",Marketing Intern,Other,"Marketing Intern We're Food52, and we've creat..."
1,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,"90 Seconds, the worlds Cloud Video Production ...",Marketing and Advertising,Organised - Focused - Vibrant - Awesome!Do you...,,0,"NZ, , Auckland",Customer Service - Cloud Video Production,Full-time,Customer Service - Cloud Video Production 90 S...
2,Implement pre-commissioning and commissioning ...,,Valor Services provides Workforce Solutions th...,,"Our client, located in Houston, is actively se...",,0,"US, IA, Wever",Commissioning Machinery Assistant (CMA),,Commissioning Machinery Assistant (CMA) Valor ...
3,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,Our passion for improving quality of life thro...,Computer Software,THE COMPANY: ESRI – Environmental Systems Rese...,,0,"US, DC, Washington",Account Executive - Washington DC,Full-time,Account Executive - Washington DC Our passion ...
4,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,SpotSource Solutions LLC is a Global Human Cap...,Hospital & Health Care,JOB TITLE: Itemization Review ManagerLOCATION:...,,0,"US, FL, Fort Worth",Bill Review Manager,Full-time,Bill Review Manager SpotSource Solutions LLC i...


In [17]:
df["label"] = df["fraudulent"].astype(int)

In [18]:
df.head(1)

Unnamed: 0,requirements,benefits,company_profile,industry,description,salary_range,fraudulent,location,title,employment_type,combined_text,label
0,Experience with content management systems a m...,,"We're Food52, and we've created a groundbreaki...",,"Food52, a fast-growing, James Beard Award-winn...",,0,"US, NY, New York",Marketing Intern,Other,"Marketing Intern We're Food52, and we've creat...",0


In [19]:
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [20]:
embeddings = embedder.encode(
    df["combined_text"].tolist(),
    show_progress_bar=True,
    batch_size=32
)


Batches:   0%|          | 0/862 [00:00<?, ?it/s]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)

In [22]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [23]:
print("\nMODEL REPORT\n")
print(classification_report(y_test, clf.predict(X_test)))


MODEL REPORT

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3343
           1       0.98      0.95      0.96      2171

    accuracy                           0.97      5514
   macro avg       0.97      0.97      0.97      5514
weighted avg       0.97      0.97      0.97      5514



In [24]:
joblib.dump(clf, "job_scam_classifier.pkl")
joblib.dump(embedder, "text_embedder.pkl")

['text_embedder.pkl']

In [25]:
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [26]:
faiss.write_index(index, "job_memory.index")
joblib.dump(df[["combined_text", "label"]], "job_memory_meta.pkl")

print("\n ALL FINAL PKL FILES CREATED")


 ALL FINAL PKL FILES CREATED


In [27]:
# import numpy as np
# import torch
# import faiss
# from sentence_transformers import SentenceTransformer

# print("NumPy OK:", np.__version__)
# print("Torch OK:", torch.__version__)

# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# emb = model.encode(["test sentence"])
# print("Embedding shape:", emb.shape)
