In [14]:
import numpy as np
import pandas as pd
import re
import pickle
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [15]:
# Locations of the two labelled internship corpora, relative to this notebook.
fake_path = Path("../data/fake_internships.csv")
real_path = Path("../data/real_internships.csv")

# Read both CSVs in one pass; the order (fake, real) matches the tuple on the right.
fake, real = (pd.read_csv(csv_path) for csv_path in (fake_path, real_path))

# Quick size check so class imbalance is visible before any modelling.
print("Fake samples:", fake.shape[0])
print("Real samples:", real.shape[0])


Fake samples: 69
Real samples: 64


In [16]:

# Canonical name of the free-text column; later cells index the frames with it.
TEXT_COLUMN = "text"

# The first column of each CSV is treated as the posting text and renamed to
# TEXT_COLUMN. Labels: 1 = fake posting, 0 = real posting.
fake = fake.rename(columns={fake.columns[0]: TEXT_COLUMN}).assign(label=1)
real = real.rename(columns={real.columns[0]: TEXT_COLUMN}).assign(label=0)


In [17]:
# Stack the fake and real frames row-wise and renumber the index from 0.
data = pd.concat(objs=[fake, real], axis=0, ignore_index=True)

def basic_clean(text):
    """Normalise one posting for vectorisation.

    Lower-cases the text, collapses every run of whitespace into a single
    space and trims both ends.

    Args:
        text: The raw posting. Non-string values are converted with ``str``.
            Missing values (``None`` or float NaN) yield an empty string.

    Returns:
        The cleaned string.
    """
    # Without this guard str() would turn a missing cell into the literal
    # token "none" / "nan", which would then enter the TF-IDF vocabulary.
    # `text != text` is the NaN test (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r"\s+", " ", str(text).lower()).strip()

# Clean every posting in place on the combined frame.
data[TEXT_COLUMN] = data[TEXT_COLUMN].map(basic_clean)

# Features are the cleaned text; the target is the 0/1 label column.
X, y = data[TEXT_COLUMN], data["label"]

# Class counts after concatenation.
print(y.value_counts())


label
1    69
0    64
Name: count, dtype: int64


In [18]:
# Hold out 20% of the postings for testing; stratifying on y keeps the
# fake/real ratio the same in both splits, and the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)


In [19]:
# Unigram + bigram TF-IDF, English stop words removed, vocabulary capped at 5000.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit on the training split only, then reuse the fitted vocabulary for the
# test split (the left element of the tuple is evaluated first).
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [20]:
# Logistic-regression baseline; fit() returns the estimator itself, so the
# bound name is the fitted model.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_vec, y_train)

# Show the fitted estimator as the cell output.
model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [21]:
# Score the held-out split.
y_pred = model.predict(X_test_vec)

# Overall accuracy followed by per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))


Accuracy: 0.9259259259259259
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.88      1.00      0.93        14

    accuracy                           0.93        27
   macro avg       0.94      0.92      0.93        27
weighted avg       0.94      0.93      0.93        27



In [22]:
# Persist the fitted model and vectoriser to the current working directory.
# NOTE(review): a later cell in this notebook writes joblib artifacts to the
# same two filenames, so these files are overwritten on a full run.
Path("model.pkl").write_bytes(pickle.dumps(model))
Path("vectorizer.pkl").write_bytes(pickle.dumps(vectorizer))

print("Model and vectorizer saved successfully")


Model and vectorizer saved successfully


In [23]:
import pandas as pd

# Load the job-postings dataset from the directory above the notebook.
df = pd.read_csv(filepath_or_buffer="../fake_job_postings.csv")

# (rows, columns), then a preview of the first rows as the cell output.
print(df.shape)
df.head()

(17880, 18)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI â€“ Environmental Systems Rese...,"EDUCATION:Â Bachelorâ€™s or Masterâ€™s in GIS, busi...",Our culture is anything but corporateâ€”we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [24]:
# Free-text fields that will feed the classifier, plus the target column.
use_cols = [
    "title",
    "company_profile",
    "description",
    "requirements",
    "benefits",
    "fraudulent",
]

# Keep only those columns and replace missing values with empty strings so the
# text fields can be concatenated later.
df = df.loc[:, use_cols].fillna(value="")
print(df.shape)
df.head(2)

(17880, 6)


Unnamed: 0,title,company_profile,description,requirements,benefits,fraudulent
0,Marketing Intern,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0
1,Customer Service - Cloud Video Production,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0


In [25]:
# Join the five text fields into one document per posting, separated by single
# spaces, in the order: title, company profile, description, requirements, benefits.
combined = df["title"]
for field in ("company_profile", "description", "requirements", "benefits"):
    combined = combined + " " + df[field]
df["text"] = combined

# From here on only the combined text and the target are needed.
df = df[["text", "fraudulent"]]
df.head(2)

Unnamed: 0,text,fraudulent
0,"Marketing Intern We're Food52, and we've creat...",0
1,Customer Service - Cloud Video Production 90 S...,0


In [26]:
import re

def clean_text(t: str) -> str:
    """Normalise a job posting before TF-IDF vectorisation.

    Steps, in order:
      1. convert to ``str`` and lower-case;
      2. replace every ``http(s)://...`` or ``www....`` run with the token ``url``;
      3. replace every character that is not a-z, 0-9, the rupee sign or
         whitespace with a space;
      4. collapse whitespace runs to one space and trim both ends.

    Args:
        t: Raw posting text (any object; it is passed through ``str``).

    Returns:
        The cleaned, lower-case string.
    """
    t = str(t).lower()
    # The placeholder must be lower-case: the character filter below only keeps
    # a-z, so an upper-case "URL" token would be wiped out again and the
    # information that a link was present would be lost.
    t = re.sub(r"(https?://\S+|www\.\S+)", " url ", t)
    # \u20b9 is the Indian rupee sign; written as an escape so the pattern does
    # not depend on how this file is encoded.
    t = re.sub(r"[^a-z0-9\u20b9\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

# `df` was produced by a column-list selection in an earlier cell; assigning a
# column into such a frame can raise pandas' SettingWithCopyWarning. Building a
# new frame with assign() avoids that and leaves the previous object untouched.
df = df.assign(text=df["text"].apply(clean_text))

# Distribution of cleaned document lengths (in characters).
df["text"].str.len().describe()

count    17880.000000
mean      2595.592450
std       1421.144527
min         14.000000
25%       1555.000000
50%       2457.000000
75%       3387.250000
max      14599.000000
Name: text, dtype: float64

In [27]:
from sklearn.model_selection import train_test_split

# Features: cleaned posting text. Target: the `fraudulent` column.
X, y = df["text"], df["fraudulent"]

# 80/20 split, stratified on the target so both splits keep the same class
# proportions; seeded for a repeatable shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print("Train:", X_train.shape[0], "Test:", X_test.shape[0])
print("Label balance (train):\n", y_train.value_counts(normalize=True))


Train: 14304 Test: 3576
Label balance (train):
 fraudulent
0    0.951552
1    0.048448
Name: proportion, dtype: float64


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF; terms seen in fewer than 2 documents are dropped and
# the vocabulary is capped at 20000 features.
vectorizer = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 2),
    max_features=20000,
)

# Fit on the training split only, then project the test split with the same
# vocabulary (the left element of the tuple is evaluated first).
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

# Shapes of the two sparse matrices as the cell output.
tuple(matrix.shape for matrix in (X_train_vec, X_test_vec))

((14304, 20000), (3576, 20000))

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

# Candidate classifiers. The two linear models use class_weight="balanced".
# random_state is fixed (same seed as the train/test split) so repeated runs of
# this comparison give the same numbers.
models = {
    "LogisticRegression_balanced": LogisticRegression(
        max_iter=600, class_weight="balanced", random_state=42
    ),
    "NaiveBayes": MultinomialNB(),
    "LinearSVC_balanced": LinearSVC(class_weight="balanced", random_state=42),
}

# NOTE(review): every candidate is scored on the test split and the winner is
# later chosen from these scores, so the reported test metrics of the selected
# model are optimistic. Consider selecting on a validation split or with
# cross-validation on the training data.
results = []

for name, m in models.items():
    m.fit(X_train_vec, y_train)
    pred = m.predict(X_test_vec)
    acc = accuracy_score(y_test, pred)
    # f1_score with its defaults scores the positive class (label 1).
    f1 = f1_score(y_test, pred)
    results.append((name, acc, f1))

results_df = pd.DataFrame(results, columns=["model", "accuracy", "f1"])

# Leaderboard, best F1 first, as the cell output.
results_df.sort_values(by="f1", ascending=False)


Unnamed: 0,model,accuracy,f1
2,LinearSVC_balanced,0.989374,0.886228
0,LogisticRegression_balanced,0.977349,0.794937
1,NaiveBayes,0.966163,0.485106


In [30]:
from sklearn.metrics import classification_report, confusion_matrix

# Name of the candidate with the highest F1 in the comparison table.
best_name = results_df.sort_values(by="f1", ascending=False)["model"].iloc[0]
print("Best model:", best_name)

# Look up the already-fitted estimator and score the test split with it.
best_model = models[best_name]
pred = best_model.predict(X_test_vec)

# Rows of the confusion matrix are true labels, columns are predictions.
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=pred))
print("\nReport:\n", classification_report(y_true=y_test, y_pred=pred))

Best model: LinearSVC_balanced
Confusion Matrix:
 [[3390   13]
 [  25  148]]

Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3403
           1       0.92      0.86      0.89       173

    accuracy                           0.99      3576
   macro avg       0.96      0.93      0.94      3576
weighted avg       0.99      0.99      0.99      3576



In [31]:
import joblib
from pathlib import Path

# Directory the artifacts are written to. "." means the kernel's current
# working directory, whatever that is when the cell runs.
ARTIFACT_DIR = Path(".")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

model_path = ARTIFACT_DIR / "model.pkl"
vectorizer_path = ARTIFACT_DIR / "vectorizer.pkl"

# NOTE(review): these filenames are the same ones the earlier pickle cell
# writes, so on a full run those files are replaced by the artifacts below.
joblib.dump(best_model, model_path)
joblib.dump(vectorizer, vectorizer_path)

# Report the locations actually written instead of a hard-coded directory name.
print(f"Saved: {model_path.resolve()} and {vectorizer_path.resolve()}")

Saved: ml/model.pkl and ml/vectorizer.pkl âœ…


In [32]:
def predict_risk(text: str) -> float:
    """Score a single posting with the selected model.

    The text is cleaned with ``clean_text``, vectorised with the fitted
    ``vectorizer`` and scored by ``best_model``.

    Args:
        text: Raw posting text.

    Returns:
        A plain ``float`` between 0 and 1. When the model exposes
        ``predict_proba`` this is the probability of class index 1. Otherwise
        it is the logistic sigmoid of ``decision_function``; that is only a
        monotone squashing of the margin, not a calibrated probability.
    """
    cleaned = clean_text(text)
    features = vectorizer.transform([cleaned])

    if hasattr(best_model, "predict_proba"):
        return float(best_model.predict_proba(features)[0, 1])

    # Models without predict_proba (e.g. LinearSVC): squash the margin.
    score = float(best_model.decision_function(features)[0])

    # Numerically stable sigmoid: only ever exponentiate a non-positive number
    # so large-magnitude margins cannot overflow.
    if score >= 0:
        return float(1.0 / (1.0 + np.exp(-score)))
    exp_score = np.exp(score)
    return float(exp_score / (1.0 + exp_score))

# Two hand-written postings for a smoke test: the first asks for an upfront
# payment, the second describes an ordinary application process.
samples = [
    "Pay â‚¹1999 registration fee today via UPI to confirm your internship.",
    "Apply through our official careers portal. Interview rounds will follow.",
]

# Print "<score> | <text>" for each sample, score rounded to 3 decimals.
for posting in samples:
    risk = round(predict_risk(posting), 3)
    print(risk, "|", posting)

0.449 | Pay â‚¹1999 registration fee today via UPI to confirm your internship.
0.345 | Apply through our official careers portal. Interview rounds will follow.
