In [2]:
import pandas as pd

# Read the train / validation / test parquet files into one DataFrame each.
train_df, val_df, test_df = (
    pd.read_parquet(path)
    for path in ("train.parquet", "validation.parquet", "test.parquet")
)

# Cell output: the (rows, columns) shape of every split.
tuple(frame.shape for frame in (train_df, val_df, test_df))

((10816, 6), (1909, 6), (3182, 6))

In [4]:
train_df.head()

Unnamed: 0,title,salary_range,description,required_experience,required_education,fraudulent
0,Electrical and Instrumentation Maintenance Tec...,,Maintenance Electrical/Instrumentation Technic...,Entry level,Vocational,0
1,Product Designer,,"Mossio is growing! As a result, we’re looking ...",Mid-Senior level,Unspecified,0
2,CSR Workshop Coordinator,0-0,Job briefWe are looking for a pofessional Trai...,Internship,Bachelor's Degree,0
3,Senior Supply Network Planner,,Supply network planning; being responsible for...,,,0
4,C# Developer,,We are looking for a C# developer who has star...,Associate,Unspecified,0


In [6]:
import pandas as pd

def add_all_text(df):
    """Return a copy of ``df`` with an extra ``all_text`` column.

    ``all_text`` is the space-joined concatenation of the ``title``,
    ``description``, ``required_experience`` and ``required_education``
    columns, in that order. Missing values are treated as empty strings, so
    a missing field still contributes its separating space.

    The input frame is left untouched.
    """
    result = df.copy()

    source_columns = ["title", "description", "required_experience", "required_education"]
    # Replace missing values first so the join only ever sees strings.
    text_parts = result[source_columns].fillna("").astype(str)
    result["all_text"] = text_parts.agg(" ".join, axis=1)

    return result

# Names of the combined-text column and the target column.
TEXT_COL = "all_text"
LABEL_COL = "fraudulent"

# Attach the combined text column to each of the three splits.
train_df, val_df, test_df = map(add_all_text, (train_df, val_df, test_df))

In [20]:
import re

def clean_text(text):
    """Normalise one piece of job-posting text before vectorisation.

    Steps, in order:

    1. lowercase the text;
    2. replace URLs with the placeholder ``URL``;
    3. remove HTML tags (a tag that carried a URL keeps the placeholder);
    4. replace e-mail addresses with ``EMAIL``;
    5. replace phone-like digit groups with ``PHONE``;
    6. collapse runs of whitespace and strip both ends.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN return ``""``; any other non-string
        value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text. Placeholders are upper-case while everything else
        is lower-case.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself; pandas uses it
        # for missing values, and calling .lower() on it would raise.
        if isinstance(text, float) and text != text:
            return ""
        text = str(text)

    text = text.lower()

    # Stop the URL at quotes and angle brackets. With a plain \S+ a link inside
    # markup (href="http://...">label</a>) also swallowed the rest of the tag,
    # which left a dangling '<a href="' behind and lost the link text.
    text = re.sub(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+", " URL ", text)

    # The text is already lower-case, so an upper-case "URL" inside a tag can
    # only be the placeholder inserted above; keep it instead of dropping the
    # link together with its tag.
    text = re.sub(
        r"<.*?>",
        lambda tag: " URL " if "URL" in tag.group(0) else " ",
        text,
    )

    text = re.sub(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", " EMAIL ", text)

    # NOTE(review): this pattern accepts any 6+ digit grouping, so inputs such
    # as a year range ("2014-2015") are also turned into PHONE.
    text = re.sub(r"\b(\+?\d{1,3}[\s-]?)?(\(?\d{2,4}\)?[\s-]?)?\d{3,4}[\s-]?\d{3,4}\b", " PHONE ", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text

def _prepare_split(frame):
    """Drop rows missing text or label, add ``clean_text`` and cast the label to int.

    Returns a new frame; ``frame`` itself is not modified.
    """
    complete_rows = frame.dropna(subset=[TEXT_COL, LABEL_COL])
    return complete_rows.assign(
        **{
            "clean_text": complete_rows[TEXT_COL].apply(clean_text),
            LABEL_COL: complete_rows[LABEL_COL].astype(int),
        }
    )


# Apply the same preparation to every split.
train_df, val_df, test_df = (
    _prepare_split(split) for split in (train_df, val_df, test_df)
)

# Cell output: class balance of the training labels.
train_df[LABEL_COL].value_counts()

fraudulent
0    10348
1      468
Name: count, dtype: int64

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF; terms must appear in at least 2 documents and in
# at most 95% of them, and term frequencies are scaled sub-linearly.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.95, sublinear_tf=True)

# The vocabulary and IDF weights are learned from the training split only;
# the validation and test splits are merely projected onto them.
X_train = tfidf.fit_transform(train_df["clean_text"])
X_val, X_test = (tfidf.transform(split["clean_text"]) for split in (val_df, test_df))

# Matching label vectors for each split.
y_train, y_val, y_test = (split[LABEL_COL] for split in (train_df, val_df, test_df))

# Cell output: (n_samples, n_features) of every feature matrix.
tuple(matrix.shape for matrix in (X_train, X_val, X_test))

((10816, 193147), (1909, 193147), (3182, 193147))

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Fixed seed for estimators that use randomness while fitting. Without it the
# decision tree (which permutes candidate features at each split) can produce
# a different tree, and therefore a different report, on every kernel restart.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the label used in the printed reports.
models = {
    "Logistic_Regression": LogisticRegression(max_iter=2000, class_weight="balanced"),
    "Decision_Tree": DecisionTreeClassifier(
        max_depth=40, min_samples_leaf=2, random_state=RANDOM_STATE
    ),
    "KNN": KNeighborsClassifier(n_neighbors=5, metric="cosine")
}

# Fit every model on the training features and print its classification
# report for the test split.
# NOTE(review): these numbers come from the held-out test split. If they are
# used to choose between models, the test metrics stop being an unbiased
# estimate - prefer the validation split for model selection.
for name, model in models.items():
    print(f"\nReport of {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))


Report of Logistic_Regression
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3044
           1       0.65      0.72      0.69       138

    accuracy                           0.97      3182
   macro avg       0.82      0.85      0.84      3182
weighted avg       0.97      0.97      0.97      3182


Report of Decision_Tree
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3044
           1       0.75      0.57      0.65       138

    accuracy                           0.97      3182
   macro avg       0.87      0.78      0.82      3182
weighted avg       0.97      0.97      0.97      3182


Report of KNN
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3044
           1       0.83      0.56      0.67       138

    accuracy                           0.98      3182
   macro avg       0.90      0.78      0.83      3182
wei

In [35]:
from sklearn.metrics import classification_report

# Print a classification report on the validation split for every model in
# `models`. The models are only asked to predict here, so they must already
# have been fitted.
for name in models:
    model = models[name]
    print(f"\nValidation Report of {name}")
    y_val_pred = model.predict(X_val)
    validation_report = classification_report(y_val, y_val_pred)
    print(validation_report)


Validation Report of Logistic_Regression
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1826
           1       0.56      0.64      0.60        83

    accuracy                           0.96      1909
   macro avg       0.77      0.81      0.79      1909
weighted avg       0.96      0.96      0.96      1909


Validation Report of Decision_Tree
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1826
           1       0.65      0.45      0.53        83

    accuracy                           0.97      1909
   macro avg       0.81      0.72      0.76      1909
weighted avg       0.96      0.97      0.96      1909


Validation Report of KNN
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1826
           1       0.84      0.49      0.62        83

    accuracy                           0.97      1909
   macro avg       0.91 