In [None]:
# Mount Google Drive at /content/drive for this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer # Importing TfidfVectorizer
from imblearn.over_sampling import SMOTE # Importing SMOTE


In [None]:
# Read the three source CSVs from Drive. The order is significant: the list
# positions are what identify each dataset from here on.
datasets = [
    pd.read_csv(f'/content/drive/MyDrive/{csv_name}')
    for csv_name in ('Fake_Postings.csv', 'Pakistan_Job_Postings.csv', 'Job_Title_Des.csv')
]

In [None]:
# Every row of the second and third datasets is labelled 0 (not fraudulent).
# NOTE(review): if the first dataset supplies all of the fraudulent rows, the
# label coincides with the source file, so a model could learn "which dataset"
# rather than "is this fraudulent" — confirm before trusting downstream metrics.
for real_jobs_idx in (1, 2):
    datasets[real_jobs_idx]['fraudulent'] = 0

# Stack the frames, keep only the columns used for modelling, and drop rows
# that are missing any of them.
df = (
    pd.concat(datasets, ignore_index=True)[['title', 'description', 'fraudulent']]
    .dropna()
)

# Model input is title and description joined by a single space.
df['text'] = df['title'] + ' ' + df['description']

# Whitespace-token count, used to discard postings of five words or fewer.
df['text_length'] = df['text'].map(lambda text: len(text.split()))
df = df[df['text_length'] > 5]

X = df['text']
y = df['fraudulent']

In [None]:
# Show a compact summary of each source frame rather than printing the whole
# list of DataFrames, which floods the cell output.
for position, frame in enumerate(datasets):
    print(f"Dataset {position}: shape={frame.shape}")
    print(frame.head())

[                                             title  \
0                              Mental health nurse   
1                        Conference centre manager   
2                                   Engineer, land   
3                          Forest/woodland manager   
4     Production designer, theatre/television/film   
...                                            ...   
9995                           Designer, furniture   
9996                Therapist, speech and language   
9997                             Therapist, sports   
9998                   Clinical research associate   
9999                           Hospital pharmacist   

                                            description  \
0     Arm drive court sure vote. Earn $5000/week! Im...   
1     Government whom its bed go tax tree black. Ear...   
2     I member discuss follow way there nation. Earn...   
3     House across wait approach face. Earn $5000/we...   
4     Case best environmental full finally leader me...

In [None]:
# Number of rows per label value, most frequent first.
class_counts = df.loc[:, 'fraudulent'].value_counts()
print("\nClass Distribution:\n", class_counts)


Class Distribution:
 fraudulent
1    10000
0     8957
Name: count, dtype: int64


In [None]:
# Size of the most frequent class relative to the least frequent one
# (1.0 means perfectly balanced).
largest_class, smallest_class = class_counts.max(), class_counts.min()
imbalance_ratio = largest_class / smallest_class
print(f"\nImbalance Ratio: {imbalance_ratio:.2f}")


Imbalance Ratio: 1.12


In [None]:
# Re-balance only when the largest class is more than 1.5x the smallest.
if imbalance_ratio > 1.5:
    # SMOTE interpolates new points in TF-IDF feature space; those synthetic
    # vectors have no raw text behind them, and a sparse matrix's `.indices`
    # attribute holds column indices of non-zero entries, not row positions.
    # The tokenizer needs real strings, so balance by re-sampling existing
    # minority rows instead, which keeps every text paired with its label.
    # NOTE(review): duplicated rows created here can end up on both sides of
    # any later train/validation split.
    print("\nApplying random oversampling to balance classes...")
    labelled_text = pd.DataFrame({'text': X, 'fraudulent': y})
    target_count = labelled_text['fraudulent'].value_counts().max()
    extra_rows = [
        group.sample(n=target_count - len(group), replace=True, random_state=42)
        for _, group in labelled_text.groupby('fraudulent')
        if len(group) < target_count
    ]
    balanced = pd.concat([labelled_text, *extra_rows], ignore_index=True)
    X_resampled_text = balanced['text']
    y_resampled = balanced['fraudulent']
    print("Oversampling applied successfully!")
else:
    print("No significant class imbalance detected. Proceeding without SMOTE.")
    X_resampled_text = X
    y_resampled = y

No significant class imbalance detected. Proceeding without SMOTE.


In [None]:

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# The tokenizer is given a plain list of strings, so unwrap a pandas Series
# before encoding.
if isinstance(X_resampled_text, pd.Series):
    X_resampled_text = X_resampled_text.tolist()

# Encode the whole corpus in one call: over-long texts are truncated and every
# sequence is padded to the longest one in the corpus.
texts_to_encode = list(X_resampled_text)
tokenized_inputs = tokenizer(
    texts_to_encode,
    truncation=True,
    padding="longest",
    return_tensors="pt",
)

encoded_ids = tokenized_inputs['input_ids']
print(f"\nTokenized Input IDs Shape: {encoded_ids.shape}")
print(f"Sample Tokenized Output:\n{encoded_ids[0]}")

NameError: name 'RobertaTokenizer' is not defined

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the label values onto consecutive integer class ids, then wrap them in a
# long tensor.
y_encoder = LabelEncoder()
encoded_labels = y_encoder.fit_transform(y_resampled)
y_labels = torch.tensor(encoded_labels, dtype=torch.long)


In [None]:
 # Labels as a long tensor, taken directly from the Series' underlying array.
 y_tensor = torch.tensor(y_resampled.to_numpy(), dtype=torch.long)

In [None]:
class JobPostingDataset(Dataset):
    """Pairs tokenizer output with labels, one example per index.

    Defined here because nothing else in the notebook provides it; without it
    this cell raises NameError on a freshly restarted kernel.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a tensor
            whose first dimension indexes examples.
        labels: Tensor of class ids aligned with ``encodings`` along dim 0.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One dict per example; the keys become keyword arguments of the
        # model's forward() when batches are fed through a Trainer.
        item = {key: tensor[idx] for key, tensor in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item


dataset = JobPostingDataset(tokenized_inputs, y_tensor)
print(f"Total samples in dataset: {len(dataset)}")
print(f"Sample data: {dataset[0]}")

Total samples in dataset: 18957
Sample data: {'input_ids': tensor([    0,   448, 13589,   474,  9008, 10617,  1305,   461,   686,   900,
            4,  7535,    68, 31830,    73,  3583,   328,  5902, 30771,  5947,
            4,  4493,   122,    23, 44009,  2518,  1039, 14551,     4,   175,
            4,     2,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
     

In [None]:
# 80/20 train/validation split. The explicitly seeded generator makes the
# split identical across kernel restarts, so validation metrics are
# reproducible instead of depending on the global RNG state.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_set, val_set = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:


class RoBERTaBiLSTM(nn.Module):
    """RoBERTa encoder (run without gradients) + BiLSTM + linear classifier.

    Args:
        roberta_model: Name or path handed to ``RobertaModel.from_pretrained``.
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output logits.
        dropout: Dropout probability applied before the final linear layer.
    """

    def __init__(self, roberta_model='roberta-base', hidden_dim=256, num_classes=2, dropout=0.3):
        super(RoBERTaBiLSTM, self).__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model)
        # Take the encoder width from its config instead of hard-coding 768, so
        # checkpoints with a different hidden size also work.
        self.lstm = nn.LSTM(
            input_size=self.roberta.config.hidden_size,
            hidden_size=hidden_dim,
            bidirectional=True,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return logits, or ``(loss, logits)`` when ``labels`` is given.

        Args:
            input_ids: Token ids, batch first.
            attention_mask: 1 for real tokens, 0 for padding, same shape as
                ``input_ids``.
            labels: Optional class ids; when present a cross-entropy loss is
                computed and returned ahead of the logits.
        """
        # The encoder is evaluated without gradient tracking, so only the LSTM
        # and the classifier receive gradients.
        with torch.no_grad():
            roberta_output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        lstm_input = roberta_output.last_hidden_state

        # Indexing position -1 would read a padding position whenever a
        # sequence is shorter than the batch width. Pack by true length so the
        # LSTM never sees padding, then read each row at its last real token.
        # For unpadded input this is identical to taking position -1.
        # NOTE(review): assumes padding is on the right — confirm tokenizer config.
        lengths = attention_mask.sum(dim=1).long().clamp(min=1)
        packed_input = nn.utils.rnn.pack_padded_sequence(
            lstm_input, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed_input)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        batch_index = torch.arange(lstm_out.size(0), device=lstm_out.device)
        last_token_index = lengths.to(lstm_out.device) - 1
        lstm_out = lstm_out[batch_index, last_token_index]
        output = self.fc(self.dropout(lstm_out))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(output, labels)

        # Tuple form puts the loss first and the logits second.
        return (loss, output) if loss is not None else output




In [None]:

from transformers import Trainer, TrainingArguments

model = RoBERTaBiLSTM()

# Hyper-parameters gathered in one mapping so they read as a single table.
training_config = dict(
    # where checkpoints and logs go
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # evaluate and checkpoint once per epoch, keeping at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    save_steps=1000,
    load_best_model_at_end=True,
    logging_steps=500,
)
training_args = TrainingArguments(**training_config)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class CustomTrainer(Trainer):
    """Trainer that recomputes cross-entropy loss from the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute cross-entropy between the model's logits and ``inputs['labels']``.

        Args:
            model: The model being trained.
            inputs: Batch dict forwarded to the model as keyword arguments.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this argument do not fail; not used here.
        """
        labels = inputs.get("labels")

        outputs = model(**inputs)

        # The model returns (loss, logits) when labels are supplied and bare
        # logits otherwise.
        logits = outputs[1] if isinstance(outputs, tuple) else outputs

        # Take the class count from the logits themselves: the wrapped model is
        # a plain nn.Module and has no `.config.num_labels` to consult.
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set
)

In [None]:
!pip install wandb
import os
os.environ["WANDB_DISABLED"] = "true"



In [None]:
from transformers import Trainer, TrainingArguments

# Start from a freshly initialised model and hand it to a stock Trainer,
# together with the training arguments and the two dataset splits.
model = RoBERTaBiLSTM()
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run the training loop. As the last expression in the cell, the value
# returned by train() is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0029,0.020066
2,0.0,3.2e-05
3,0.0,1.5e-05


TrainOutput(global_step=11376, training_loss=0.014734162980094277, metrics={'train_runtime': 2151.0782, 'train_samples_per_second': 21.15, 'train_steps_per_second': 5.289, 'total_flos': 0.0, 'train_loss': 0.014734162980094277, 'epoch': 3.0})

In [None]:
# Persist the model weights and the tokenizer files side by side on Drive.
model_dir = '/content/drive/MyDrive/model'
os.makedirs(model_dir, exist_ok=True)

weights_path = os.path.join(model_dir, 'model_state_dict.pth')
torch.save(model.state_dict(), weights_path)

tokenizer.save_pretrained(model_dir)

('/content/drive/MyDrive/model/tokenizer_config.json',
 '/content/drive/MyDrive/model/special_tokens_map.json',
 '/content/drive/MyDrive/model/vocab.json',
 '/content/drive/MyDrive/model/merges.txt',
 '/content/drive/MyDrive/model/added_tokens.json')

In [None]:
  # Evaluate with the trainer and keep the returned metrics dict in
  # `eval_results`; it is printed in full below.
  print("\nEvaluating the model on validation data...")
  eval_results = trainer.evaluate()
  print(eval_results)


Evaluating the model on validation data...


{'eval_loss': 1.530390181869734e-05, 'eval_runtime': 118.8069, 'eval_samples_per_second': 31.917, 'eval_steps_per_second': 7.979, 'epoch': 3.0}


In [None]:
# The final log_history entry is not necessarily a training-step record, so
# scan backwards for the newest entry that actually carries a 'loss' key
# rather than inspecting only the last one.
training_loss = next(
    (entry['loss'] for entry in reversed(trainer.state.log_history) if 'loss' in entry),
    None,
)
validation_loss = eval_results.get('eval_loss', None)
print(f"Training Loss: {training_loss:.4f}" if training_loss is not None else "Training Loss not available")
print(f"Validation Loss: {validation_loss:.4f}" if validation_loss is not None else "Validation Loss not available")


Training Loss not available
Validation Loss: 0.0000


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np
import torch

# Run the model over the validation split and pull out the raw scores and the
# true labels.
predictions_output = trainer.predict(val_set)
logits, labels = predictions_output.predictions, predictions_output.label_ids

# The predicted class is the index of the largest score in each row.
predictions = np.argmax(logits, axis=-1)

accuracy = accuracy_score(labels, predictions)
# "weighted" averages the per-class scores by each class's support.
precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000


In [None]:
%%writefile predictor1.py
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaModel

class RoBERTa_LSTM(nn.Module):
    """RoBERTa encoder feeding a bidirectional LSTM and a linear classifier.

    Args:
        roberta_model: Name or path handed to ``RobertaModel.from_pretrained``.
        hidden_dim: Hidden size of each LSTM direction.
        num_labels: Number of output logits.
    """

    def __init__(self, roberta_model, hidden_dim=256, num_labels=2):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model)
        encoder_width = self.roberta.config.hidden_size
        self.lstm = nn.LSTM(
            input_size=encoder_width,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Both LSTM directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(2 * hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids."""
        # The encoder is evaluated without gradient tracking.
        with torch.no_grad():
            encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        token_states = encoded.last_hidden_state
        sequence_features, _ = self.lstm(token_states)
        # Classify from the LSTM output at the final sequence position.
        final_step = sequence_features[:, -1]
        return self.fc(final_step)


# Rebuild the architecture, then restore the saved weights onto the CPU.
MODEL_PATH = "/content/drive/MyDrive/model/model_state_dict.pth"
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RoBERTa_LSTM("roberta-base")

saved_weights = torch.load(MODEL_PATH, map_location=torch.device("cpu"))
model.load_state_dict(saved_weights)

# Switch to evaluation mode for inference.
model.eval()

def predict_job_fraud(job_description):
    """Classify one job posting as fake or real.

    Args:
        job_description: Text of the posting.

    Returns:
        A ``(label, confidence)`` pair: ``label`` is ``"Fake Job"`` when the
        predicted class is 1 and ``"Real Job"`` otherwise; ``confidence`` is
        the largest softmax probability.
    """
    encoded = tokenizer(
        job_description,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )
        class_probs = torch.softmax(logits, dim=-1)

    predicted_label = class_probs.argmax(dim=-1).item()
    confidence = class_probs.max().item()

    verdict = "Fake Job" if predicted_label == 1 else "Real Job"
    return verdict, confidence


Writing predictor1.py


In [None]:
# The %%writefile cell only saves predictor1.py to disk; import the function
# from that module so this cell also works on a freshly restarted kernel.
from predictor1 import predict_job_fraud

job_posting = input("Enter the job description: ")

prediction, confidence = predict_job_fraud(job_posting)

print(f"\nPrediction: {prediction} (Confidence: {confidence:.2f})")

Enter the job description: Arm drive court sure vote. Earn $5000/week! Immediate hiring. Contact now at david27@gmail.com.

Prediction: Fake Job (Confidence: 1.00)


In [None]:
# predict_job_fraud lives in predictor1.py (written by the %%writefile cell);
# importing it here keeps this cell runnable on its own after a kernel
# restart. A repeated import of an already-loaded module is a no-op.
from predictor1 import predict_job_fraud

job_posting = input("Enter the job description: ")

prediction, confidence = predict_job_fraud(job_posting)

print(f"\nPrediction: {prediction} (Confidence: {confidence:.2f})")

Enter the job description: This job is a remote working opportunity. We are a small service company located in Texas. We specialize in piano tuning, moving, repair, and restoration.Job Benefits Ability to work from home with flexible scheduling, only 1-2 hours of fixed schedul

Prediction: Real Job (Confidence: 1.00)
