<a href="https://colab.research.google.com/github/Ashraf1292/Thesis-Fake_jobs-/blob/main/grok_version2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q nltk scikit-learn xgboost transformers shap tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd


print("Please upload your dataset file (emscad_dataset.csv)...")
data = pd.read_csv('fake_job_postings.csv')

# Display basic information about the dataset
print(f"Dataset loaded with {data.shape[0]} rows and {data.shape[1]} columns")
print(f"Number of fraudulent jobs: {data['fraudulent'].sum()}")
print(f"Percentage of fraudulent jobs: {data['fraudulent'].mean()*100:.2f}%")


Please upload your dataset file (emscad_dataset.csv)...
Dataset loaded with 17880 rows and 18 columns
Number of fraudulent jobs: 866
Percentage of fraudulent jobs: 4.84%


In [3]:
# 2. Text Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import tqdm

# Download required NLTK resources with error handling
try:
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    print("NLTK resources downloaded successfully")
except Exception as e:
    print(f"Error downloading NLTK resources: {e}")
    # Alternative download method with SSL context
    import ssl
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Clean text by removing URLs, special characters, and normalizing."""
    if not isinstance(text, str):
        return ''
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase and strip whitespace
    text = text.lower().strip()

    # Simple word splitting instead of nltk tokenize to avoid potential issues
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2]
    return ' '.join(words)

# Handle NaN values
data['description'] = data['description'].fillna('')
data['company_profile'] = data['company_profile'].fillna('')

# Apply text cleaning with progress tracking
print("Cleaning job descriptions...")
tqdm.pandas()
data['clean_description'] = data['description'].progress_apply(clean_text)
print("Cleaning company profiles...")
data['clean_company_profile'] = data['company_profile'].progress_apply(clean_text)
print("Text cleaning completed")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


NLTK resources downloaded successfully
Cleaning job descriptions...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  0%|          | 0/17880 [00:00<?, ?it/s]

Cleaning company profiles...


  0%|          | 0/17880 [00:00<?, ?it/s]

Text cleaning completed


In [4]:
# 3. Metadata Feature Engineering
def profile_completeness(profile):
    """Calculate completeness of company profile."""
    if not isinstance(profile, str) or len(profile.strip()) == 0:
        return 0.0
    return min(1.0, len(profile.split()) / 100.0)  # Normalize and cap at 1.0

data['profile_completeness'] = data['company_profile'].apply(profile_completeness)

# Industry fraud likelihood (based on training data)
industry_fraud_rate = data.groupby('industry')['fraudulent'].mean().to_dict()
data['industry_fraud_likelihood'] = data['industry'].map(industry_fraud_rate).fillna(0.0)


In [5]:
# 4. Feature Extraction
import numpy as np
import gc
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

print("Extracting TF-IDF features...")
# TF-IDF for description and company profile
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
tfidf_desc = tfidf_vectorizer.fit_transform(data['clean_description'])
tfidf_profile = tfidf_vectorizer.transform(data['clean_company_profile'])
print(f"TF-IDF features extracted: {tfidf_desc.shape[1]} features")

# Check for available accelerators
use_gpu = tf.test.is_gpu_available() if hasattr(tf.test, 'is_gpu_available') else len(tf.config.list_physical_devices('GPU')) > 0
use_tpu = False

# Try to initialize TPU if available
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    use_tpu = True
    print(f"Running on TPU: {tpu.master()}")
except ValueError:
    # Fall back to GPU or CPU
    strategy = tf.distribute.get_strategy()
    print(f"TPU not available. Running on {'GPU' if use_gpu else 'CPU'}")

# Set BERT usage flag based on available hardware
use_bert = use_tpu or use_gpu  # Only use BERT if TPU or GPU is available
bert_embeddings = None

if use_bert:
    print("Using BERT for feature extraction")
    from transformers import AutoTokenizer, TFAutoModel
    from sklearn.decomposition import PCA

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    def get_bert_embeddings(texts, batch_size=16, max_length=128):
        """Generate DistilBERT embeddings with memory management and progress tracking."""
        embeddings = []
        total_batches = (len(texts) // batch_size) + (1 if len(texts) % batch_size else 0)

        # Create dataset for batching
        dataset = tf.data.Dataset.from_tensor_slices(texts).batch(batch_size)

        with strategy.scope():
            model = TFAutoModel.from_pretrained('distilbert-base-uncased')

        for i, batch_texts in enumerate(dataset):
            if i % 10 == 0:  # Print every 10 batches
                print(f"Processing BERT batch {i + 1}/{total_batches}")

            batch_texts = [text.decode('utf-8') if isinstance(text, bytes) else text
                         for text in batch_texts.numpy()]

            # Handle empty texts to prevent tokenizer errors
            batch_texts = [text if text.strip() else "empty" for text in batch_texts]

            try:
                inputs = tokenizer(batch_texts, return_tensors='tf', max_length=max_length,
                                  truncation=True, padding=True)
                outputs = model(inputs)
                batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
                embeddings.append(batch_embeddings)
            except Exception as e:
                print(f"Error in batch {i}: {e}")
                # Create zero embeddings as fallback
                batch_embeddings = np.zeros((len(batch_texts), model.config.dim))
                embeddings.append(batch_embeddings)

            # Clear memory
            if i % 20 == 0:  # Every 20 batches
                gc.collect()
                tf.keras.backend.clear_session()

        all_embeddings = np.vstack(embeddings)

        # Dimensionality reduction
        print("Performing PCA to reduce BERT dimensions...")
        pca = PCA(n_components=min(100, all_embeddings.shape[0], all_embeddings.shape[1]))
        reduced_embeddings = pca.fit_transform(all_embeddings)
        print(f"Reduced BERT embeddings from {all_embeddings.shape[1]} to {reduced_embeddings.shape[1]} dimensions")

        return reduced_embeddings

    try:
        # Get DistilBERT embeddings
        bert_embeddings = get_bert_embeddings(data['clean_description'].tolist(),
                                             batch_size=16 if use_tpu else 8,
                                             max_length=128)

        # Save embeddings to disk for future use
        import pickle
        with open('bert_embeddings.pkl', 'wb') as f:
            pickle.dump(bert_embeddings, f)
        print("BERT embeddings saved to disk")

    except Exception as e:
        print(f"Error generating BERT embeddings: {e}")
        print("Proceeding without BERT features")
        use_bert = False
        bert_embeddings = None
else:
    print("Skipping BERT feature extraction (no TPU/GPU available)")

# Combine features
print("Combining features...")
metadata_features = csr_matrix(data[['profile_completeness', 'industry_fraud_likelihood']].values)

if use_bert and bert_embeddings is not None:
    bert_sparse = csr_matrix(bert_embeddings)
    combined_features = hstack([tfidf_desc, tfidf_profile, bert_sparse, metadata_features])
    print(f"Combined features with BERT: {combined_features.shape}")
else:
    combined_features = hstack([tfidf_desc, tfidf_profile, metadata_features])
    print(f"Combined features without BERT: {combined_features.shape}")

Extracting TF-IDF features...


Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


TF-IDF features extracted: 5000 features
TPU not available. Running on CPU
Skipping BERT feature extraction (no TPU/GPU available)
Combining features...
Combined features without BERT: (17880, 10002)


In [6]:
# 5. Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    combined_features, data['fraudulent'],
    test_size=0.2, stratify=data['fraudulent'],
    random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 14304 samples
Test set: 3576 samples


In [7]:
# 6. Model Training
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import precision_recall_curve

# Calculate class weight based on data
pos_weight = (y_train.shape[0] - y_train.sum()) / y_train.sum()
print(f"Positive class weight: {pos_weight:.2f}")

# Base models with proper parameters
print("Training individual models...")
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    solver='liblinear',  # Better for imbalanced data
    n_jobs=-1
)

xgb_model = XGBClassifier(
    scale_pos_weight=pos_weight,
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=100,
    max_depth=5,
    n_jobs=-1
)

# Random Forest can be computationally expensive - skip if too slow
use_rf = True
try:
    rf_model = RandomForestClassifier(
        class_weight='balanced',
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        max_depth=10  # Limit depth for faster training
    )
except Exception as e:
    print(f"Error with Random Forest: {e}")
    use_rf = False

# Train base models
print("Training Logistic Regression...")
lr_model.fit(X_train, y_train)
print("Training XGBoost...")
xgb_model.fit(X_train, y_train)

if use_rf:
    print("Training Random Forest...")
    rf_model.fit(X_train, y_train)

# Stacked Ensemble
print("Creating ensemble model...")
if use_rf:
    ensemble_model = VotingClassifier(
        estimators=[
            ('lr', lr_model),
            ('xgb', xgb_model),
            ('rf', rf_model)
        ],
        voting='soft'
    )
else:
    ensemble_model = VotingClassifier(
        estimators=[
            ('lr', lr_model),
            ('xgb', xgb_model)
        ],
        voting='soft'
    )

# Train ensemble
print("Training ensemble model...")
ensemble_model.fit(X_train, y_train)

# Threshold optimization
print("Optimizing decision threshold...")
y_scores = ensemble_model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
optimal_idx = np.argmax(f1_scores[:-1])  # Skip last element (threshold=1.0)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold: {optimal_threshold:.4f}")

Positive class weight: 19.64
Training individual models...
Training Logistic Regression...




Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Random Forest...
Creating ensemble model...
Training ensemble model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Optimizing decision threshold...
Optimal threshold: 0.6680


In [8]:
# 7. Evaluation
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Function to print metrics as percentages
def print_metrics_as_percentage(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred) * 100
    precision = precision_score(y_true, y_pred) * 100
    recall = recall_score(y_true, y_pred) * 100
    f1 = f1_score(y_true, y_pred) * 100

    print(f"\n{model_name} Performance Metrics (%):")
    print(f"Accuracy:  {accuracy:.2f}%")
    print(f"Precision: {precision:.2f}%")
    print(f"Recall:    {recall:.2f}%")
    print(f"F1 Score:  {f1:.2f}%")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Predict with optimal threshold
y_pred_ensemble = (ensemble_model.predict_proba(X_test)[:, 1] >= optimal_threshold).astype(int)
y_pred_lr = lr_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test) if use_rf else None

# Evaluate all models
print("\n=== MODEL EVALUATIONS ===")
print("\nLogistic Regression Results:")
print(classification_report(y_test, y_pred_lr))
lr_metrics = print_metrics_as_percentage(y_test, y_pred_lr, "Logistic Regression")

print("\nXGBoost Results:")
print(classification_report(y_test, y_pred_xgb))
xgb_metrics = print_metrics_as_percentage(y_test, y_pred_xgb, "XGBoost")

if use_rf:
    print("\nRandom Forest Results:")
    print(classification_report(y_test, y_pred_rf))
    rf_metrics = print_metrics_as_percentage(y_test, y_pred_rf, "Random Forest")

print("\nStacked Ensemble Results with Optimized Threshold:")
print(classification_report(y_test, y_pred_ensemble))
ensemble_metrics = print_metrics_as_percentage(y_test, y_pred_ensemble, "Ensemble (optimized)")



=== MODEL EVALUATIONS ===

Logistic Regression Results:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      3403
           1       0.59      0.92      0.72       173

    accuracy                           0.97      3576
   macro avg       0.79      0.95      0.85      3576
weighted avg       0.98      0.97      0.97      3576


Logistic Regression Performance Metrics (%):
Accuracy:  96.53%
Precision: 59.04%
Recall:    92.49%
F1 Score:  72.07%

XGBoost Results:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3403
           1       0.74      0.83      0.78       173

    accuracy                           0.98      3576
   macro avg       0.86      0.91      0.88      3576
weighted avg       0.98      0.98      0.98      3576


XGBoost Performance Metrics (%):
Accuracy:  97.73%
Precision: 73.71%
Recall:    82.66%
F1 Score:  77.93%

Random Forest Results:
              precision