In [1]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
def preprocess_data(df):
    df = df.drop(columns=['salary_range', 'department'], errors='ignore')
    text_fields = ['description', 'company_profile', 'requirements', 'benefits']
    for field in text_fields:
        df[field] = df[field].fillna('')
    for field in text_fields:
        df[field] = df[field].str.lower()
        df[field] = df[field].str.replace(r'[^\w\s]', '', regex=True)
    
    print("Data preprocessing completed.")
    return df

In [5]:
def build_pipeline():
    # Text vectorization with TF-IDF
    text_transformer = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(max_features=1000))
    ])
    
    # Preprocessing pipeline for columns
    preprocessor = ColumnTransformer(transformers=[
        ('description', text_transformer, 'description'),
        ('company_profile', text_transformer, 'company_profile'),
        ('requirements', text_transformer, 'requirements')
    ])
    
    # Full pipeline with a Random Forest classifier
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    print("Pipeline created.")
    return pipeline

In [7]:
def train_and_save_model(data_path, model_path='fake_job_model.pkl'):
    # Load dataset
    data = pd.read_csv(data_path)
    print("Dataset loaded.")
    
    # Preprocess dataset
    data = preprocess_data(data)
    
    # Features (X) and target (y)
    X = data[['description', 'company_profile', 'requirements']]
    y = data['fraudulent']
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Data split into training and testing sets.")
    
    # Build pipeline
    pipeline = build_pipeline()
    
    # Train the pipeline
    pipeline.fit(X_train, y_train)
    print("Model training completed.")
    
    # Evaluate the model
    y_pred = pipeline.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    
    # Save the trained pipeline
    joblib.dump(pipeline, model_path)
    print(f"Model saved successfully at {model_path}")

In [10]:
if __name__ == "__main__":
    train_and_save_model(r"C:\Users\diya\fake_job_postings.csv", 'fake_job_model.pkl')

Dataset loaded.
Data preprocessing completed.
Data split into training and testing sets.
Pipeline created.
Model training completed.
Accuracy: 0.979586129753915
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3395
           1       0.97      0.61      0.75       181

    accuracy                           0.98      3576
   macro avg       0.98      0.81      0.87      3576
weighted avg       0.98      0.98      0.98      3576

Model saved successfully at fake_job_model.pkl
