In [3]:
#train_model.py: Script to train the AI JobShield model
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Read the job-postings dataset and replace missing values in the free-text
# columns with empty strings so they can be concatenated below.
df = pd.read_csv('../data/fake_job_postings.csv')
text_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']
df[text_cols] = df[text_cols].fillna('')

# Build one space-separated document per row by appending the text columns
# in the order listed in text_cols.
combined = df[text_cols[0]]
for col in text_cols[1:]:
    combined = combined + ' ' + df[col]
df['text'] = combined

# Features are the combined text; the target is the 'fraudulent' column.
X = df['text']
y = df['fraudulent']

# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF features (English stop words removed, terms appearing in more than
# 90% of documents ignored) feeding a class-weighted logistic regression.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
classifier = LogisticRegression(max_iter=1000, class_weight='balanced')
pipeline = Pipeline(steps=[('tfidf', vectorizer), ('clf', classifier)])
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.3f" % accuracy)
print(classification_report(y_test, y_pred))

# Persist the whole fitted pipeline (vectorizer + classifier) to disk.
# Kept as the final bare expression so a notebook cell echoes the saved path.
joblib.dump(pipeline, '../models/job_posting_model.pkl')


Accuracy: 0.978
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3403
           1       0.73      0.89      0.80       173

    accuracy                           0.98      3576
   macro avg       0.86      0.94      0.89      3576
weighted avg       0.98      0.98      0.98      3576



['../models/job_posting_model.pkl']