# In [None]:

import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# Read the job-postings CSV from the Resources folder. No cleaning happens
# here, so the file is expected to be usable as-is.
data_path = os.path.join("Resources", "fake_job_postings.csv")
df = pd.read_csv(data_path)

# Build one free-text field per posting: missing values become empty strings
# and the listed columns are joined, in order, with single spaces.
text_columns = [
    'title',
    'company_profile',
    'description',
    'requirements',
    'benefits',
]
filled_text = df[text_columns].fillna('')
df['text'] = filled_text.agg(' '.join, axis=1)

# Model input is the combined text; the label is the 'fraudulent' column.
X, y = df['text'], df['fraudulent']

# Hold out 20% of the rows for testing, stratified on the label so both
# partitions keep the same class proportions; the seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Model selection: TF-IDF features + random forest, tuned with GridSearchCV.
#
# The vectorizer is a step of the Pipeline so that GridSearchCV refits it on
# the training portion of every CV fold. Fitting TF-IDF on all of X_train
# ahead of the search would let each validation fold contribute to the
# vocabulary and IDF weights, biasing the scores used to choose
# hyper-parameters.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('clf', RandomForestClassifier(random_state=42)),
])

# The 'clf__' prefix routes each setting to the classifier step.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
}


def _strip_step_prefix(params):
    """Return a copy of *params* with the '<step>__' prefix dropped from each key."""
    return {name.split('__', 1)[-1]: value for name, value in params.items()}


grid_search = GridSearchCV(text_clf, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=1)
# The search receives raw text; vectorization happens inside each fold.
grid_search.fit(X_train, y_train)

# Save optimization results to CSV. Parameter names are written without the
# pipeline step prefix (plain 'n_estimators' / 'max_depth' keys).
cv_results = grid_search.cv_results_
results_df = pd.DataFrame({
    'params': [_strip_step_prefix(p) for p in cv_results['params']],
    'mean_test_score': cv_results['mean_test_score'],
    'std_test_score': cv_results['std_test_score'],
})
results_df.to_csv("model_optimization_results.csv", index=False)

# Use best model. GridSearchCV refits the winning pipeline on all of X_train
# (refit defaults to True), so its steps are the fitted vectorizer and the
# fitted classifier.
best_pipeline = grid_search.best_estimator_
vectorizer = best_pipeline.named_steps['tfidf']
best_model = best_pipeline.named_steps['clf']

# Vectorized matrices produced by the refit vectorizer, i.e. the same
# feature space best_model was trained on.
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)
y_pred = best_model.predict(X_test_vec)

# Evaluate best model on the held-out test set.
# NOTE: grid_search.best_params_ itself uses 'clf__'-prefixed keys; the
# prefix is stripped only for display.
print("Best Parameters:", _strip_step_prefix(grid_search.best_params_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Rank the TF-IDF terms by the importance the fitted model assigns them and
# keep the ten highest, ordered from most to least important.
importances = best_model.feature_importances_
feature_names = vectorizer.get_feature_names_out()
top_indices = importances.argsort()[::-1][:10]

# Values and labels for the chart, in ranking order.
bar_positions = range(len(top_indices))
top_scores = importances[top_indices]
top_labels = [feature_names[idx] for idx in top_indices]

# Horizontal bar chart; the y-axis is flipped so the most important term
# is drawn at the top.
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_scores, align='center')
plt.yticks(bar_positions, top_labels)
plt.xlabel("Feature Importance")
plt.title("Top 10 Important Features")
axes = plt.gca()
axes.invert_yaxis()
plt.tight_layout()
plt.show()
