In [None]:

# Job Posting Prediction (Real or Fake)

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

# Read the job-postings CSV into a single DataFrame that the rest of the
# script works from.
df = pd.read_csv('fake_job_postings.csv')

# Show, per column, how many entries are null.
print(df.isna().sum())

# Pairwise correlation between a hand-picked set of columns, drawn as an
# annotated heatmap.
numericpostings = [
    'job_id',
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'fraudulent',
]
# Work on an independent copy so the original frame is never touched.
new_df = df.loc[:, numericpostings].copy()
corr = new_df.corr()
sns.heatmap(data=corr, annot=True)  # annot=True prints each coefficient in its cell
plt.show()

# Derive a two-letter state code from the free-text `location` column and
# chart how many fraudulent postings fall in each state.
#
# The pattern captures exactly two uppercase letters that sit between two
# commas (presumably locations look like "US, NY, New York" -- TODO confirm
# against the dataset). Rows that do not match the pattern get NaN.
# A raw string is used so that `\s` reaches the regex engine verbatim instead
# of being treated as an (invalid) string escape, and expand=False makes
# `extract` return a Series, which is the natural shape for a single column.
df['state'] = df['location'].str.extract(r',\s*([A-Z]{2}),', expand=False)

# Keep only the postings flagged as fraudulent.
fraudulent_jobs = df[df['fraudulent'] == 1]

# value_counts() skips NaN states and already returns the counts in
# descending order, so no additional sort is needed before plotting.
state_fraud_count = fraudulent_jobs['state'].value_counts()

# Plot the results
state_fraud_count.plot(kind='bar', figsize=(14, 6), color='red')
plt.title('Fake Job Postings by State')
plt.xlabel('State')
plt.ylabel('Number of Fake Job Postings')
plt.xticks(rotation=45)
plt.show()
