In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Read the job-posting CSV into a DataFrame.
# Point DATA_PATH at the file's actual location before running.
DATA_PATH = "Data_Job_Posting.csv"
df = pd.read_csv(DATA_PATH)



In [None]:
# Keep only the columns used for modelling; anything else in the CSV
# ("Posted Date" was excluded by request) is left out. The column names,
# including the 'Fradulent' spelling, must match the CSV header exactly or
# the selection raises KeyError.
# The explicit .copy() makes the result an independent frame, so the column
# assignments performed further down never write into an object that pandas
# may still regard as a slice of the loaded data.
df = df[['Job Title', 'Company', 'Rating', 'Reviews', 'Experience', 'Salary', 'Location', 'Description', 'Skills', 'Fradulent']].copy()

# Fill missing values with a per-column default. Only the four columns listed
# here are filled; the frame is rebound instead of mutated in place.
df = df.fillna({'Rating': 0, 'Reviews': 0, 'Experience': '0-1 yrs', 'Salary': 'Not Disclosed'})

# Integer-encode each text column with its own LabelEncoder. The fitted
# encoders are kept in `label_encoders`, keyed by column name, so the same
# value -> integer mapping can be reused later.
# NOTE(review): the encoders are fitted on the whole frame, before the
# train/test split, and LabelEncoder.transform raises on labels it has not
# seen. Values that first appear at prediction time (likely for free-text
# columns such as 'Description') will need explicit handling.
categorical_cols = ['Job Title', 'Company', 'Experience', 'Salary', 'Location', 'Description', 'Skills']
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for col, le in label_encoders.items():
    # Cast to str first so every value, including NaN (which becomes 'nan'),
    # is encoded as a label.
    df[col] = le.fit_transform(df[col].astype(str))

# Separate the feature matrix from the 'Fradulent' target column.
X = df.drop(columns=['Fradulent'])  # Features
y = df['Fradulent']  # Target variable

# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# reproducible. The split is stratified on the target so the train and test
# partitions keep the same class proportions; a plain shuffle can leave the
# test set with very few (or no) rows of a rare class.
# Stratification needs at least two rows per class, so fall back to the
# unstratified split when that does not hold instead of raising.
stratify_on = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_on
)

# Standardise the features. The scaler learns its statistics from the
# training rows only, and that same fitted transform is then applied to both
# partitions so no information from the test rows reaches the fit.
# NOTE(review): tree ensembles do not depend on feature scale; the step is
# kept because the fitted scaler is saved to disk together with the model.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 100-tree random forest (fixed seed) on the training partition, then
# predict labels for the held-out rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Score the held-out predictions: overall accuracy first, then the per-class
# precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

# Persist everything needed to reproduce predictions later: the trained
# forest, the fitted scaler, and the per-column label encoders. Files are
# written to the current working directory, in the order listed.
artifacts = {
    "random_forest_naukri.pkl": rf_model,
    "scaler.pkl": scaler,
    "label_encoders.pkl": label_encoders,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
