In [3]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

# --- Data preparation: load, clean, encode, split, scale ---------------------
# Column roles. The feature matrix is built as numeric columns followed by
# categorical columns; the scaler and the model are fitted on that order.
numeric_cols = ['Rating', 'Reviews']
categorical_cols = ['Description', 'Experience', 'Salary']

# Load Dataset (Replace with actual file path)
df = (
    pd.read_csv("/workspaces/Project/Data_Job_Postings.csv")
    # Drop 'Posted Date' if it exists; errors='ignore' makes this a no-op otherwise.
    .drop(columns=['Posted Date'], errors='ignore')
    # A row without a label cannot be used for supervised training. Filling the
    # target with a placeholder string would silently create an extra class.
    .dropna(subset=['Fradulent'])
    .copy()
)

# Encode categorical columns.
# Missing values are replaced by the literal "Unknown" in the categorical columns
# ONLY. Applying that fill to the whole frame would inject a string into the
# numeric columns, which StandardScaler cannot convert to float.
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].fillna("Unknown").astype(str))
    label_encoders[col] = le  # Store encoders for future use

# Features and Target
X = df[numeric_cols + categorical_cols]
y = df['Fradulent']  # Target column (1 = Fake, 0 = Genuine)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing numeric values with medians computed on the training split, so
# no statistic from the held-out rows is used. Passing a Series to fillna fills
# each column with the value whose index label matches the column name; the
# (already encoded) categorical columns are not touched.
numeric_medians = X_train[numeric_cols].median()
X_train = X_train.fillna(numeric_medians)
X_test = X_test.fillna(numeric_medians)

# Standardize data: fit on the training split only, then apply to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base learners: a Gaussian naive Bayes model and a 100-tree random forest
# (seeded so repeated runs build the same forest).
nb_model = GaussianNB()
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Soft-voting ensemble over the two base learners, fitted on the scaled
# training features. fit() returns the estimator itself, so the fitted
# ensemble is bound directly.
base_estimators = [('naive_bayes', nb_model), ('random_forest', rf_model)]
ensemble_model = VotingClassifier(
    estimators=base_estimators,
    voting='soft',
).fit(X_train_scaled, y_train)

# Predict on the scaled held-out features and report plain accuracy.
# NOTE(review): accuracy alone can look perfect when one class dominates the
# labels — confirm the class balance of 'Fradulent' before trusting this number.
y_pred = ensemble_model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"✅ Ensemble Model Accuracy: {accuracy:.2f}")

# Persist everything the inference cell needs: the fitted ensemble, the fitted
# scaler, and the per-column label encoders. Files are written to the current
# working directory, in this order.
artifacts = {
    "ensemble_naukri.pkl": ensemble_model,
    "scaler.pkl": scaler,
    "label_encoders.pkl": label_encoders,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("🚀 Model training complete and saved!")


✅ Ensemble Model Accuracy: 1.00
🚀 Model training complete and saved!


In [6]:
import pandas as pd
import joblib

# Load the saved model, scaler, and encoders.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code —
# only load artifact files that come from a trusted source.
ensemble_model = joblib.load("ensemble_naukri.pkl")
scaler = joblib.load("scaler.pkl")
label_encoders = joblib.load("label_encoders.pkl")

# Column order the scaler is applied to; kept explicit so that the order of keys
# in the input records cannot silently permute the features.
feature_order = ['Rating', 'Reviews', 'Description', 'Experience', 'Salary']

# Sample New Job Posting Data (Modify to test different cases)
new_job = pd.DataFrame([{
    'Rating': 4,  # Moderate rating
    'Reviews': 900,  # Moderate reviews
    'Description': 'Work from home opportunity for sales executives...',
    'Experience': '1-3 yrs',
    'Salary': '5-7 LPA'
}])

# Encode categorical columns row by row.
# Each value is normalised the same way the encoders were fitted (missing ->
# "Unknown", then cast to str) and looked up in the encoder's classes_. A
# LabelEncoder code is the position of the label in classes_, so the lookup
# table reproduces transform() for known labels; labels the encoder has never
# seen map to -1. Doing this per row (instead of checking only the first row)
# keeps the cell correct when new_job holds several postings.
for col in ['Description', 'Experience', 'Salary']:
    label_to_code = {label: code for code, label in enumerate(label_encoders[col].classes_)}
    new_job[col] = (
        new_job[col]
        .fillna("Unknown")
        .astype(str)
        .map(label_to_code)
        .fillna(-1)  # Handle unseen labels
        .astype(int)
    )

# Scale Features
new_job_scaled = scaler.transform(new_job[feature_order])

# Predict (the message reports the first row of new_job)
prediction = ensemble_model.predict(new_job_scaled)
print("🔍 Prediction:", "❌ Fake Job" if prediction[0] == 1 else "✅ Genuine Job")


🔍 Prediction: ✅ Genuine Job
