In [7]:
import pandas as pd
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# 📌 **Load Dataset**
df = pd.read_csv("/workspaces/Project/Data_Job_Postings.csv")  # Replace with actual path

# 📌 **Preprocessing**
# errors='ignore' makes the drop a no-op when the column is absent.
df = df.drop(columns=['Posted Date'], errors='ignore')

categorical_cols = ['Description', 'Experience', 'Salary']
numeric_cols = ['Rating', 'Reviews']

# Fill gaps with "Unknown" everywhere EXCEPT the numeric features: a string
# placeholder in 'Rating'/'Reviews' would turn them into object columns that
# StandardScaler cannot convert to float.
placeholder_cols = [name for name in df.columns if name not in numeric_cols]
df[placeholder_cols] = df[placeholder_cols].fillna("Unknown")

# Numeric features: coerce unparseable entries to NaN, then impute the column median.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col] = df[col].fillna(df[col].median())

# 📌 **Encode Categorical Columns**
# One LabelEncoder per column, kept in a dict keyed by column name so the same
# mapping can be reused later.
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = df[col].astype(str)
    unique_classes = list(df[col].unique()) + ["Unknown"]  # 🔹 Add "Unknown" for unseen values
    le.fit(unique_classes)
    df[col] = le.transform(df[col])
    label_encoders[col] = le

# 📌 **Define Features & Target**
feature_columns = ['Rating', 'Reviews', 'Description', 'Experience', 'Salary']
X = df[feature_columns]
y = df['Fradulent']

# 📌 **Split Dataset**
# 80/20 hold-out with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 📌 **Standardize Features**
# Statistics are learned from the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 📌 **Define Traditional Models**
# These instances are intentionally NOT fitted here. VotingClassifier.fit() clones each
# estimator and trains the clones, so fitting the originals first would only repeat the
# work (notably the SVC with probability=True) without affecting the ensemble.
# After `ensemble_model.fit(...)` the trained copies are available through
# `ensemble_model.named_estimators_`.
nb_model = GaussianNB()
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
svm_model = SVC(probability=True, kernel='linear', random_state=42)

# 📌 **Define Advanced RNN Model**
# Seed Python, NumPy and the Keras backend before any weights are created, matching the
# random_state=42 used by the scikit-learn models in this cell.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if exact
# run-to-run reproducibility is required.
keras.utils.set_random_seed(42)

# Each sample is presented to the LSTMs as a sequence with one step per feature column
# and a single channel per step.
rnn_model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1], 1)),  # Input shape for RNN
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 📌 **Reshape Data for RNN**
# (samples, features) -> (samples, features, 1) to match the Input layer above.
X_train_rnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_rnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# 📌 **Train RNN**
# NOTE(review): the held-out test split doubles as validation data here, so the val_*
# metrics are not an independent estimate if they are ever used for model selection.
rnn_model.fit(X_train_rnn, y_train, epochs=15, batch_size=32, verbose=1, validation_data=(X_test_rnn, y_test))

# 📌 **Ensemble Model**
# Soft voting over the three classical models.
base_estimators = [
    ('naive_bayes', nb_model),
    ('random_forest', rf_model),
    ('svm', svm_model),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')

ensemble_model.fit(X_train_scaled, y_train)

# 📌 **Save Models**
# The joblib-serialisable artifacts are written in one pass; the Keras model uses its
# own save method.
for artifact, filename in (
    (ensemble_model, "ensemble_naukri.pkl"),
    (scaler, "scaler.pkl"),
    (label_encoders, "label_encoders.pkl"),
):
    joblib.dump(artifact, filename)
rnn_model.save("rnn_naukri.h5")

print("🚀 Model training complete and saved!")


Epoch 1/15
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 20ms/step - accuracy: 0.9076 - loss: 0.3027 - val_accuracy: 0.9265 - val_loss: 0.2276
Epoch 2/15
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - accuracy: 0.9323 - loss: 0.2021 - val_accuracy: 0.9405 - val_loss: 0.1596
Epoch 3/15
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.9443 - loss: 0.1460 - val_accuracy: 0.9444 - val_loss: 0.1298
Epoch 4/15
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.9419 - loss: 0.1436 - val_accuracy: 0.9444 - val_loss: 0.1265
Epoch 5/15
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - accuracy: 0.9483 - loss: 0.1247 - val_accuracy: 0.9481 - val_loss: 0.1153
Epoch 6/15
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.9489 - loss: 0.1209 - val_accuracy: 0.9553 - val_loss: 0.1151
Epoch 7/15
[1m473/



🚀 Model training complete and saved!


In [9]:
import numpy as np
import pandas as pd
import joblib
from tensorflow.keras.models import load_model

# 📌 **Load Encoders, Scaler & Models**
# The three joblib artifacts are restored together; the Keras model has its own loader.
label_encoders, scaler, ensemble_model = (
    joblib.load(path)
    for path in ("label_encoders.pkl", "scaler.pkl", "ensemble_naukri.pkl")
)
rnn_model = load_model("rnn_naukri.h5")

# 📌 **Test Input: New Job Posting**
# A single-row frame built from one record.
test_input = pd.DataFrame([
    {
        "Rating": 2,
        "Reviews": 250,
        "Description": "Senior Data Scientist",  # 🔹 New category
        "Experience": "5 years",
        "Salary": "$120,000",
    }
])

# 📌 **Encode Categorical Data (Handle Unseen Labels)**
# Every value is checked on its own, so this works for frames with any number of rows:
# labels present in the fitted encoder keep their code, anything else is reported once
# and mapped to the encoder's "Unknown" class.
for col in ["Description", "Experience", "Salary"]:
    encoder = label_encoders[col]
    values = test_input[col].astype(str)
    is_known = values.isin(encoder.classes_)
    for value in values[~is_known].unique():
        print(f"⚠️ New category '{value}' detected in '{col}'. Using 'Unknown'.")
    # Series.where keeps entries where the mask is True and substitutes the rest.
    test_input[col] = encoder.transform(values.where(is_known, "Unknown"))

# 📌 **Standardize Features**
test_input_scaled = scaler.transform(test_input)

# 📌 **Make Predictions (Traditional Models)**
ensemble_prediction = ensemble_model.predict(test_input_scaled)
ensemble_label = "Fake" if ensemble_prediction[0] == 1 else "Genuine"

# 📌 **Reshape for RNN**
# (1, features) -> (1, features, 1)
test_input_rnn = test_input_scaled.reshape(1, test_input_scaled.shape[1], 1)

# 📌 **Make Predictions (RNN Model)**
rnn_prediction = rnn_model.predict(test_input_rnn)
# The single sigmoid output is thresholded at 0.5, with values above it labelled "Fake".
fake_probability = float(rnn_prediction[0][0])
rnn_label = "Fake" if fake_probability > 0.5 else "Genuine"
# Report the probability of the label actually shown; printing the raw output would
# display a near-zero "confidence" next to a confident "Genuine" verdict.
rnn_confidence = fake_probability if rnn_label == "Fake" else 1.0 - fake_probability

# 📌 **Display Results**
print(f"🔹 Ensemble Model Prediction: {ensemble_label}")
print(f"🔹 RNN Model Prediction: {rnn_label} (Confidence: {rnn_confidence:.4f})")




⚠️ New category 'Senior Data Scientist' detected in 'Description'. Using 'Unknown'.
⚠️ New category '5 years' detected in 'Experience'. Using 'Unknown'.
⚠️ New category '$120,000' detected in 'Salary'. Using 'Unknown'.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 452ms/step
🔹 Ensemble Model Prediction: Fake
🔹 RNN Model Prediction: Fake (Confidence: 1.0000)
