In [2]:
import pandas as pd
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load Dataset (Replace with actual file path)
df = pd.read_csv("/workspaces/Project/Data_Job_Postings.csv")

# Drop 'Posted Date' if exists
df.drop(columns=['Posted Date'], inplace=True, errors='ignore')

# Handle missing values
df.fillna("Unknown", inplace=True)

# Encode categorical columns
categorical_cols = ['Description', 'Experience', 'Salary']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Store encoders for future use

# Features and Target
X = df[['Rating', 'Reviews', 'Description', 'Experience', 'Salary']]
y = df['Fradulent']  # Target column (1 = Fake, 0 = Genuine)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Models
nb_model = GaussianNB()
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
svm_model = SVC(probability=True, kernel='linear', random_state=42)

# Train traditional models
nb_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train_scaled, y_train)
svm_model.fit(X_train_scaled, y_train)

# Define RNN Model
rnn_model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1], 1)),
    layers.SimpleRNN(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile RNN model
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Reshape input for RNN
X_train_rnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_rnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Train RNN model
rnn_model.fit(X_train_rnn, y_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_test_rnn, y_test))

# Ensemble Model (Voting Classifier for traditional models)
ensemble_model = VotingClassifier(estimators=[
    ('naive_bayes', nb_model),
    ('random_forest', rf_model),
    ('svm', svm_model)
], voting='soft')

ensemble_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_ensemble = ensemble_model.predict(X_test_scaled)
y_pred_rnn = (rnn_model.predict(X_test_rnn) > 0.5).astype(int).flatten()

# Final prediction (averaging ensemble and RNN)
y_pred_final = np.round((y_pred_ensemble + y_pred_rnn) / 2)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred_final)
print(f"✅ Final Model Accuracy: {accuracy:.2f}")

# Save Models & Scaler
joblib.dump(ensemble_model, "ensemble_naukri.pkl")
rnn_model.save("rnn_naukri.h5")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")

print("🚀 Model training complete and saved!")


2025-03-01 16:36:09.332448: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-01 16:36:09.431362: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-01 16:36:09.489106: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740846969.561161   23677 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740846969.583218   23677 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-01 16:36:09.916213: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

Epoch 1/10


2025-03-01 16:36:19.178474: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8689 - loss: 0.3217 - val_accuracy: 0.9444 - val_loss: 0.1254
Epoch 2/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9454 - loss: 0.1269 - val_accuracy: 0.9508 - val_loss: 0.1020
Epoch 3/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9495 - loss: 0.1129 - val_accuracy: 0.9397 - val_loss: 0.1302
Epoch 4/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9496 - loss: 0.1162 - val_accuracy: 0.9619 - val_loss: 0.0780
Epoch 5/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9576 - loss: 0.0953 - val_accuracy: 0.9431 - val_loss: 0.1142
Epoch 6/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9581 - loss: 0.0945 - val_accuracy: 0.9201 - val_loss: 0.1502
Epoch 7/10
[1m473/473[0m [32m━━━━━━━



✅ Final Model Accuracy: 0.98
🚀 Model training complete and saved!


In [4]:
import pandas as pd
import joblib
import numpy as np
import tensorflow as tf

# Load saved models and utilities
ensemble_model = joblib.load("ensemble_naukri.pkl")
rnn_model = tf.keras.models.load_model("rnn_naukri.h5")
scaler = joblib.load("scaler.pkl")
label_encoders = joblib.load("label_encoders.pkl")

# Sample New Job Posting Data
new_job = pd.DataFrame([{
    'Rating': 3,
    'Reviews': 500,
    'Description': 'Hiring experienced data scientists...',
    'Experience': '3-5 yrs',
    'Salary': '12-18 LPA'
}])

# Encode categorical columns
for col in ['Description', 'Experience', 'Salary']:
    if new_job[col].iloc[0] in label_encoders[col].classes_:
        new_job[col] = label_encoders[col].transform(new_job[col].astype(str))
    else:
        new_job[col] = -1  # Handle unseen labels

# Scale Features
new_job_scaled = scaler.transform(new_job)

# Reshape for RNN
new_job_rnn = new_job_scaled.reshape(new_job_scaled.shape[0], new_job_scaled.shape[1], 1)

# Predictions
pred_ensemble = ensemble_model.predict(new_job_scaled)
pred_rnn = (rnn_model.predict(new_job_rnn) > 0.5).astype(int).flatten()

# Final prediction (averaging ensemble and RNN)
final_prediction = np.round((pred_ensemble + pred_rnn) / 2)

print("🔍 Prediction:", "❌ Fake Job" if final_prediction[0] == 1 else "✅ Genuine Job")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
🔍 Prediction: ❌ Fake Job
