In [62]:
# Create an empty df_new_data DataFrame for storing the user input history

#df_new_data = pd.DataFrame(columns=['role', 'location', 'job_description'])

In [66]:
# Download the lexicon
import nltk
from nltk.corpus import opinion_lexicon

# Make sure the opinion lexicon corpus is installed before it is read;
# without it, opinion_lexicon.positive() raises LookupError on a fresh
# environment. nltk.download is a no-op when the corpus is already
# installed and up to date.
nltk.download('opinion_lexicon', quiet=True)

# Get positive and negative words from the lexicon.
# Stored as sets so each word appears once.
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

In [75]:
import pickle
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from concurrent.futures import ProcessPoolExecutor
from nltk.tokenize import word_tokenize
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Greet the user before any prompts appear
print("Welcome aboard! Let's navigate through job postings and identify the authentic ones.\nInput the job information to begin!\n")


# Restore the serialized bundle (model + fitted preprocessing objects)
# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# only load bundles that come from a trusted source.
Pkl_Filename = "/kaggle/input/pkl-file-model-with-preprocesssing/Fake_Job_Postings_Detection_with_preprocessing.pkl"
with open(Pkl_Filename, 'rb') as file:
    loaded_model_data = pickle.load(file)

# Pull each artefact out of the bundle by its key
(
    loaded_ensemble_model,
    tfidf_vectorizer,
    scaler,
    min_max_scaler,
    numeric_columns,
    optimal_threshold_new_data,
) = (
    loaded_model_data[bundle_key]
    for bundle_key in (
        'model',
        'tfidf_vectorizer',
        'scaler',
        'min_max_scaler',
        'numeric_columns',
        'optimal_threshold_new_data',
    )
)

# Reuse the saved history of entries if it exists; otherwise start an empty one
try:
    df_new_data = pd.read_csv("df_new_data.csv")
except FileNotFoundError:
    df_new_data = pd.DataFrame(
        columns=[
            'role',
            'location',
            'job_description',
            'combined_text',
            'fraudulent',
            'positive_score',
            'negative_score',
        ]
    )

# Prompt for the three free-text fields that describe one job posting
user_role = input("Enter the role: ")
user_location = input("Enter the location: ")
user_job_description = input("Enter the job description: ")

# Build a one-row frame from the answers; any missing value becomes ''
new_entry = pd.DataFrame(
    [[user_role, user_location, user_job_description]],
    columns=['role', 'location', 'job_description'],
).fillna('')

# Join role, location and description with single spaces into one text field
new_entry['combined_text'] = new_entry['role'].str.cat(
    [new_entry['location'], new_entry['job_description']],
    sep=' ',
)

# Lexicon-based sentiment counts for a single row
def calculate_sentiment_scores(row):
    """Count lexicon hits in a row's combined text.

    Tokenizes ``row['combined_text']`` with ``word_tokenize`` and counts how
    many tokens appear in ``positive_words`` and in ``negative_words``.
    Tokens are compared as produced by the tokenizer (no case folding), and
    a token that occurs several times is counted each time.

    Parameters
    ----------
    row : mapping with a 'combined_text' entry (e.g. a DataFrame row).

    Returns
    -------
    pd.Series with entries 'positive_score' and 'negative_score'.
    """
    tokens = word_tokenize(row['combined_text'])
    lexicons = (
        ('positive_score', positive_words),
        ('negative_score', negative_words),
    )
    # np.isin needs a sequence, so each lexicon set is turned into a list first
    hit_counts = {
        label: np.isin(tokens, list(vocabulary)).sum()
        for label, vocabulary in lexicons
    }
    return pd.Series(hit_counts)

# Apply sentiment analysis to the new entry
new_entry[['positive_score', 'negative_score']] = new_entry.apply(calculate_sentiment_scores, axis=1)

# Length-based features of the combined text: characters, whitespace-separated
# words, and sentences as split by nltk.sent_tokenize
new_entry['desc_num_char'] = new_entry['combined_text'].apply(len)
new_entry['desc_num_words'] = new_entry['combined_text'].apply(lambda x: len(x.split()))
new_entry['desc_num_sent'] = new_entry['combined_text'].apply(lambda x: len(nltk.sent_tokenize(x)))

# Numeric feature block. It is cast to float up front because the scaled values
# written back below are floats; writing floats into integer columns is
# deprecated in recent pandas releases and is slated to become an error.
numeric_feature_names = ['positive_score', 'negative_score', 'desc_num_char', 'desc_num_words', 'desc_num_sent']
X_numeric_new_entry = new_entry[numeric_feature_names].astype(float)

# Combine numeric features (first) with the TF-IDF vector of the text (after)
X_text_new_entry_tfidf = tfidf_vectorizer.transform(new_entry['combined_text'])
X_new_entry = pd.concat([X_numeric_new_entry, pd.DataFrame(X_text_new_entry_tfidf.toarray())], axis=1)

# Standardize the five numeric features. They are addressed by name rather
# than by the positional slice [:5], so the step does not depend on column order.
X_new_entry[numeric_feature_names] = scaler.transform(X_new_entry[numeric_feature_names])

# Apply Min-Max scaling to the columns recorded in the model bundle
# NOTE(review): numeric_columns is presumably the same five numeric features - confirm against the training notebook
X_new_entry[numeric_columns] = min_max_scaler.transform(X_new_entry[numeric_columns])

# TF-IDF columns are labelled with integers; make every column name a string
X_new_entry.columns = X_new_entry.columns.astype(str)

# Predict probabilities for the new entry using the loaded ensemble model.
# Column 1 of predict_proba is used; label 1 is reported as "Fake" below.
y_pred_prob_new_entry = loaded_ensemble_model.predict_proba(X_new_entry)[:, 1]

# Apply the threshold stored with the model: strictly above it -> 1, otherwise 0
y_pred_new_entry = (y_pred_prob_new_entry > optimal_threshold_new_data).astype(int)

# Record the prediction on the entry. The sentiment scores are already stored
# on new_entry from the feature step, so they are not recomputed here.
new_entry['fraudulent'] = y_pred_new_entry

# Append the new entry to the history.
# Concatenating with an empty frame is deprecated in recent pandas, so the
# first entry is handled explicitly while keeping the same column order
# (history columns first, then any extra columns of the new entry).
if df_new_data.empty:
    history_columns = list(df_new_data.columns)
    extra_columns = [col for col in new_entry.columns if col not in history_columns]
    df_new_data = new_entry.reindex(columns=history_columns + extra_columns)
else:
    df_new_data = pd.concat([df_new_data, new_entry], ignore_index=True)

# Save df_new_data to a CSV file so the next run can pick it up
df_new_data.to_csv("df_new_data.csv", index=False)

# Show the raw prediction array, then a readable verdict
print("\nPredictions for the New Entry:")
print(y_pred_new_entry)
if y_pred_new_entry[0] == 1:
    print("Entered Job posting is Fake")
else:
    print("Entered Job posting is Real")

Welcome aboard! Let's navigate through job postings and identify the authentic ones.
Input the job information to begin!



Enter the role:   Administrative Assistant
Enter the location:  US, CA, San Francisco
Enter the job description:  The Administrative Assistant will be based in San Francisco, CA. The right candidate will be an inte...



Predictions for the New Entry:
[0]
Entered Job posting is Real


In [76]:
# Display the accumulated history of entries, including the prediction stored in 'fraudulent'
df_new_data

Unnamed: 0,role,location,job_description,combined_text,fraudulent,positive_score,negative_score,desc_num_char,desc_num_words,desc_num_sent
0,d,d,d,d d d,0,0,0,5.0,3.0,1.0
1,WAH Customer Service Repersentative,"US, SC, Columbia",ECHO HEIGHT LLC (WORK AT HOME) is now exceptin...,"WAH Customer Service Repersentative US, SC, Co...",1,1,0,156.0,23.0,1.0
2,Administrative Assistant,"US, CA, San Francisco",The Administrative Assistant will be based in ...,"Administrative Assistant US, CA, San Francisc...",0,1,0,151.0,23.0,2.0
