# Fraudulent Job Detection System

In [None]:
# Load the training dataset and clean it for modelling.
import pandas as pd
import numpy as np

train_df = pd.read_csv("train_dataset.csv")   # 14304 rows × 18 columns

# `department` and `salary_range` were found to hold most of the null values
# and `job_id` is meaningless as a feature, so all three are dropped.
# (To inspect null counts, run train_df.isnull().describe() as the last
# expression of its own cell - mid-cell expressions are not displayed.)
train_df = train_df.drop(columns=['job_id', 'department', 'salary_range'])

# Nulls in the free-text columns become a single space.
col = ['title','location','company_profile','description','requirements','benefits']
train_df[col] = train_df[col].fillna(" ")

# Every remaining null becomes the literal "unknown".
# No `axis=1` here: with a scalar fill value pandas serves axis=1 by
# transposing the frame, filling, and transposing back, which leaves every
# column of a mixed-dtype frame as object dtype. The default axis fills the
# same cells while keeping the dtype of columns that had no nulls.
train_df = train_df.fillna("unknown")
train_df['fraudulent'] = train_df['fraudulent'].astype(int)

# The frame now has no null values left.


In [None]:
# Build one combined "text" column out of the individual free-text fields.
text_parts = [
    "title",
    "location",
    "company_profile",
    "description",
    "requirements",
    "benefits",
]

# Fold the columns left to right with a single space between consecutive
# fields; nulls are treated as empty strings.
combined = train_df[text_parts[0]].fillna('')
for part in text_parts[1:]:
    combined = combined + " " + train_df[part].fillna('')

train_df["text"] = combined
# train_df['text'].nunique()  to check uniqueness of text   &  train_df['text'].iloc[0]


In [None]:
# Turn the combined text into a numeric TF-IDF feature matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are removed and the vocabulary is capped at 10000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)

# Learn the vocabulary from the text column and vectorize it in one pass.
X = vectorizer.fit_transform(train_df['text'])
# Inspect with: print("TF-IDF matrix shape:", X.shape)


In [None]:
# Split into train/test sets, fit a baseline model and evaluate it.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

# target variable
y = train_df['fraudulent']

# Split the raw text rather than the already-vectorized matrix `X`: `X` was
# produced by fitting TF-IDF on every row, so its vocabulary and IDF weights
# already contain information from the rows held out for testing.
# Same test_size / stratify / random_state as before, so the row assignment
# to train and test is unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    train_df['text'], y, test_size=0.2, stratify=y, random_state=42
)

# Refit the shared `vectorizer` on the training text only, then apply that
# fitted vocabulary to the held-out text. The vectorizer therefore matches
# the features the model is trained on; `X` is not used from here on.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# evaluation on the held-out split
y_pred = model.predict(X_test)
print("F1 Score:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# Refit with balanced class weights and evaluate the result.
# This fit rebinds `model`, so it is the balanced model - not the baseline
# evaluated earlier - that any later cell sees. It is scored here on the
# held-out split so that reported metrics describe the model actually kept.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

# Separate name so the baseline predictions in `y_pred` are not clobbered.
y_pred_balanced = model.predict(X_test)
print("F1 Score:", f1_score(y_test, y_pred_balanced))
print(classification_report(y_test, y_pred_balanced))


In [None]:
# Persist the current `model` and `vectorizer` objects to disk with joblib.
import joblib

model_path = 'fraud_job_model.pkl'
vectorizer_path = 'tfidf_vectorizer.pkl'

joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)