In [1]:
import pandas as pd

# Load datasets
train = pd.read_csv('../data/trainingData.csv')
test = pd.read_csv('../data/testData.csv')

# Show shape and columns
print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()

Train shape: (14304, 18)
Test shape: (3577, 17)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,7531,Contact Center Representatives,"US, VA, Virginia Beach",,,Tidewater Finance Co. was established in 1992 ...,"Tidewater Finance Company, located in Virginia...",The position requires the following qualificat...,Our company offers a competitive salary plus B...,0,1,0,Full-time,Entry level,Unspecified,Financial Services,Customer Service,0
1,130,Customer Service Associate,"US, TX, Dallas",,,"Novitex Enterprise Solutions, formerly Pitney ...",The Customer Service Associate will be based i...,QualificationsMinimum of 1 year customer servi...,,0,1,0,Full-time,Entry level,High School or equivalent,Telecommunications,Customer Service,0
2,4641,Automated Test Analyst,"NZ, , Auckland",Permanent,,SilverStripe CMS &amp; Framework is an open so...,We are looking for a dedicated and passionate ...,,,0,1,1,Full-time,Mid-Senior level,,Information Technology and Services,,0
3,403,Inside Sales Professional-Omaha,"US, NE, Omaha",,,"ABC Supply Co., Inc. is the nation’s largest w...","As a Sales Representative, you will provide as...","As a Sales Representative, you must have the a...",Your benefits package as a Sales Representativ...,0,1,0,Full-time,,,Building Materials,Sales,0
4,13219,Content Marketing/SEO Manager,"US, CA, Los Angeles",Marketing,,MeUndies is a lifestyle brand that is transfor...,MeUndies is a lifestyle brand that is transfor...,REQUIREMENTS/QUALIFICATIONS/PERSONAL ATTRIBUTE...,"WHY MEUNDIES?We're a fast-growing, VC-backed c...",0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Marketing,0


In [2]:
train.isnull().sum()
train['fraudulent'].value_counts(normalize=True)

fraudulent
0    0.951552
1    0.048448
Name: proportion, dtype: float64

In [3]:
import re
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text)  # remove extra whitespace
    return text.strip()

# Clean important columns
text_cols = ['title', 'description', 'requirements', 'benefits']
for col in text_cols:
    train[col] = train[col].apply(clean_text)
    test[col] = test[col].apply(clean_text)

# Combine all text into one
train['text'] = train[text_cols].agg(' '.join, axis=1)
test['text'] = test[text_cols].agg(' '.join, axis=1)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(train['text'])
y = train['fraudulent']

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Model
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_val)
print("F1 Score:", f1_score(y_val, y_pred))


F1 Score: 0.6237623762376238


In [5]:
print(train['fraudulent'].value_counts())


fraudulent
0    13611
1      693
Name: count, dtype: int64


In [6]:
import pickle

with open('../model/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('../model/classifier.pkl', 'wb') as f:
    pickle.dump(model, f)
