In [4]:
import pandas as pd
import re
from collections import Counter
from scipy.stats import chi2_contingency

In [3]:
df = pd.read_csv("/content/Job_posting_Fake_Detection.csv",engine="python")

In [5]:
# Define a list of common buzzwords
buzzwords = ["urgent", "limited", "exclusive", "guaranteed", "earn", "apply now", "immediate", "hiring fast"]

In [6]:
# Preprocess text function
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    words = text.split()  # Tokenization
    return words

In [7]:
# Separate fake and real job postings
fake_posts = df[df["fraudulent"] == "Fake"]["description"].dropna().apply(preprocess_text)
real_posts = df[df["fraudulent"] == "Real"]["description"].dropna().apply(preprocess_text)

In [8]:
# Count buzzword occurrences in each category
fake_word_counts = Counter(word for post in fake_posts for word in post if word in buzzwords)
real_word_counts = Counter(word for post in real_posts for word in post if word in buzzwords)

In [9]:
# Create a DataFrame for chi-square test
buzzword_df = pd.DataFrame({"Fake": fake_word_counts, "Real": real_word_counts}).fillna(0).astype(int)

In [10]:
# Perform chi-square test
chi2_stat, p_value, dof, expected = chi2_contingency(buzzword_df)

In [11]:
# Print results
print("Word Frequency Analysis:\n", buzzword_df)
print("\nChi-square Statistic:", chi2_stat)
print("p-value:", p_value)

Word Frequency Analysis:
             Fake  Real
limited       75  1137
earn          81   227
immediate     29   366
exclusive      6    75
guaranteed    10    59
urgent         9   292

Chi-square Statistic: 143.37276581619372
p-value: 3.4322075969473665e-29


In [13]:
# Interpretation
if p_value < 0.05:
    print("\nConclusion: Fake job postings use certain buzzwords much more often than real ones")
else:
    print("\nConclusion: No significant difference found in buzzword usage between fake and real postings.")


Conclusion: Fake job postings use certain buzzwords much more often than real ones
