In [2]:
import pandas as pd
# Read the feature-engineered reviews table; the cells below all operate on `df`.
df = pd.read_csv(filepath_or_buffer='data/reviews_with_features.csv')

# df["label"] (assigned in a later cell) encodes: 1 = fake review, 0 = real review

In [3]:
# Conditions used to flag a review as potentially fake
# (each condition that holds adds 1 to the review's suspicion score):
# 1. Rating-text mismatch
# 2. Excessive use of exclamation marks or question marks
# 3. Use of generic language
# 4. Repetitive phrases or words

In [4]:
#1 Rating-text mismatch
# Two boolean masks that are AND-ed together when the score is built:
#   rating_extreme -> the star rating is exactly 1 or 5
#   short_review   -> review_length is below 8
rating_extreme = df["stars"].isin([1, 5])
short_review = df["review_length"].lt(8)

#2 Excessive use of exclamation marks or question marks
# True when a review has at least 3 exclamation marks or at least 2 question marks.
excessive_punct = (
    df["exclamation_count"].ge(3)
    | df["question_count"].ge(2)
)

#3 Use of generic language
# Stock phrases; a review is flagged when its cleaned text contains any of them.
generic_phrases = [
    "best ever", "highly recommend", "must visit",
    "worst ever", "never again", "not recommended"
]


def _has_generic_phrase(text) -> bool:
    """Return True when ``text`` contains any entry of ``generic_phrases``.

    Matching is a plain substring test. Non-string cells (for example the NaN
    that ``pd.read_csv`` yields for an empty field) are reported as False
    instead of raising ``TypeError`` on the ``in`` test.
    """
    if not isinstance(text, str):
        return False
    return any(phrase in text for phrase in generic_phrases)


# astype(bool) keeps the column boolean even if the frame happens to be empty.
df["generic_flag"] = df["text_clean"].apply(_has_generic_phrase).astype(bool)

#4 Repetitive phrases or words, or low information content
def _unique_word_ratio(text) -> float:
    """Return the distinct-word count divided by (word count + 1).

    Words are whitespace-separated tokens. The +1 in the denominator avoids a
    division by zero for empty text and is kept as-is so existing scores do not
    change. Non-string cells (e.g. NaN) score 0.0, the same as an empty string,
    instead of raising ``AttributeError`` on ``.split``.
    """
    if not isinstance(text, str):
        return 0.0
    words = text.split()  # tokenise once instead of twice per review
    return len(set(words)) / (len(words) + 1)


df["unique_ratio"] = df["text_clean"].apply(_unique_word_ratio).astype(float)

# Reviews whose ratio is below 0.5 are treated as low-information.
low_info = df["unique_ratio"] < 0.5




In [5]:
# Suspicion score = number of rules that fire for a review (0 to 4).
# Each boolean mask is cast to 0/1 and the four are added together:
#   short review with an extreme rating, excessive punctuation,
#   generic phrase present, low share of unique words.
df["suspicion_score"] = (
    (short_review & rating_extreme).astype("int64")
    + excessive_punct.astype("int64")
    + df["generic_flag"].astype("int64")
    + low_info.astype("int64")
)


In [8]:
# Final labelling: a suspicion score of 1 or more is labelled fake (1), otherwise real (0).
df["label"] = (df["suspicion_score"] >= 1).astype(int)

# Only the last expression of a cell is echoed, so a bare value_counts() here
# would be silently discarded; print it to actually show the class balance.
print(df["label"].value_counts())

# Persist the labelled frame (without the index column), then summarise its schema.
df.to_csv("data/reviews_labeled.csv", index=False)
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   review_id          50000 non-null  object 
 1   stars              50000 non-null  int64  
 2   text               50000 non-null  object 
 3   text_clean         50000 non-null  object 
 4   review_length      50000 non-null  int64  
 5   char_length        50000 non-null  int64  
 6   exclamation_count  50000 non-null  int64  
 7   question_count     50000 non-null  int64  
 8   generic_flag       50000 non-null  bool   
 9   unique_ratio       50000 non-null  float64
 10  suspicion_score    50000 non-null  int64  
 11  label              50000 non-null  int64  
dtypes: bool(1), float64(1), int64(7), object(3)
memory usage: 4.2+ MB
