In [None]:
import pandas as pd
# Load the raw marketplace scrape and preview the first rows.
# NOTE(review): the file carries an .xls extension but is parsed with read_csv —
# presumably it is plain CSV text; confirm (a real Excel workbook would need pd.read_excel).
df = pd.read_csv("marketplace_scrape2.xls")
df.head()

Unnamed: 0,title,description,price,seller,contact,timestamp,image_url
0,JBL Tune 760NC,Selling as I upgraded to a newer model.,₹39786,ksoman,iluthra@ravi.biz,2025-11-03 08:53:12,https://images.unsplash.com/photo-151611717287...
1,Xbox Series S,Selling as I upgraded to a newer model.,₹14235,amanigoswami,kalaehsaan@anand.com,2025-10-18 06:16:10,https://images.unsplash.com/photo-160681390277...
2,Steam Deck,"Used for a few months, works perfectly.",₹63888,pranay30,goelbhamini@gmail.com,2025-10-07 21:46:28,https://images.unsplash.com/photo-160681390277...
3,Steam Deck,"Pickup only, no returns.",₹78562,hkonda,ramamiraya@gmail.com,2025-10-16 00:03:54,https://images.unsplash.com/photo-160681390277...
4,Black+Decker Saw,bill or receipt not available.,₹97971,vdalal,darshit02@hotmail.com,2025-10-12 14:50:57,https://images.unsplash.com/photo-158109070022...


✅ STEP 1 — DATA CLEANING & NORMALIZATION

In [None]:
import numpy as np

# helper: convert price string to float
def parse_price(p):
    """Convert a price value such as "₹39,786" to a float.

    The rupee sign, thousands separators (commas) and surrounding whitespace
    are stripped before conversion.

    Parameters
    ----------
    p : str | int | float | None
        Raw price as scraped.

    Returns
    -------
    float
        The numeric price, or ``np.nan`` when the value is missing or is not
        numeric after stripping.
    """
    if pd.isna(p):
        return np.nan
    s = str(p).replace("₹", "").replace(",", "").strip()
    try:
        return float(s)
    except ValueError:
        # float() on a string only fails with ValueError; catching just that
        # keeps KeyboardInterrupt and genuine programming errors visible.
        return np.nan

# helper: parse timestamp
def parse_time(t):
    """Parse a single timestamp value, returning ``pd.NaT`` when it cannot be parsed.

    Parameters
    ----------
    t : str | datetime-like
        Raw timestamp as scraped.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        Whatever ``pd.to_datetime`` returns for ``t``, or ``pd.NaT`` on failure.
    """
    try:
        return pd.to_datetime(t)
    except (ValueError, TypeError, OverflowError):
        # Unparsable or out-of-range input is treated as missing; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return pd.NaT

# normalize contact field (emails are fine — just lowercase).
# astype(str) alone would turn a missing contact into the literal string "nan",
# which the shared-contact groupby would then treat as one real e-mail address
# shared by every seller without a contact. Mask those rows back to NaN.
df["contact"] = (
    df["contact"].astype(str).str.lower().str.strip()
      .where(df["contact"].notna())
)

# convert price
df["price_num"] = df["price"].apply(parse_price)

# parse timestamp properly
df["timestamp"] = df["timestamp"].apply(parse_time)

# extract useful time features
df["date"] = df["timestamp"].dt.date
df["hour"] = df["timestamp"].dt.hour

# create final clean DataFrame.
# .copy() makes `clean` an independent frame, so the flag columns added to it
# in later cells do not raise SettingWithCopyWarning.
clean = df[[
    "title",
    "description",
    "price_num",
    "seller",
    "contact",
    "timestamp",
    "date",
    "hour",
    "image_url"
]].copy()

# save cleaned file
clean.to_csv("clean_listings.csv", index=False)

clean.head()


Unnamed: 0,title,description,price_num,seller,contact,timestamp,date,hour,image_url
0,JBL Tune 760NC,Selling as I upgraded to a newer model.,39786.0,ksoman,iluthra@ravi.biz,2025-11-03 08:53:12,2025-11-03,8,https://images.unsplash.com/photo-151611717287...
1,Xbox Series S,Selling as I upgraded to a newer model.,14235.0,amanigoswami,kalaehsaan@anand.com,2025-10-18 06:16:10,2025-10-18,6,https://images.unsplash.com/photo-160681390277...
2,Steam Deck,"Used for a few months, works perfectly.",63888.0,pranay30,goelbhamini@gmail.com,2025-10-07 21:46:28,2025-10-07,21,https://images.unsplash.com/photo-160681390277...
3,Steam Deck,"Pickup only, no returns.",78562.0,hkonda,ramamiraya@gmail.com,2025-10-16 00:03:54,2025-10-16,0,https://images.unsplash.com/photo-160681390277...
4,Black+Decker Saw,bill or receipt not available.,97971.0,vdalal,darshit02@hotmail.com,2025-10-12 14:50:57,2025-10-12,14,https://images.unsplash.com/photo-158109070022...


✅ STEP 2A — Detect reused images

Sellers sharing the same image URL → suspicious (common stolen-goods pattern).

In [None]:
# FLAG 1: reused images
# Number of distinct sellers behind each image URL; a URL is "reused" when
# more than one seller posted it.
image_counts = clean.groupby("image_url")["seller"].nunique()
reused_imgs = set(image_counts.index[image_counts.gt(1)])

# Mark every listing whose image URL is in the reused set.
clean["reused_image"] = clean["image_url"].isin(reused_imgs)


In [None]:
# Display the frame to inspect the newly added reused_image column.
clean

Unnamed: 0,title,description,price_num,seller,contact,timestamp,date,hour,image_url,reused_image
0,JBL Tune 760NC,Selling as I upgraded to a newer model.,39786.0,ksoman,iluthra@ravi.biz,2025-11-03 08:53:12,2025-11-03,8,https://images.unsplash.com/photo-151611717287...,True
1,Xbox Series S,Selling as I upgraded to a newer model.,14235.0,amanigoswami,kalaehsaan@anand.com,2025-10-18 06:16:10,2025-10-18,6,https://images.unsplash.com/photo-160681390277...,True
2,Steam Deck,"Used for a few months, works perfectly.",63888.0,pranay30,goelbhamini@gmail.com,2025-10-07 21:46:28,2025-10-07,21,https://images.unsplash.com/photo-160681390277...,True
3,Steam Deck,"Pickup only, no returns.",78562.0,hkonda,ramamiraya@gmail.com,2025-10-16 00:03:54,2025-10-16,0,https://images.unsplash.com/photo-160681390277...,True
4,Black+Decker Saw,bill or receipt not available.,97971.0,vdalal,darshit02@hotmail.com,2025-10-12 14:50:57,2025-10-12,14,https://images.unsplash.com/photo-158109070022...,True
...,...,...,...,...,...,...,...,...,...,...
99,Dewalt Cordless Driver,Selling as I upgraded to a newer model.,60310.0,wswaminathan,rsarraf@hotmail.com,2025-10-05 11:53:52,2025-10-05,11,https://images.unsplash.com/photo-158109070022...,True
100,HP Pavilion 15,Selling as I upgraded to a newer model.,127949.0,laganguha,lakshay64@aurora-yogi.com,2025-10-21 07:37:20,2025-10-21,7,https://images.unsplash.com/photo-151733671473...,True
101,Yamaha MT-15,"Must sell fast, price slightly negotiable. Urg...",8620.0,nityakhatri,abramsankar@kothari.com,2025-10-26 01:34:49,2025-10-26,1,https://images.unsplash.com/photo-151865504852...,True
102,Xbox Series S,Selling as I upgraded to a newer model.,21509.0,groverojas,tbutala@gmail.com,2025-10-18 15:22:36,2025-10-18,15,https://images.unsplash.com/photo-160681390277...,True


✅ STEP 2B — Detect reused contact emails

Multiple sellers using the same contact email → VERY suspicious.

In [None]:
# FLAG 2: shared contact emails
# Number of distinct sellers behind each contact value; a contact is "shared"
# when more than one seller uses it.
contact_counts = clean.groupby("contact")["seller"].nunique()
reused_contacts = set(contact_counts.index[contact_counts.gt(1)])

# Mark every listing whose contact is in the shared set.
clean["shared_contact"] = clean["contact"].isin(reused_contacts)


✅ STEP 2C — Suspicious phrases in descriptions

Common stolen-goods language:

“no bill”

“no receipt”

“pickup only”

“no returns”

“must sell fast”

“urgent sale”

In [None]:
pip install rapidfuzz



In [None]:
# FLAG 3: suspicious text phrases
sus_phrases = [
    "no bill",
    "no receipt",
    "pickup only",
    "no returns",
    "must sell fast",
    "urgent sale"
]

def check_phrases(text, phrases=None):
    """Return True when ``text`` contains any of the suspicious phrases.

    Matching is a case-insensitive substring test. Runs of whitespace
    (double spaces, tabs, newlines) in ``text`` are collapsed to single
    spaces first, so irregular spacing in scraped text such as
    "pickup   only" still matches.

    Parameters
    ----------
    text : object
        Listing description; converted with ``str()`` before matching.
    phrases : iterable of str, optional
        Phrases to look for. Defaults to the module-level ``sus_phrases``.

    Returns
    -------
    bool
    """
    if phrases is None:
        phrases = sus_phrases
    normalized = " ".join(str(text).lower().split())
    return any(p.lower() in normalized for p in phrases)

# Evaluate every description against the suspicious-phrase list.
clean["suspicious_phrase"] = clean["description"].map(check_phrases)


In [None]:
# Preview the first rows with the reused_image, shared_contact and suspicious_phrase flags.
clean.head()

Unnamed: 0,title,description,price_num,seller,contact,timestamp,date,hour,image_url,reused_image,shared_contact,suspicious_phrase
0,JBL Tune 760NC,Selling as I upgraded to a newer model.,39786.0,ksoman,iluthra@ravi.biz,2025-11-03 08:53:12,2025-11-03,8,https://images.unsplash.com/photo-151611717287...,True,False,False
1,Xbox Series S,Selling as I upgraded to a newer model.,14235.0,amanigoswami,kalaehsaan@anand.com,2025-10-18 06:16:10,2025-10-18,6,https://images.unsplash.com/photo-160681390277...,True,False,False
2,Steam Deck,"Used for a few months, works perfectly.",63888.0,pranay30,goelbhamini@gmail.com,2025-10-07 21:46:28,2025-10-07,21,https://images.unsplash.com/photo-160681390277...,True,False,False
3,Steam Deck,"Pickup only, no returns.",78562.0,hkonda,ramamiraya@gmail.com,2025-10-16 00:03:54,2025-10-16,0,https://images.unsplash.com/photo-160681390277...,True,False,True
4,Black+Decker Saw,bill or receipt not available.,97971.0,vdalal,darshit02@hotmail.com,2025-10-12 14:50:57,2025-10-12,14,https://images.unsplash.com/photo-158109070022...,True,False,False


In [None]:
# Median price of all listings sharing the same product title, broadcast back
# to one value per listing.
group_median = clean["price_num"].groupby(clean["title"]).transform("median")

# A listing is a price outlier when it is priced below half of the typical
# (median) price for THAT product.
clean["price_outlier"] = clean["price_num"].lt(group_median.mul(0.5))


In [None]:
# Persist the listings together with the flag columns added so far; the
# burst-detection cell reloads this file.
clean.to_csv("listings_with_flags.csv", index=False)


In [None]:
import pandas as pd
# Reload the flagged listings from disk and preview them.
# Note: this rebinds `df`, which previously held the raw scrape.
df = pd.read_csv("listings_with_flags.csv")
df.head()

Unnamed: 0,title,description,price_num,seller,contact,timestamp,date,hour,image_url,reused_image,shared_contact,suspicious_phrase,price_outlier
0,JBL Tune 760NC,Selling as I upgraded to a newer model.,39786.0,ksoman,iluthra@ravi.biz,2025-11-03 08:53:12,2025-11-03,8,https://images.unsplash.com/photo-151611717287...,True,False,False,False
1,Xbox Series S,Selling as I upgraded to a newer model.,14235.0,amanigoswami,kalaehsaan@anand.com,2025-10-18 06:16:10,2025-10-18,6,https://images.unsplash.com/photo-160681390277...,True,False,False,False
2,Steam Deck,"Used for a few months, works perfectly.",63888.0,pranay30,goelbhamini@gmail.com,2025-10-07 21:46:28,2025-10-07,21,https://images.unsplash.com/photo-160681390277...,True,False,False,False
3,Steam Deck,"Pickup only, no returns.",78562.0,hkonda,ramamiraya@gmail.com,2025-10-16 00:03:54,2025-10-16,0,https://images.unsplash.com/photo-160681390277...,True,False,True,False
4,Black+Decker Saw,bill or receipt not available.,97971.0,vdalal,darshit02@hotmail.com,2025-10-12 14:50:57,2025-10-12,14,https://images.unsplash.com/photo-158109070022...,True,False,False,False


In [None]:
# --- STEP 1: Seller Posting Burst Detection ---
import pandas as pd

# Load the latest flagged file, parse timestamps (unparsable values become NaT)
# and order rows by seller, then time, so consecutive rows of one seller are
# consecutive listings.
clean = (
    pd.read_csv("listings_with_flags.csv")
      .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"], errors="coerce"))
      .sort_values(["seller", "timestamp"])
)

# Minutes elapsed since the same seller's previous listing
# (NaN for each seller's first listing).
gap_to_prev = clean.groupby("seller")["timestamp"].diff()
clean["mins_since_prev_by_seller"] = gap_to_prev.dt.total_seconds() / 60

# Flag listings posted at most 60 minutes after the seller's previous one.
# The comparison already yields False for NaN gaps, so no fillna is needed.
# NOTE(review): only the later listing of each close pair is flagged; the
# first listing of a burst is not — confirm this is intended.
clean["seller_burst_60m"] = clean["mins_since_prev_by_seller"].le(60)
in_burst_60m = clean["seller_burst_60m"]

# quick check
print("Listings posted within 60 min by same seller:",
      int(in_burst_60m.sum()))

# peek a few examples
clean.loc[in_burst_60m,
          ["seller", "timestamp", "title", "mins_since_prev_by_seller"]].head(10)


Listings posted within 60 min by same seller: 4


Unnamed: 0,seller,timestamp,title,mins_since_prev_by_seller
42,hkonda,2025-10-16 00:03:55,Steam Deck,0.016667
103,hkonda,2025-10-16 00:03:56,Steam Deck,0.016667
20,ksoman,2025-11-03 08:53:13,JBL Tune 760NC,0.016667
55,ksoman,2025-11-03 08:53:14,JBL Tune 760NC,0.016667


In [None]:
# 6-hour burst detection (extended window for demo clarity): same gap column,
# wider threshold of 360 minutes.
clean["seller_burst_6h"] = clean["mins_since_prev_by_seller"].le(360)
in_burst_6h = clean["seller_burst_6h"]

print("6-hour bursts detected:", int(in_burst_6h.sum()))
clean.loc[in_burst_6h,
          ["seller", "timestamp", "title", "mins_since_prev_by_seller"]].head(10)

6-hour bursts detected: 4


Unnamed: 0,seller,timestamp,title,mins_since_prev_by_seller
42,hkonda,2025-10-16 00:03:55,Steam Deck,0.016667
103,hkonda,2025-10-16 00:03:56,Steam Deck,0.016667
20,ksoman,2025-11-03 08:53:13,JBL Tune 760NC,0.016667
55,ksoman,2025-11-03 08:53:14,JBL Tune 760NC,0.016667


In [None]:
# Preview the seller-sorted frame with the gap and burst columns.
clean.head()

Unnamed: 0,title,description,price_num,seller,contact,timestamp,date,hour,image_url,reused_image,shared_contact,suspicious_phrase,price_outlier,mins_since_prev_by_seller,seller_burst_60m,seller_burst_6h
86,HP Pavilion 15,"Must sell fast, price slightly negotiable.",77016.0,aarav78,thamanindranil@gmail.com,2025-10-28 23:59:50,2025-10-28,23,https://images.unsplash.com/photo-151733671473...,True,False,True,False,,False,False
1,Xbox Series S,Selling as I upgraded to a newer model.,14235.0,amanigoswami,kalaehsaan@anand.com,2025-10-18 06:16:10,2025-10-18,6,https://images.unsplash.com/photo-160681390277...,True,False,False,False,,False,False
59,DJI Pocket 2,"Urgent sale, need cash fast.",122892.0,amirachadha,cdate@saha-saxena.com,2025-10-24 13:46:24,2025-10-24,13,https://images.unsplash.com/photo-152617037588...,True,False,True,False,,False,False
10,Boat Rockerz 450,"Pickup only, no returns.",94597.0,anahi69,yogikartik@wali.com,2025-10-18 13:46:38,2025-10-18,13,https://images.unsplash.com/photo-151611717287...,True,False,True,False,,False,False
24,Fossil Chronograph,Well maintained and tested before listing.,141328.0,anikacherian,lalllakshay@srinivas.com,2025-10-20 04:16:39,2025-10-20,4,https://images.unsplash.com/photo-152327533568...,True,False,False,False,,False,False


In [None]:
# Make sure df's timestamp column is a real datetime column (unparsable values
# become NaT) and print the resulting dtype as a sanity check.
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
print(df["timestamp"].dtype)


datetime64[ns]


In [None]:
# Seller velocity (posts per active day) — minimal
seller_stats = (
    clean.groupby("seller")
         .agg(first_post=("timestamp", "min"),
              last_post=("timestamp", "max"),
              # "size" counts every listing; "count" on title would silently
              # skip listings whose title is missing.
              total_posts=("timestamp", "size"))
         .reset_index()
)

# active days = whole days between first and last post, plus one (so at least 1)
seller_stats["active_days"] = (seller_stats["last_post"] - seller_stats["first_post"]).dt.days + 1
seller_stats["posts_per_day"] = seller_stats["total_posts"] / seller_stats["active_days"]

# flag: > 2 posts/day (tunable)
seller_stats["high_velocity_seller"] = seller_stats["posts_per_day"] > 2

# merge flags back into clean.
# Drop any velocity columns left over from a previous run first; otherwise
# re-running this cell produces posts_per_day_x / posts_per_day_y and the
# scoring cell can no longer find "high_velocity_seller".
# The merge also gives `clean` a fresh 0..n-1 index.
velocity_cols = ["posts_per_day", "high_velocity_seller"]
clean = (
    clean.drop(columns=velocity_cols, errors="ignore")
         .merge(seller_stats[["seller"] + velocity_cols],
                on="seller", how="left", validate="many_to_one")
)

# show summary and top by velocity
print("High-velocity sellers:", int(seller_stats["high_velocity_seller"].sum()))
seller_stats.sort_values("posts_per_day", ascending=False).head(10)


High-velocity sellers: 2


Unnamed: 0,seller,first_post,last_post,total_posts,active_days,posts_per_day,high_velocity_seller
34,hkonda,2025-10-16 00:03:54,2025-10-16 00:03:56,3,1,3.0,True
43,ksoman,2025-11-03 08:53:12,2025-11-03 08:53:14,3,1,3.0,True
2,amirachadha,2025-10-24 13:46:24,2025-10-24 13:46:24,1,1,1.0,False
0,aarav78,2025-10-28 23:59:50,2025-10-28 23:59:50,1,1,1.0,False
3,anahi69,2025-10-18 13:46:38,2025-10-18 13:46:38,1,1,1.0,False
4,anikacherian,2025-10-20 04:16:39,2025-10-20 04:16:39,1,1,1.0,False
6,aradhya39,2025-10-21 13:49:36,2025-10-21 13:49:36,1,1,1.0,False
5,anyaatwal,2025-10-09 02:28:00,2025-10-09 02:28:00,1,1,1.0,False
8,azaddas,2025-10-11 02:24:42,2025-10-11 02:24:42,1,1,1.0,False
9,bailaarna,2025-11-02 08:09:21,2025-11-02 08:09:21,1,1,1.0,False


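Seller velocity is computed as listings per active day, where a seller's active days are the number of whole days between their first and last listing, plus one. Sellers with more than 2 listings per day are flagged as potential high-volume resellers. Most sellers in this dataset posted only once, so few are flagged here, but the metric is critical for identifying bulk or automated sellers in larger datasets.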

In [None]:
# Preview the frame with the merged posts_per_day and high_velocity_seller columns.
clean.head()

Unnamed: 0,title,description,price_num,seller,contact,timestamp,date,hour,image_url,reused_image,shared_contact,suspicious_phrase,price_outlier,mins_since_prev_by_seller,seller_burst_60m,seller_burst_6h,posts_per_day,high_velocity_seller
0,HP Pavilion 15,"Must sell fast, price slightly negotiable.",77016.0,aarav78,thamanindranil@gmail.com,2025-10-28 23:59:50,2025-10-28,23,https://images.unsplash.com/photo-151733671473...,True,False,True,False,,False,False,1.0,False
1,Xbox Series S,Selling as I upgraded to a newer model.,14235.0,amanigoswami,kalaehsaan@anand.com,2025-10-18 06:16:10,2025-10-18,6,https://images.unsplash.com/photo-160681390277...,True,False,False,False,,False,False,1.0,False
2,DJI Pocket 2,"Urgent sale, need cash fast.",122892.0,amirachadha,cdate@saha-saxena.com,2025-10-24 13:46:24,2025-10-24,13,https://images.unsplash.com/photo-152617037588...,True,False,True,False,,False,False,1.0,False
3,Boat Rockerz 450,"Pickup only, no returns.",94597.0,anahi69,yogikartik@wali.com,2025-10-18 13:46:38,2025-10-18,13,https://images.unsplash.com/photo-151611717287...,True,False,True,False,,False,False,1.0,False
4,Fossil Chronograph,Well maintained and tested before listing.,141328.0,anikacherian,lalllakshay@srinivas.com,2025-10-20 04:16:39,2025-10-20,4,https://images.unsplash.com/photo-152327533568...,True,False,False,False,,False,False,1.0,False


In [None]:
# TF-IDF near-duplicate description detection (minimal)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# parameters
THRESH = 0.85  # similarity threshold (tuneable)

# prepare text.
# Fill missing descriptions BEFORE casting to str: the reverse order turns NaN
# into the literal token "nan", which makes all missing descriptions look
# identical to each other. Empty strings contribute no TF-IDF terms.
clean["desc_clean"] = clean["description"].fillna("").astype(str).str.strip().str.lower()
texts = clean["desc_clean"].tolist()

# vectorize (unigrams + bigrams)
vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
X = vec.fit_transform(texts)

# compute pairwise cosine similarity (dense ok for ~100 rows)
sim = cosine_similarity(X)

# ignore self-similarity by zeroing diagonal
np.fill_diagonal(sim, 0.0)

# find pairs above threshold: the strict upper triangle lists each unordered
# pair (i < j) exactly once, in the same row-major order as a nested loop.
above = sim >= THRESH
pair_rows, pair_cols = np.nonzero(np.triu(above, k=1))
pairs = [(int(i), int(j), float(sim[i, j])) for i, j in zip(pair_rows, pair_cols)]

# create flag: a listing is near-duplicate if it has >=1 pair match
near_flag = above.any(axis=1)
clean["dup_description_near"] = near_flag

# summary
print("Threshold:", THRESH)
print("Near-duplicate listings flagged:", int(near_flag.sum()))
print("Number of near-duplicate pairs:", len(pairs))

# show top pairs (sorted by similarity).
# i and j are POSITIONS in `texts` / `sim`, so read from `texts` rather than
# clean.loc[...], which would look them up as index labels.
pairs_sorted = sorted(pairs, key=lambda x: -x[2])[:15]
for i, j, s in pairs_sorted:
    a = texts[i][:120]
    b = texts[j][:120]
    print(f"\nPair ({i},{j}) sim={s:.3f}")
    print(" A:", a)
    print(" B:", b)

# save updated dataset
clean.to_csv("listings_with_flags_updated.csv", index=False)
print("\nSaved: listings_with_flags_updated.csv (includes dup_description_near)")


Threshold: 0.85
Near-duplicate listings flagged: 95
Number of near-duplicate pairs: 419

Pair (0,39) sim=1.000
 A: must sell fast, price slightly negotiable.
 B: must sell fast, price slightly negotiable.

Pair (0,42) sim=1.000
 A: must sell fast, price slightly negotiable.
 B: must sell fast, price slightly negotiable.

Pair (0,63) sim=1.000
 A: must sell fast, price slightly negotiable.
 B: must sell fast, price slightly negotiable.

Pair (0,74) sim=1.000
 A: must sell fast, price slightly negotiable.
 B: must sell fast, price slightly negotiable.

Pair (3,5) sim=1.000
 A: pickup only, no returns.
 B: pickup only, no returns.

Pair (3,14) sim=1.000
 A: pickup only, no returns.
 B: pickup only, no returns.

Pair (3,34) sim=1.000
 A: pickup only, no returns.
 B: pickup only, no returns.

Pair (3,35) sim=1.000
 A: pickup only, no returns.
 B: pickup only, no returns.

Pair (3,36) sim=1.000
 A: pickup only, no returns.
 B: pickup only, no returns.

Pair (3,40) sim=1.000
 A: pickup only, 

What this does, in plain words:

builds TF-IDF vectors (1–2 grams) of descriptions,

computes cosine similarity between every pair,

flags a listing if its cosine similarity to at least one other listing is ≥ 0.85 (catching exact copies and close paraphrases),

prints how many listings and pairs were found, and saves the updated CSV.

In [None]:
# Reload the updated flag file (now including desc_clean and
# dup_description_near) into `df` and preview it.
df = pd.read_csv("listings_with_flags_updated.csv")
df.head()

Unnamed: 0,title,description,price_num,seller,contact,timestamp,date,hour,image_url,reused_image,shared_contact,suspicious_phrase,price_outlier,mins_since_prev_by_seller,seller_burst_60m,seller_burst_6h,posts_per_day,high_velocity_seller,desc_clean,dup_description_near
0,HP Pavilion 15,"Must sell fast, price slightly negotiable.",77016.0,aarav78,thamanindranil@gmail.com,2025-10-28 23:59:50,2025-10-28,23,https://images.unsplash.com/photo-151733671473...,True,False,True,False,,False,False,1.0,False,"must sell fast, price slightly negotiable.",True
1,Xbox Series S,Selling as I upgraded to a newer model.,14235.0,amanigoswami,kalaehsaan@anand.com,2025-10-18 06:16:10,2025-10-18,6,https://images.unsplash.com/photo-160681390277...,True,False,False,False,,False,False,1.0,False,selling as i upgraded to a newer model.,True
2,DJI Pocket 2,"Urgent sale, need cash fast.",122892.0,amirachadha,cdate@saha-saxena.com,2025-10-24 13:46:24,2025-10-24,13,https://images.unsplash.com/photo-152617037588...,True,False,True,False,,False,False,1.0,False,"urgent sale, need cash fast.",True
3,Boat Rockerz 450,"Pickup only, no returns.",94597.0,anahi69,yogikartik@wali.com,2025-10-18 13:46:38,2025-10-18,13,https://images.unsplash.com/photo-151611717287...,True,False,True,False,,False,False,1.0,False,"pickup only, no returns.",True
4,Fossil Chronograph,Well maintained and tested before listing.,141328.0,anikacherian,lalllakshay@srinivas.com,2025-10-20 04:16:39,2025-10-20,4,https://images.unsplash.com/photo-152327533568...,True,False,False,False,,False,False,1.0,False,well maintained and tested before listing.,True


In [None]:
import pandas as pd

# Load your dataset
clean = pd.read_csv("listings_with_flags_updated.csv")

# Flag column -> weight. The first four indicators carry the higher weight.
weights = dict(
    reused_image=2.0,
    shared_contact=2.0,
    suspicious_phrase=2.0,
    price_outlier=2.0,
    seller_burst_60m=1.0,
    seller_burst_6h=1.0,
    high_velocity_seller=1.0,
    dup_description_near=0.5,
)

# Weighted risk score: cast each flag to 0/1, scale it by its weight and add
# the weighted flags up row by row.
clean["risk_score_weighted"] = (
    clean[list(weights)]
        .astype(int)
        .mul(pd.Series(weights), axis="columns")
        .sum(axis="columns")
)

# Apply threshold = 3: listings scoring at or above it are marked suspicious (1/0).
THRESH = 3
clean["suspicious"] = clean["risk_score_weighted"].ge(THRESH).astype(int)

# Assign readable risk level
def risk_label(score, medium_at=2, high_at=3):
    """Map a weighted risk score to a readable level.

    Parameters
    ----------
    score : float
        Weighted risk score of one listing.
    medium_at : float, default 2
        Scores below this value are "Low".
    high_at : float, default 3
        Scores from ``medium_at`` up to (but excluding) this value are
        "Medium"; scores at or above it are "High".

    Returns
    -------
    str
        "Low", "Medium" or "High". A NaN score fails both ``<`` comparisons
        and therefore falls through to "High".

    Raises
    ------
    ValueError
        If ``medium_at`` is greater than ``high_at``.
    """
    if medium_at > high_at:
        raise ValueError("medium_at must not exceed high_at")
    if score < medium_at:
        return "Low"
    if score < high_at:
        return "Medium"
    return "High"

# Translate each numeric score into its Low / Medium / High label.
# NOTE(review): risk_label's default "High" cut-off is 3 and does not follow
# THRESH — keep the two in sync if the threshold is tuned.
clean["risk_level"] = clean["risk_score_weighted"].map(risk_label)

# Summary
print("✅ Threshold:", THRESH)
print(clean["risk_level"].value_counts())
print("Suspicious flagged:", clean["suspicious"].sum())

# Preview top risky: highest scores first, key columns only, first 50 rows
preview_cols = ["title","seller","risk_score_weighted","risk_level","suspicious"]
ranked = clean.sort_values("risk_score_weighted", ascending=False)
display(ranked[preview_cols].head(50))

# Save
clean.to_csv("labelled_dataset_final.csv", index=False)
print("💾 Saved as labelled_dataset_final.csv")


✅ Threshold: 3
risk_level
High      75
Medium    29
Name: count, dtype: int64
Suspicious flagged: 75


Unnamed: 0,title,seller,risk_score_weighted,risk_level,suspicious
48,Sony WH-1000XM5,kumarzoya,8.5,High,1
55,TVS Apache 160,mishtireddy,8.5,High,1
44,Royal Enfield Classic 350,kiaramadan,8.0,High,1
36,Steam Deck,hkonda,7.5,High,1
35,Steam Deck,hkonda,7.5,High,1
19,Nintendo Switch OLED,ddani,6.5,High,1
32,Dell XPS 13,gwadhwa,6.5,High,1
23,Samsung Galaxy S23 Ultra,dsathe,6.5,High,1
57,Canon EOS 250D,nganesan,6.5,High,1
60,Yamaha MT-15,nityakhatri,6.5,High,1


💾 Saved as labelled_dataset_final.csv


In [None]:
# Preview the labelled frame (risk_score_weighted, suspicious, risk_level).
clean.head()

Unnamed: 0,title,description,price_num,seller,contact,timestamp,date,hour,image_url,reused_image,...,mins_since_prev_by_seller,seller_burst_60m,seller_burst_6h,posts_per_day,high_velocity_seller,desc_clean,dup_description_near,risk_score_weighted,suspicious,risk_level
0,HP Pavilion 15,"Must sell fast, price slightly negotiable.",77016.0,aarav78,thamanindranil@gmail.com,2025-10-28 23:59:50,2025-10-28,23,https://images.unsplash.com/photo-151733671473...,True,...,,False,False,1.0,False,"must sell fast, price slightly negotiable.",True,4.5,1,High
1,Xbox Series S,Selling as I upgraded to a newer model.,14235.0,amanigoswami,kalaehsaan@anand.com,2025-10-18 06:16:10,2025-10-18,6,https://images.unsplash.com/photo-160681390277...,True,...,,False,False,1.0,False,selling as i upgraded to a newer model.,True,2.5,0,Medium
2,DJI Pocket 2,"Urgent sale, need cash fast.",122892.0,amirachadha,cdate@saha-saxena.com,2025-10-24 13:46:24,2025-10-24,13,https://images.unsplash.com/photo-152617037588...,True,...,,False,False,1.0,False,"urgent sale, need cash fast.",True,4.5,1,High
3,Boat Rockerz 450,"Pickup only, no returns.",94597.0,anahi69,yogikartik@wali.com,2025-10-18 13:46:38,2025-10-18,13,https://images.unsplash.com/photo-151611717287...,True,...,,False,False,1.0,False,"pickup only, no returns.",True,4.5,1,High
4,Fossil Chronograph,Well maintained and tested before listing.,141328.0,anikacherian,lalllakshay@srinivas.com,2025-10-20 04:16:39,2025-10-20,4,https://images.unsplash.com/photo-152327533568...,True,...,,False,False,1.0,False,well maintained and tested before listing.,True,2.5,0,Medium


In [None]:

# Write the labelled dataset to disk.
# NOTE(review): the scoring cell already writes this same file — this cell
# looks redundant; confirm before removing.
clean.to_csv("labelled_dataset_final.csv", index=False)
print("💾 Saved as labelled_dataset_final.csv")



💾 Saved as labelled_dataset_final.csv


In [None]:
import networkx as nx

# Build an undirected graph linking each seller to the contact and image
# they used. Sellers that share a contact or image node end up connected.
# `df` is the frame loaded from listings_with_flags_updated.csv in an earlier cell.
G = nx.Graph()

for seller_val, contact_val, image_val in zip(df["seller"], df["contact"], df["image_url"]):
    if pd.isna(seller_val):
        continue  # nothing to attach the listing to

    seller = f"seller:{seller_val}"
    G.add_node(seller, type='seller')

    for kind, value in (("contact", contact_val), ("image", image_val)):
        # Skip missing values: formatting NaN would create one shared
        # "contact:nan" / "image:nan" node that falsely links every seller
        # with a missing field.
        if pd.isna(value):
            continue
        node = f"{kind}:{value}"
        G.add_node(node, type=kind)
        G.add_edge(seller, node)

print(f"✅ Created network with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Optionally export for Gephi or visualization
nx.write_gexf(G, "seller_network.gexf")

✅ Created network with 188 nodes and 200 edges


Index(['title', 'description', 'price_num', 'seller', 'contact', 'timestamp',
       'date', 'hour', 'image_url', 'reused_image', 'shared_contact',
       'suspicious_phrase', 'price_outlier', 'mins_since_prev_by_seller',
       'seller_burst_60m', 'seller_burst_6h', 'posts_per_day',
       'high_velocity_seller', 'desc_clean', 'dup_description_near'],
      dtype='object')