# Cluebot

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import requests
import difflib
import csv
from sklearn.model_selection import train_test_split
from datetime import datetime, timezone
import xgboost
from sklearn.metrics import confusion_matrix
import ipaddress
import re

In [None]:
# Read Cluebot data
# (We combine the train and test data and then do a train-test split by stratification on is_vandalism)

tree = ET.parse(
    "/Users/robin/Documents/GitHub/Cluebot/trial-edits.xml"
)  # train-edits.xml, bayes-edits.xml, trial-edits.xml
root = tree.getroot()

edit_ids = [int(edit[1].text) for edit in root]
print(edit_ids[:5])

# is_vandalism_values = [1 if edit[3].text == "true" else 0 for edit in root]
# print(is_vandalism_values[:5])

[394517571, 394517573, 394517575, 394517586, 394517626]


In [None]:
# Use Wikipedia API to obtain the difference of each article before and after each edit

base_url = "https://en.wikipedia.org/w/api.php"


def fetch_revision_text(revid):
    try:
        params = {
            "action": "query",
            "format": "json",
            "prop": "revisions",
            "revids": revid,
            "rvprop": "ids|content",
            "rvslots": "main",
        }

        response = requests.get(base_url, params=params)
        data = response.json()

        if "badrevids" in data.get("query", {}):
            print(f"Bad revision ID: {revid}")
            return None, None

        page = next(iter(data.get("query", {}).get("pages", {}).values()), {})
        revision = page.get("revisions", [{}])[0]
        content = revision.get("slots", {}).get("main", {}).get("*")
        parent_id = revision.get("parentid")

        return content, parent_id

    except Exception as e:
        print(f"Error fetching revision {revid}: {e}")
        return None, None


with open(
    "/Users/robin/Documents/GitHub/Cluebot/added_lines.txt", "a", newline=""
) as file_1, open(
    "/Users/robin/Documents/GitHub/Cluebot/deleted_lines.txt", "a", newline=""
) as file_2:

    # added_lines_bayes.txt, added_lines_trial.txt,
    # deleted_lines_bayes.txt, deleted_lines_trial.txt

    writer_1 = csv.writer(file_1)
    writer_2 = csv.writer(file_2)

    for i in range(5):  # len(root)
        edit_id = edit_ids[i]

        current_text, parent_id = fetch_revision_text(edit_id)
        if current_text is None or parent_id is None:
            writer_1.writerow(["BAD REQUEST"])
            writer_2.writerow(["BAD REQUEST"])
            continue

        parent_text, _ = fetch_revision_text(parent_id)
        if parent_text is None:
            writer_1.writerow(["BAD REQUEST"])
            writer_2.writerow(["BAD REQUEST"])
            continue

        diff = list(
            difflib.unified_diff(
                parent_text.splitlines(),
                current_text.splitlines(),
                fromfile="before",
                tofile="after",
                lineterm="",
            )
        )

        added_lines = [
            line[1:]
            for line in diff
            if line.startswith("+") and not line.startswith("+++")
        ]
        deleted_lines = [
            line[1:]
            for line in diff
            if line.startswith("-") and not line.startswith("---")
        ]

        writer_1.writerow(added_lines)
        writer_2.writerow(deleted_lines)

In [None]:
# Combine all Cluebot data and text difference data into one csv file
edits = pd.read_xml("/Users/robin/Documents/GitHub/Cluebot/trial-edits.xml")
edits.sample(5)

# Some features are hidden under the 'common', 'current', 'previous' domains
# and have to be extracted manually

edits = edits.drop(["common", "current", "previous"], axis=1)
edits["page_made_time"] = [edit[10][0].text for edit in root]
edits["title"] = [edit[10][1].text for edit in root]
edits["namespace"] = [edit[10][2].text for edit in root]
edits["creator"] = [edit[10][3].text for edit in root]
edits["num_recent_edits"] = [edit[10][4].text for edit in root]
edits["num_recent_reversions"] = [edit[10][5].text for edit in root]
edits["current_minor"] = [edit[11][0].text for edit in root]
edits["current_timestamp"] = [edit[11][1].text for edit in root]

with open(
    "/Users/robin/Documents/GitHub/Cluebot/added_lines_trial.txt", "r", encoding="utf-8"
) as f:
    edits["added_lines"] = [line.strip() for line in f]

edits["previous_timestamp"] = [edit[12][0].text for edit in root]

with open(
    "/Users/robin/Documents/GitHub/Cluebot/deleted_lines_trial.txt",
    "r",
    encoding="utf-8",
) as f:
    edits["deleted_lines"] = [line.strip() for line in f]

In [770]:
# Feature engineering


# The age of account at the time of edit
def account_age(user_reg_time, current_time):
    if len(user_reg_time) > 10:  # this is to exclude edits made by anonymous users
        account_age_days = 1
    else:
        reg_time = datetime.fromtimestamp(int(user_reg_time), tz=timezone.utc)
        edit_time = datetime.fromtimestamp(int(current_time), tz=timezone.utc)
        account_age_days = (edit_time - reg_time).days
    return account_age_days


edits["account_age"] = account_age(edits["user_reg_time"], edits["current_timestamp"])

# Whether the edit is made by a registered user or an anonymous IP


def is_IP(user):
    try:
        ipaddress.ip_address(user)
        return True
    except ValueError:
        return False


edits["is_IP"] = is_IP(edits["user"])

# User block info

In [766]:
# Store data after feature engineering into a csv file

edits.to_csv("/Users/robin/Documents/GitHub/Cluebot/output_trial.csv", index=False)

In [767]:
# Combine all three csv files

df_train = pd.read_csv("/Users/robin/Documents/GitHub/Cluebot/output.csv")
df_bayes = pd.read_csv("/Users/robin/Documents/GitHub/Cluebot/output_bayes.csv")
df_trial = pd.read_csv("/Users/robin/Documents/GitHub/Cluebot/output_trial.csv")

df_combined = pd.concat([df_train, df_bayes, df_trial], ignore_index=True)
df_combined.to_csv("/Users/robin/Documents/GitHub/Cluebot/output_combined.csv")
df_combined.sample(5)

Unnamed: 0,EditType,EditID,comment,isvandalism,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,prev_user,...,creator,num_recent_edits,num_recent_reversions,current_minor,current_timestamp,added_lines,previous_timestamp,deleted_lines,account_age,is_IP
139,change,327252369,/* Reception */,False,Hunter Kahn,33583,0,0,1203574035,Hunter Kahn,...,Hunter Kahn,0,0,False,1258875399,"""In its original American broadcast on Novembe...",1258874977,"""In its original American broadcast on Novembe...",,
24171,change,250751912,/* Culture */,True,173.32.167.157,2,2,0,20081109233114,JohnCD,...,Infrogmation,0,0,False,1226273474,,1226260374,"==Culture==,===Suffrage===,{{main|Women's Suff...",,
7537,change,328467806,/* Bowl eligible teams in the 2009 season */,False,173.76.5.112,207,33,0,20091128230224,173.76.5.112,...,Greasypacifier,1,0,False,1259449344,*'''Current total number of teams with at leas...,1259449329,*'''Current total number of teams with at leas...,,
30471,change,327179982,/* References */,False,122.169.47.246,6,4,0,20091121220354,12.156.48.122,...,68.106.36.7,0,0,False,1258841034,www.udaipurlive.com,1258651207,,,
2562,change,327619423,/* Healers */,False,NightBear,167,94,0,1201197383,NightBear,...,Conversion script,0,0,True,1259044211,"""* [[Druid (character class)|Druid]]: A priest...",1259043958,"""* [[Druid (character class)|Druid]]: A priest...",,


In [758]:
# Train-test split

X = df_combined.drop("isvandalism", axis=1)
y = df_combined["isvandalism"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

train_data = X_train
train_data["isvandalism"] = y_train
train_data.to_csv("train_data.csv", index=False)

test_data = X_test
test_data["isvandalism"] = y_test
test_data.to_csv("test_data.csv", index=False)

In [976]:
# XGBoost

train_data = pd.read_csv("/Users/robin/Documents/GitHub/Cluebot/train_data.csv")
trial_data = pd.read_csv("/Users/robin/Documents/GitHub/Cluebot/test_data.csv")

xgb_reg1 = xgboost.XGBClassifier(learning_rate=0.1, max_depth=4, n_estimators=10)
xgb_reg1.fit(train_data[["user_edit_count", "user_warns"]], train_data["isvandalism"])
y_pred = xgb_reg1.predict(trial_data[["user_edit_count", "user_warns"]])
print(confusion_matrix(trial_data["isvandalism"], y_pred))

[[2686  587]
 [ 221 2878]]


In [None]:
from collections import defaultdict

vandalism_words_count = defaultdict(int)
constructive_words_count = defaultdict(int)


def get_difference(added_line, deleted_line):
    added_words = set(re.sub(r"[^\w\s]", " ", str(added_line)).lower().split())
    deleted_words = set(re.sub(r"[^\w\s]", " ", str(deleted_line)).lower().split())
    return added_words - deleted_words


def store_words(added_line, deleted_line, is_vandalism):
    new_words = get_difference(added_line, deleted_line)
    for word in new_words:
        if is_vandalism:
            vandalism_words_count[word] += 1
        else:
            constructive_words_count[word] += 1


for i in range(len(train_data)):
    store_words(
        str(train_data["added_lines"][i]),
        str(train_data["deleted_lines"][i]),
        train_data["isvandalism"][i],
    )

In [None]:
# Combine all unique words
all_words = set(vandalism_words_count) | set(constructive_words_count)

# Compute smoothed probabilities
word_probs = {
    word: (vandalism_words_count[word] + 1)
    / (vandalism_words_count[word] + constructive_words_count[word] + 2)
    for word in all_words
}

# Write to CSV
with open(
    "/Users/robin/Documents/GitHub/Cluebot/word_prob.csv", "w", newline=""
) as file_3:
    writer = csv.writer(file_3)
    writer.writerow(["word", "probability"])
    writer.writerows(word_probs.items())

# Print highly suspicious words
for word, prob in word_probs.items():
    if prob > 0.9:
        print(word, prob)

shes 0.9444444444444444
weed 0.9375
gay 0.974477958236659
idiot 0.9696969696969697
alot 0.9487179487179487
hated 0.9166666666666666
ya 0.9122807017543859
bastard 0.9473684210526315
boo 0.9090909090909091
noob 0.9444444444444444
eat 0.912621359223301
crack 0.9090909090909091
fucking 0.9856115107913669
sperm 0.9333333333333333
cant 0.9545454545454546
bunch 0.9047619047619048
yall 0.9411764705882353
butt 0.9888888888888889
willy 0.9333333333333333
bla 0.9090909090909091
douchebag 0.9411764705882353
ass 0.9896907216494846
blah 0.9642857142857143
mexicans 0.9090909090909091
yo 0.9420289855072463
cake 0.9166666666666666
fags 0.9333333333333333
crap 0.9836065573770492
boobs 0.92
fucker 0.9411764705882353
cunts 0.9166666666666666
homosexuals 0.9375
slut 0.9444444444444444
nigger 0.9487179487179487
suck 0.9897435897435898
fat 0.9344262295081968
math 0.9102564102564102
fucks 0.9090909090909091
awesome 0.9702970297029703
infected 0.9090909090909091
goat 0.9285714285714286
pie 0.9423076923076923
h

In [961]:
def get_vandalism_score(words):
    probs_of_words = [word_probs.get(w, 0.5) for w in words]

    product_p = 1
    product_1_minus_p = 1
    for p in probs_of_words:
        product_p *= p
        product_1_minus_p *= 1 - p

    return (
        product_p / (product_p + product_1_minus_p)
        if (product_p + product_1_minus_p) != 0
        else 1
    )

In [977]:
vandalism_scores_train = [
    get_vandalism_score(
        get_difference(train_data["added_lines"][i], train_data["deleted_lines"][i])
    )
    for i in range(len(train_data))
]
vandalism_scores_trial = [
    get_vandalism_score(
        get_difference(trial_data["added_lines"][i], trial_data["deleted_lines"][i])
    )
    for i in range(len(trial_data))
]

train_data["vandalism_score"] = vandalism_scores_train
trial_data["vandalism_score"] = vandalism_scores_trial

train_data.to_csv("/Users/robin/Documents/GitHub/Cluebot/train_data.csv")
trial_data.to_csv("/Users/robin/Documents/GitHub/Cluebot/test_data.csv")

train_data.sample(5)

Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,prev_user,common,...,creator,num_recent_edits,num_recent_reversions,current_minor,current_timestamp,added_lines,previous_timestamp,deleted_lines,isvandalism,vandalism_score
6687,change,234267083,/* Style and interpretation */,70.133.78.45,2,2,0,20080826022601,Considerable powers,,...,Conversion script,0,0,False,1219717561,",Teehee!!",1219682834,,True,0.666667
25352,change,328116556,,216.137.138.62,8,3,0,20091126232849,Igoldste,,...,65.239.44.246,0,0,False,1259278129,"""Founded in 1909, the RMAC is the fifth oldest...",1254625830,"""Founded in 1909, the RMAC is the fourth oldes...",False,0.197531
6790,change,328034368,,121.218.96.253,13,5,0,20091126124223,Vernex3,,...,Rulesfan,0,0,False,1259239343,"= [[Collingwood Football Club]] (2010-),",1259235247,,False,0.018937
5861,change,236354212,/* Swamp pop */,68.212.42.88,3,2,1,20080905004958,66.186.249.57,,...,TUF-KAT,0,0,False,1220575798,"""Swamp pop came about in the mid 1950s. With t...",1217307900,"""Swamp pop came about in the mid 1950s. With t...",True,0.999913
6775,change,327799842,ce,Binksternet,408337,149462,158,1185652797,Binksternet,,...,Qwitchibo,0,0,False,1259121194,"""Almost as soon as they were invented, planes ...",1259121009,"""Almost as soon as they were invented, planes ...",False,0.5


In [981]:
xgb_reg1 = xgboost.XGBClassifier(learning_rate=0.1, max_depth=15, n_estimators=10)
xgb_reg1.fit(train_data[["vandalism_score"]], train_data["isvandalism"])
y_pred = xgb_reg1.predict(trial_data[["vandalism_score"]])
print(confusion_matrix(trial_data["isvandalism"], y_pred))

[[3070  203]
 [ 969 2130]]


In [982]:
xgb_reg1 = xgboost.XGBClassifier(learning_rate=0.1, max_depth=15, n_estimators=10)
xgb_reg1.fit(
    train_data[["user_edit_count", "user_warns", "vandalism_score"]],
    train_data["isvandalism"],
)
y_pred = xgb_reg1.predict(
    trial_data[["user_edit_count", "user_warns", "vandalism_score"]]
)
print(confusion_matrix(trial_data["isvandalism"], y_pred))

[[3095  178]
 [ 471 2628]]
