# Cluebot - Data Gathering and Feature Engineering

- Use Wikipedia API to obtain the difference of each article before and after each edit
- Feature engineering: vandalism score by applying Bayes' Theorem to text difference
- Feature engineering: whether the editor is an IP or an account
- Feature engineering: account age at the time of edit

In [None]:
# Fix Cluebot data: use Wikipedia API to obtain the difference of each article before and after each edit
import xml.etree.ElementTree as ET
import requests

tree = ET.parse('/Users/robin/Documents/GitHub/Cluebot/trial-edits.xml') # train-edits.xml, bayes-edits.xml, trial-edits.xml
root = tree.getroot()

edit_ids = [int(edit[1].text) for edit in root]
print(edit_ids[:5])

base_url = "https://en.wikipedia.org/w/api.php"

def fetch_revision_text(revid):
    try:
        params = {
            "action": "query",
            "format": "json",
            "prop": "revisions",
            "revids": revid,
            "rvprop": "ids|content",
            "rvslots": "main"
        }

        response = requests.get(base_url, params=params)
        data = response.json()

        if 'badrevids' in data.get('query', {}):
            print(f"Bad revision ID: {revid}")
            return None, None

        page = next(iter(data.get('query', {}).get('pages', {}).values()), {})
        revision = page.get('revisions', [{}])[0]
        content = revision.get('slots', {}).get('main', {}).get('*')
        parent_id = revision.get('parentid')

        return content, parent_id

    except Exception as e:
        print(f"Error fetching revision {revid}: {e}")
        return None, None

with open('/Users/robin/Documents/GitHub/Cluebot/added_lines.txt', 'a', newline='') as file_1, \
     open('/Users/robin/Documents/GitHub/Cluebot/deleted_lines.txt', 'a', newline='') as file_2: 
    
    # added_lines_bayes.txt, added_lines_trial.txt, deleted_lines_bayes.txt, deleted_lines_trial.txt

    writer_1 = csv.writer(file_1)
    writer_2 = csv.writer(file_2)

    for i in range(5): # len(root)
        edit_id = edit_ids[i]

        current_text, parent_id = fetch_revision_text(edit_id)
        if current_text is None or parent_id is None:
            writer_1.writerow(["BAD REQUEST"])
            writer_2.writerow(["BAD REQUEST"])
            continue

        parent_text, _ = fetch_revision_text(parent_id)
        if parent_text is None:
            writer_1.writerow(["BAD REQUEST"])
            writer_2.writerow(["BAD REQUEST"])
            continue

        diff = list(difflib.unified_diff(
            parent_text.splitlines(),
            current_text.splitlines(),
            fromfile='before',
            tofile='after',
            lineterm=''
        ))

        added_lines = [line[1:] for line in diff if line.startswith('+') and not line.startswith('+++')]
        deleted_lines = [line[1:] for line in diff if line.startswith('-') and not line.startswith('---')]
        
        writer_1.writerow(added_lines)
        writer_2.writerow(deleted_lines)
    

In [None]:
# Combine all Cluebot data and text difference data into one csv file
import pandas as pd
import csv

edits = pd.read_xml('/Users/robin/Documents/GitHub/Cluebot/trial-edits.xml')
edits.sample(5)

# Some features are hidden under the 'common', 'current', 'previous' domains and have to be extracted manually
edits = edits.drop(['common', 'current', 'previous'], axis = 1)
edits['page_made_time'] = [edit[10][0].text for edit in root]
edits['title'] = [edit[10][1].text for edit in root]
edits['namespace'] = [edit[10][2].text for edit in root]
edits['creator'] = [edit[10][3].text for edit in root]
edits['num_recent_edits'] = [edit[10][4].text for edit in root]
edits['num_recent_reversions'] = [edit[10][5].text for edit in root]
edits['current_minor'] = [edit[11][0].text for edit in root]
edits['current_timestamp'] = [edit[11][1].text for edit in root]

with open('/Users/robin/Documents/GitHub/Cluebot/added_lines_trial.txt', 'r', encoding='utf-8') as f:
    edits['added_lines'] = [line.strip() for line in f]

edits['previous_timestamp'] = [edit[12][0].text for edit in root]

with open('/Users/robin/Documents/GitHub/Cluebot/deleted_lines_trial.txt', 'r', encoding='utf-8') as f:
    edits['deleted_lines'] = [line.strip() for line in f]

edits.to_csv('/Users/robin/Documents/GitHub/Cluebot/output_trial.csv', index=False)

In [None]:
# Combine all three csv files
df_train = pd.read_csv('/Users/robin/Documents/GitHub/Cluebot/output.csv')
df_bayes = pd.read_csv('/Users/robin/Documents/GitHub/Cluebot/output_bayes.csv')
df_trial = pd.read_csv('/Users/robin/Documents/GitHub/Cluebot/output_trial.csv')

df_combined = pd.concat([df_train, df_bayes, df_trial], ignore_index=True)
df_combined.to_csv('/Users/robin/Documents/GitHub/Cluebot/output_combined.csv')
df_combined.sample(5)

Unnamed: 0,EditType,EditID,comment,isvandalism,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,prev_user,...,creator,num_recent_edits,num_recent_reversions,current_minor,current_timestamp,added_lines,previous_timestamp,deleted_lines,account_age,is_IP
139,change,327252369,/* Reception */,False,Hunter Kahn,33583,0,0,1203574035,Hunter Kahn,...,Hunter Kahn,0,0,False,1258875399,"""In its original American broadcast on Novembe...",1258874977,"""In its original American broadcast on Novembe...",,
24171,change,250751912,/* Culture */,True,173.32.167.157,2,2,0,20081109233114,JohnCD,...,Infrogmation,0,0,False,1226273474,,1226260374,"==Culture==,===Suffrage===,{{main|Women's Suff...",,
7537,change,328467806,/* Bowl eligible teams in the 2009 season */,False,173.76.5.112,207,33,0,20091128230224,173.76.5.112,...,Greasypacifier,1,0,False,1259449344,*'''Current total number of teams with at leas...,1259449329,*'''Current total number of teams with at leas...,,
30471,change,327179982,/* References */,False,122.169.47.246,6,4,0,20091121220354,12.156.48.122,...,68.106.36.7,0,0,False,1258841034,www.udaipurlive.com,1258651207,,,
2562,change,327619423,/* Healers */,False,NightBear,167,94,0,1201197383,NightBear,...,Conversion script,0,0,True,1259044211,"""* [[Druid (character class)|Druid]]: A priest...",1259043958,"""* [[Druid (character class)|Druid]]: A priest...",,


In [None]:
# Train-test split
from sklearn.model_selection import train_test_split

X = df_combined.drop('isvandalism', axis=1)
y = df_combined['isvandalism']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

train_data = X_train
train_data['isvandalism'] = y_train
train_data.to_csv('train_data.csv', index=False)

test_data = X_test
test_data['isvandalism'] = y_test
test_data.to_csv('test_data.csv', index=False)

In [4]:
train_data = pd.read_csv('/Users/robin/Documents/GitHub/Cluebot/train_data.csv')
test_data = pd.read_csv('/Users/robin/Documents/GitHub/Cluebot/test_data.csv')

In [None]:
# Feature engineering - Vandalism score using Bayes' Theorem

import re

# Count the number of appearances of each word in vandalism/constructive edits
from collections import defaultdict
vandalism_words_count = defaultdict(int)
constructive_words_count = defaultdict(int)

def get_difference(s1, s2):
    w1 = set(re.sub(r"[^\w\s]", " ", s1).lower().split())
    w2 = set(re.sub(r"[^\w\s]", " ", s1).lower().split())
    return w1 - w2

for i in range(len(train_data)):
   for word in get_difference(str(train_data['added_lines'][i]), str(train_data['deleted_lines'][i])):
        if train_data['isvandalism'][i] == True:
            vandalism_words_count[word] += 1
        else:
            constructive_words_count[word] += 1

# Combine all unique words
all_words = set(vandalism_words_count) | set(constructive_words_count)

# Compute smoothed probabilities
word_probs = {
    word: (vandalism_words_count[word] + 1) / 
          (vandalism_words_count[word] + constructive_words_count[word] + 2)
    for word in all_words
}

# Write word probabilities to CSV
with open('/Users/robin/Documents/GitHub/Cluebot/word_prob.csv', 'w', newline='') as file_3:
    writer = csv.writer(file_3)
    writer.writerow(['word', 'probability'])
    writer.writerows(word_probs.items())

# Print highly suspicious words
for word, prob in word_probs.items():
    if prob > 0.9:
        print(word, prob)

# Compute vandalism score for each edit using the Bayes Theorem
def get_vandalism_score(words):
    probs_of_words = [word_probs.get(w, 0.5) for w in words]

    product_p = 1
    product_1_minus_p = 1
    for p in probs_of_words:
        product_p *= p
        product_1_minus_p *= (1-p)
    
    return product_p / (product_p + product_1_minus_p) if (product_p + product_1_minus_p) != 0 else 1

vandalism_scores_train = [get_vandalism_score(get_difference(train_data['added_lines'][i], train_data['deleted_lines'][i])) for i in range(len(train_data))]
vandalism_scores_test = [get_vandalism_score(get_difference(test_data['added_lines'][i], test_data['deleted_lines'][i])) for i in range(len(test_data))]

train_data['vandalism_score'] = vandalism_scores_train
test_data['vandalism_score'] = vandalism_scores_test

train_data.to_csv('/Users/robin/Documents/GitHub/Cluebot/train_data.csv')
test_data.to_csv('/Users/robin/Documents/GitHub/Cluebot/test_data.csv')

train_data.sample(5)


In [8]:
# Feature engineering - The age of account at the time of edit
from datetime import datetime, timezone

def account_age(user_reg_time, current_time):
    if len(user_reg_time) > 10: # this is to exclude edits made by anonymous users
        account_age_days = 1
    else:
        reg_time = datetime.fromtimestamp(int(user_reg_time), tz=timezone.utc)
        edit_time = datetime.fromtimestamp(int(current_time), tz=timezone.utc)
        account_age_days = (edit_time - reg_time).days
    return account_age_days

account_age_train = [account_age(str(train_data['user_reg_time'][i]), train_data['current_timestamp'][i]) for i in range(len(train_data))]
account_age_test = [account_age(str(test_data['user_reg_time'][i]), test_data['current_timestamp'][i]) for i in range(len(test_data))]

train_data['account_age'] = account_age_train
test_data['account_age'] = account_age_test

train_data.to_csv('/Users/robin/Documents/GitHub/Cluebot/train_data.csv')
test_data.to_csv('/Users/robin/Documents/GitHub/Cluebot/test_data.csv')

train_data.sample(5)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,...,num_recent_edits,num_recent_reversions,current_minor,current_timestamp,added_lines,previous_timestamp,deleted_lines,isvandalism,vandalism_score,account_age
15844,15844,15844,change,327003222,/* Education */,AgarwalSumeet,1464,795,0,1159158738,...,0,0,True,1258754612,"""He studied BE from [[NED University]], [[Kara...",1258046948,"""He studied BE from [[NED University]], [[Kara...",False,0.5,1152
6950,6950,6950,change,253204535,/* References */ http://www.soitiz.info/ soiti...,89.194.5.235,3,2,1,20081121154848,...,0,0,False,1227282528,http://www.soitiz.info/ soitiz techno website,1227282352,,True,0.01347755,1
6196,6196,6196,change,238849716,/* Early years */,208.152.32.185,71,43,0,20080916175720,...,0,0,False,1221587840,"""Jackson was regarded as one of the top wide r...",1221576703,"""Jackson was regarded as one of the top wide r...",True,0.9380732,1
20058,20058,20058,change,327609512,Removed category [[:Category:16th-century mona...,Vanished user ewfisn2348tui2f8n2fio2utjfeoi210...,45314,0,1,1181658039,...,0,0,True,1259038657,",[[Category:16th-century Mongolian monarchs]]",1248008358,[[Category:16th-century monarchs]],False,0.25,895
19867,19867,19867,change,329296179,,Tfranklnathens,49,1,0,1251296626,...,0,0,False,1259782693,"""* [http://www.ilsussidiario.net/News/Politics...",1255841891,,False,8.806094e-11,98


In [10]:
# Feature engineering - Whether the edit is made by a registered user or an anonymous IP
import ipaddress

def is_IP(user):
    try:
        ipaddress.ip_address(user)
        return True
    except ValueError:
        return False

is_IP_train = [is_IP(train_data['user'][i]) for i in range(len(train_data))]
is_IP_test = [is_IP(test_data['user'][i]) for i in range(len(test_data))]

train_data['is_IP'] = is_IP_train
test_data['is_IP'] = is_IP_test

train_data.to_csv('/Users/robin/Documents/GitHub/Cluebot/train_data.csv')
test_data.to_csv('/Users/robin/Documents/GitHub/Cluebot/test_data.csv')

train_data.sample(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,...,num_recent_reversions,current_minor,current_timestamp,added_lines,previous_timestamp,deleted_lines,isvandalism,vandalism_score,account_age,is_IP
20310,20310,20310,change,329271775,,12.127.238.46,26,6,2,20091202171702,...,0,False,1259774222,"| name = Johnson Wayne ,| image = John Wayne i...",1259719294,"| name = Bonnie and Clyde ,| image = Bonniecly...",True,0.2630537,1,True
11057,11057,11057,change,328682752,/* References */ +Category:Food-related lists,Takeaway,18258,5387,3,1191542819,...,0,False,1259536405,[[Category:Food-related lists]],1259190804,,False,0.001492598,786,False
20864,20864,20864,change,328059783,,Midway,12001,5854,0,1128264859,...,0,False,1259252677,"""The ruling [[SWAPO]] party set a goal of cont...",1258998927,"""The ruling [[SWAPO]] party set a goal of cont...",False,3.354993e-14,1516,False
2951,2951,2951,change,328254935,"""rock and roll"" and ""rock music"" are two diffe...",Doc Strange,30877,0,1,1129029667,...,0,False,1259351496,"""Shooter Jennings lived his first few years in...",1259337950,"""Shooter Jennings lived his first few years in...",False,0.2897196,1508,False
20788,20788,20788,change,329947745,,Stunningmalik,35,9,0,1142864401,...,0,False,1260052457,"| editor = [[Rashed Rahman]],""The...",1259334193,"| editor = [[Najam Sethi]],""The '...",False,1.055737e-08,1356,False
