In [38]:
# PARAMETERS
percent_anomalous_users = 0.03
percent_anomalous_forums = 0.03

In [39]:
import json
def subset_data(filename, output_file, length):
    output = []
    for i, line in enumerate(open(filename)):
        if i >= length:
            break
        line = json.loads(line)
        line["label"] = 0
        if line["author"] == "[deleted]" or len(line["body"]) > 500:
            length += 1
            continue
        output.append(line)
    with open(output_file, "w") as f:
        for line in output:
            f.write(json.dumps(line) + "\n")
    print("Subsetting Done")
    print("Number of lines: ", len(output))

In [40]:
import json

def get_user_data(input_file_path, output_file_path):
    user_data = {}
    # Example of a line in the input file: author: [(subreddit, body, label), (subreddit, body, label), ...]
    with open(input_file_path, "r") as file:
        for i, line in enumerate(file):
            line_dict = json.loads(line)
            author = line_dict["author"]
            if author not in user_data:
                user_data[author] = []
            user_data[author].append((line_dict["subreddit"], line_dict["body"], line_dict["label"]))
    with open(output_file_path, "w") as file:
        json.dump(user_data, file, indent = 4)
    print("Preprocessing complete")
    print("Number of users: " + str(len(user_data.keys())))
    return user_data

def get_subreddit_data(input_file_path, output_file_path):
    subreddit_data = {}
    # Example of a line in the input file: subreddit: [(author, body, label), (author, body, label), ...]
    with open(input_file_path, "r") as file:
        for i, line in enumerate(file):
            line_dict = json.loads(line)
            subreddit = line_dict["subreddit"]
            if subreddit not in subreddit_data:
                subreddit_data[subreddit] = []
            subreddit_data[subreddit].append((line_dict["author"], line_dict["body"], line_dict["label"]))
    with open(output_file_path, "w") as file:
        json.dump(subreddit_data, file, indent = 4)
    print("Preprocessing complete")
    print("Number of subreddits: " + str(len(subreddit_data.keys())))
    return subreddit_data


In [41]:
subset_data("data\Reddit\RC_2015-01", "data\Reddit\Reddit_subset.json", 30000)

Subsetting Done
Number of lines:  30000


In [42]:
clean_user_data = get_user_data("data\Reddit\Reddit_subset.json", "data\Reddit\Reddit_user_data.json")

Preprocessing complete
Number of users: 20046


In [43]:
subreddit_data = get_subreddit_data("data\Reddit\Reddit_subset.json", "data\Reddit\Reddit_subreddit_data.json")

Preprocessing complete
Number of subreddits: 3055


In [44]:
import random
# Randomly select anomalous users and subreddits
scammers = random.sample(clean_user_data.keys(), int(len(clean_user_data.keys()) * percent_anomalous_users))
scam_subreddits = random.sample(subreddit_data.keys(), 1 + int(len(subreddit_data.keys()) * percent_anomalous_forums))

since Python 3.9 and will be removed in a subsequent version.
  scammers = random.sample(clean_user_data.keys(), int(len(clean_user_data.keys()) * percent_anomalous_users))
since Python 3.9 and will be removed in a subsequent version.
  scam_subreddits = random.sample(subreddit_data.keys(), 1 + int(len(subreddit_data.keys()) * percent_anomalous_forums))


In [45]:
import pandas as pd
email_spam = pd.read_csv("data\Reddit\processed_data.csv")
# Assuming the data is stored in a DataFrame called "df"
email_spam = email_spam[email_spam['message'].apply(lambda x: isinstance(x, str))]

anomalous_data = email_spam[email_spam["label"] == 1]
anomalous_data = anomalous_data["message"].values.tolist()
anomalous_data_filtered = []
for i, message in enumerate(anomalous_data):
    if len(message) < 500 and len(message) > 5:
        anomalous_data_filtered.append(message)
print("Number of anomalous data: ", len(anomalous_data_filtered))

import random
def inject_anomalies(clean_user_data, anomaly_data, scammers, scam_subreddits):
    injected = 0
    inject_user_data = {}
    for user in clean_user_data:
        for i, _ in enumerate(clean_user_data[user]):
            if user not in inject_user_data:
                inject_user_data[user] = []
            if user in scammers:
                inject_user_data[user].append((random.choice(scam_subreddits), # Subreddit
                        anomaly_data.pop(), # Body
                        1 # Label
                        ))
            else:
                inject_user_data[user].append(clean_user_data[user][i])
            injected += 1
    print("Anomalies injected")
    print("Number of anomalies injected: ", injected)
    return inject_user_data





Number of anomalous data:  9041


In [46]:
user_data = inject_anomalies(clean_user_data, anomalous_data_filtered, scammers, scam_subreddits)
import networkx as nx
def create_gml(data, output_file):
    G = nx.Graph()
    for user, post in data.items():
        for edge in post:
            G.add_edge(user, edge[0], attr=edge[2])
    nx.write_edgelist(G, "data/Reddit/edgelist_attr.txt")
    G = nx.bipartite.read_edgelist("data/Reddit/edgelist_attr.txt", create_using=nx.DiGraph)
    nx.write_gml(G, output_file)
    print("GML file created")

create_gml(clean_user_data, "data\Reddit\Reddit_clean.gml")
create_gml(user_data, "data\Reddit\Reddit.gml")

Anomalies injected
Number of anomalies injected:  30000
GML file created
GML file created


In [47]:
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# # Filter messages labeled 0 and 1
# messages_0 = data[data['label'] == 0]['body']
# messages_1 = data[data['label'] == 1]['body']

# # Concatenate all messages into a single string
# text_0 = ' '.join(messages_0)
# text_1 = ' '.join(messages_1)

# # Generate word clouds for each label
# wordcloud_0 = WordCloud(background_color = "white").generate(text_0)
# wordcloud_1 = WordCloud(background_color = "white").generate(text_1)

# # Plot the word clouds
# fig, axes = plt.subplots(1, 2, figsize=(10, 5))
# axes[0].imshow(wordcloud_0, interpolation='bilinear')
# axes[0].set_title('Normal Text')
# axes[0].axis('off')

# axes[1].imshow(wordcloud_1, interpolation='bilinear')
# axes[1].set_title('Anomalous Text')
# axes[1].axis('off')

# # Assuming you have a plot named 'fig'
# plt.savefig('wordcloud.png')


In [48]:
# def write_edglist(user_data, output_file):
#     subreddit_list = []
#     edgelist = []
#     with open(output_file, "w") as f:
#         for user_id, (_, post) in enumerate(user_data.items()):
#             for edge in post:
#                 if edge[0] not in subreddit_list:
#                     subreddit_list.append(edge[0])
#                 reddit_id = subreddit_list.index(edge[0])
#                 edgelist.append((user_id, reddit_id))
#         json.dump(edgelist, f, indent = 4)
#     print("Edgelist file created")
# write_edglist(user_data, "data\Reddit\edgelist.json")

In [49]:
import json
def create_hypergraph_edgelist(user_data):
    hypergraph = {}
    for user_id, posts in enumerate(user_data.values()):
        for subreddit, _, _ in posts:
            if subreddit not in hypergraph.keys():
                hypergraph[subreddit] = []
            hypergraph[subreddit].append(user_id)
    with open("data/Reddit/hypergraph.json", "w") as f:
        json.dump(list(hypergraph.values()), f, indent = 4)
    print("Hypergraph file created")
create_hypergraph_edgelist(user_data)

Hypergraph file created


In [50]:
def get_labels(user_data):
    labels = []
    for user, posts in user_data.items():
        _, _, label = posts[0]
        labels.append(label)
    return labels
import json
json.dump(get_labels(user_data), open("data/Reddit/labels.json", "w"), indent = 4)

In [51]:
import string
import time
import nltk
from nltk.corpus import stopwords
    
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]
    
    # Join the words back into a single string
    cleaned_text = ' '.join(words)
    
    return cleaned_text

def get_all_text(user_data, output_file_path):
    all_text = []
    all_text_labeled = pd.DataFrame(columns = ["v1", "v2"])
    for user, posts in user_data.items():
        label = posts[0][2]
        text = ""
        for (_, body, label) in posts:
            text += body + " "
        text = clean_text(text)
        all_text.append(text)
        all_text_labeled = pd.concat([all_text_labeled, pd.DataFrame({"v1": [label], "v2": [text]})])
    with open(output_file_path, "w") as f:
        json.dump(all_text, f, indent = 4)
    with open("data/Reddit/all_text_labeled.csv", "w", encoding="utf-8") as f:
        all_text_labeled.to_csv(f, index = False)

get_all_text(user_data, "data/Reddit/all_text.json")

In [52]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(input_file_path, output_file_path):
    data = open(input_file_path, "r", encoding="utf-8")
    data = json.load(data)

    vectorizer = CountVectorizer(ngram_range=(1,1))
    vectorizer.fit(data)
    
    bow = vectorizer.transform(data)
    
    with open(output_file_path, 'wb') as file:
        pickle.dump(bow, file)
    return bow

bow = bag_of_words("data/Reddit/all_text.json", "data/Reddit/features.pkl")

In [53]:
def get_train_mask(input_file_path, output_file_path):
    data = open(input_file_path, "r")
    data = json.load(data)

    train_mask = []
    for i, _ in enumerate(data):
        if i % 6 == 0 or i % 6 == 1 or i % 6 == 2 or i % 6 == 3:
            train_mask.append(1)
        else:
            train_mask.append(0)
    output_file = open(output_file_path, "w")
    json.dump(train_mask, output_file, indent = 4)

def get_val_mask(input_file_path, output_file_path):
    data = open(input_file_path, "r")
    data = json.load(data)

    val_mask = []
    for i, user in enumerate(data):
        if i % 6 == 4:
            val_mask.append(1)
        else:
            val_mask.append(0)
    output_file = open(output_file_path, "w")
    json.dump(val_mask, output_file, indent = 4)

def get_test_mask(input_file_path, output_file_path):
    data = open(input_file_path, "r")
    data = json.load(data)

    test_mask = []
    for i, user in enumerate(data):
        if i % 6 == 5:
            test_mask.append(1)
        else:
            test_mask.append(0)
    output_file = open(output_file_path, "w")
    json.dump(test_mask, output_file, indent = 4)

In [54]:
def get_labels(input_file_path, output_file_path):
    data = open(input_file_path, "r")
    data = json.load(data)
    labels = []
    for label in data.values():
        labels.append(label)
    output_file = open(output_file_path, "w")
    json.dump(labels, output_file, indent = 4)
# import dhg
# import json
# def get_labels(input_file_path, output_file_path):
#     edgelist = json.load(open(input_file_path, "r"))
#     G = dhg.BiGraph(len(user_data), len(subreddit_data), edgelist)
#     H = dhg.Hypergraph.from_bigraph(G, U_as_vertex=True)
#     H.draw()

# get_labels("data/Reddit/edgelist.json", "data/Reddit/labels.json")

In [55]:
get_test_mask("data/Reddit/labels.json", "data/Reddit/test_mask.json")
get_val_mask("data/Reddit/labels.json", "data/Reddit/val_mask.json")
get_train_mask("data/Reddit/labels.json", "data/Reddit/train_mask.json")

In [56]:
import pickle
import hashlib

def json_to_pkl(input_file_path, output_file_path):
    with open(input_file_path, "r") as file:
        data = json.load(file)
    with open(output_file_path, "wb") as file:
        pickle.dump(data, file)

def hash(file_name):
    md5_dict = {}
    with open(file_name, "rb") as f:
        data = f.read()
        md5_hash = hashlib.md5(data).hexdigest()
        md5_dict[file_name] = md5_hash
    print("MD5 hashes:")
    for file_name, md5_hash in md5_dict.items():
        print(file_name, ":", md5_hash)

json_to_pkl("data/Reddit/labels.json", "data/Reddit/labels.pkl")
json_to_pkl("data/Reddit/hypergraph.json", "data/Reddit/edgelist.pkl")
json_to_pkl("data/Reddit/train_mask.json", "data/Reddit/train_mask.pkl")
json_to_pkl("data/Reddit/val_mask.json", "data/Reddit/val_mask.pkl")
json_to_pkl("data/Reddit/test_mask.json", "data/Reddit/test_mask.pkl")

hash("data/Reddit/features.pkl")
hash("data/Reddit/edgelist.pkl")
hash("data/Reddit/labels.pkl")
hash("data/Reddit/train_mask.pkl")
hash("data/Reddit/val_mask.pkl")
hash("data/Reddit/test_mask.pkl")


num_classes = 2
labels = json.load(open("data/Reddit/labels.json"))
num_vertices = len(labels)
edge_list = json.load(open("data/Reddit/hypergraph.json"))
num_edges = len(edge_list)
dim_features = bow.shape[1]

print("num_classes:", num_classes)
print("num_vertices:", num_vertices)
print("num_edges:", num_edges)
print("dim_features:", dim_features)

MD5 hashes:
data/Reddit/features.pkl : 1112fc3984449922d719056492acca20
MD5 hashes:
data/Reddit/edgelist.pkl : e025fa72919d104738b9e68f0e6cb26f
MD5 hashes:
data/Reddit/labels.pkl : b4dd2b0082659821330ed9d681f52549
MD5 hashes:
data/Reddit/train_mask.pkl : fedd0650f731ac39b2e09d7a13b951f9
MD5 hashes:
data/Reddit/val_mask.pkl : e24dd1c41d9a425144f17b6a64b3acc8
MD5 hashes:
data/Reddit/test_mask.pkl : b1fe02b4bc6a4991bff77aebba28961f
num_classes: 2
num_vertices: 20046
num_edges: 3017
dim_features: 38820
