In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import json 
from pandas.io.json import json_normalize
import pandas as pd
import pickle as pk

from nltk.tokenize import word_tokenize
import statistics
import seaborn as sns
from gensim.models import Word2Vec

# Import the required libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
def get_data_path(data_dir):
    """Locate the train / validation / test files inside ``data_dir``.

    A directory entry is assigned to a split when its name contains the
    substring ``"_train_"``, ``"_val_"`` or ``"_test_"`` (checked in that
    order; the first marker found in a name decides its split).

    Args:
        data_dir: Directory that holds the three split files.

    Returns:
        Tuple ``(train_path, val_path, test_path)``; each element is
        ``os.path.join(data_dir, <entry name>)``.

    Raises:
        FileNotFoundError: If at least one split has no matching entry.
    """
    markers = {"train": "_train_", "val": "_val_", "test": "_test_"}
    found = {}

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # selected file deterministic when several names match the same marker.
    for name in sorted(os.listdir(data_dir)):
        for split, marker in markers.items():
            if marker in name:
                found[split] = os.path.join(data_dir, name)
                break

    missing = [split for split in markers if split not in found]
    if missing:
        raise FileNotFoundError(
            "No file for split(s) {} found in {!r}".format(", ".join(missing), data_dir)
        )
    return found["train"], found["val"], found["test"]

In [None]:
import nltk
import re
def preprocess(text):
    """Normalise a comment and return it as a POS-tagged string.

    Steps, in order: remove URLs, replace emoticons and line breaks with the
    placeholder words ``emoji`` / ``newline``, strip the remaining
    punctuation, collapse whitespace, then tokenise and POS-tag with NLTK.

    Args:
        text: Comment text (a ``str``).

    Returns:
        The tagged tokens joined by single spaces, each rendered by
        ``nltk.tag.util.tuple2str`` (``word/TAG``).
    """
    text = re.sub(r'http\S+', '', text)  # Remove website links

    # The placeholders have to be inserted BEFORE punctuation is stripped and
    # whitespace is collapsed; afterwards no ':' / ')' / '\n' is left to match.
    # They are padded with spaces so they become separate tokens.
    # The pattern only knows an upper-case 'D' (":D"); input that was
    # lower-cased beforehand will not match that particular emoticon.
    text = re.sub(r"[:;=][)D]|[(][=:;]", ' emoji ', text)
    text = re.sub(r'\n+', ' newline ', text)

    # Strips every character that is neither a word character nor whitespace,
    # which already covers symbols such as @ $ % ^ & * ( ) and backslash.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)

    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    return " ".join(nltk.tag.util.tuple2str(pair) for pair in tagged)

In [None]:
def getDataJSON(route):
    """Read a JSON Lines file (one JSON document per line).

    Args:
        route: Path of the UTF-8 encoded file.

    Returns:
        List with the decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    result = []
    with open(route, "r", encoding="utf-8") as f:
        # Iterate over the file instead of f.read().splitlines():
        # str.splitlines() also breaks on characters such as U+2028, \x0b and
        # \x0c, which may legally appear inside a JSON string and would cut a
        # record in half.
        for line in f:
            if line.strip():  # tolerate blank / trailing empty lines
                result.append(json.loads(line))
    return result

In [None]:
def get_data(path, ngram_range=(1,2)):
    """Load one JSON Lines split into a DataFrame and add a ``'process'`` column.

    Args:
        path: Path of the split file, passed to ``getDataJSON``.
        ngram_range: Accepted but not used by this function.

    Returns:
        DataFrame of the loaded records with an extra ``'process'`` column
        holding ``preprocess`` applied to the lower-cased ``'comment'`` text.
    """
    records = getDataJSON(path)
    frame = pd.DataFrame(records)
    lowered = frame['comment'].str.lower()
    frame['process'] = lowered.apply(preprocess)
    return frame

In [None]:
# Dataset folder (relative path). get_data_path picks the split files from it
# by the "_train_", "_val_" and "_test_" substrings in their names.
data_dir = 'darkreddit_authorship_attribution_anon'

train_path, val_path, test_path = get_data_path(data_dir)

In [None]:
# Load and preprocess each split into its own DataFrame (train, then val, then test).
train, val, test = (
    get_data(split_path) for split_path in (train_path, val_path, test_path)
)

In [None]:
# Stack the three splits into a single frame. ignore_index=True gives every
# row a unique 0..n-1 label; without it each split keeps its own index and the
# same labels appear once per split.
data = pd.concat([train, val, test], axis=0, ignore_index=True)

In [None]:
# Peek at the first raw comment. Positional access is used because `data` is
# a concatenation of three frames, so the label 0 may occur more than once and
# a label lookup would return several rows instead of one string.
data['comment'].iloc[0]

In [None]:
# Define a function for converting text to vectors
def vectorize_text(text_data):
    """Turn raw documents into a bag-of-words count matrix.

    Args:
        text_data: Iterable of document strings.

    Returns:
        Tuple ``(matrix, vectorizer)``: the result of
        ``CountVectorizer(stop_words='english').fit_transform(text_data)``
        and the fitted vectorizer itself.
    """
    bow = CountVectorizer(stop_words='english')
    return bow.fit_transform(text_data), bow
# def vectorize_text(text_data):
#     vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=10000,stop_words='english')
#     vectorized = vectorizer.fit_transform(text_data)
#     tfidf_transformer = TfidfTransformer()
#     vectorized = tfidf_transformer.fit_transform(vectorized)
#     return vectorized, tfidf_transformer

In [None]:
# Define a function to run LDA and output clustering results
def run_lda(text_data, n_topics, random_state=0, return_keywords=False):
    """Fit an LDA topic model and assign every document to its strongest topic.

    Args:
        text_data: Iterable of document strings (vectorised by ``vectorize_text``).
        n_topics: Number of LDA components.
        random_state: Seed forwarded to ``LatentDirichletAllocation`` so that
            repeated runs produce the same clusters.
        return_keywords: If true, also return the top keywords of each topic.

    Returns:
        ``clusters``: array with, per document, the index of the topic with
        the highest weight. When ``return_keywords`` is true, the tuple
        ``(clusters, topic_keywords)`` where ``topic_keywords[k]`` lists the
        9 highest-weighted vocabulary terms of topic ``k``.
    """
    vectorized, vectorizer = vectorize_text(text_data)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)

    # fit_transform already returns the document-topic matrix; reuse it rather
    # than discarding it and calling transform() a second time.
    doc_topics = lda.fit_transform(vectorized)
    clusters = np.argmax(doc_topics, axis=1)
    if not return_keywords:
        return clusters

    topic_words = vectorizer.get_feature_names_out()
    topic_keywords = [
        # argsort on the negated weights gives indices in descending weight order
        [topic_words[i] for i in (-topic_weights).argsort()[:9]]
        for topic_weights in lda.components_
    ]
    return clusters, topic_keywords


In [None]:
# Cluster the raw comments into topics and store each row's topic id.
n_topics = 9
data['clusters'] = run_lda(data['comment'], n_topics)
print(data['clusters'])

In [None]:
# Display the combined frame, which now carries the 'clusters' column.
data

In [None]:
# One sub-frame per cluster id 0..8: df_k holds the rows whose 'clusters' value is k.
(df_0, df_1, df_2, df_3, df_4,
 df_5, df_6, df_7, df_8) = (
    data[data['clusters'] == cluster_id] for cluster_id in range(9)
)

In [None]:
# Author histogram for cluster id 0 (figure titles are 1-based), saved as 1.png.
ax0 = sns.countplot(x="author", data=df_0)
plt.title("Author Distribution of Cluster 1 ",fontsize=16)
plt.xlabel("Author",fontsize=16)
plt.ylabel("Count",fontsize=16)
# Adjust the layout before writing the file; calling it after savefig cannot
# affect the saved image.
plt.tight_layout()
plt.savefig('1.png', dpi=300, bbox_inches='tight')
# End the cell with show(), like the other plotting cells do.
plt.show()

In [None]:
# Author histogram for cluster id 1, saved as 2.png.
cluster_ax = sns.countplot(data=df_1, x="author")
cluster_ax.set_title("Author Distribution of Cluster 2 ", fontsize=16)
cluster_ax.set_xlabel("Author", fontsize=16)
cluster_ax.set_ylabel("Count", fontsize=16)
cluster_ax.figure.savefig('2.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Author histogram for cluster id 2 (not saved to disk).
# Labels and font sizes match the other cluster plots so the figures are comparable.
sns.countplot(x="author", data=df_2)
plt.title("Author Distribution of Cluster 3 ",fontsize=16)
plt.xlabel("Author",fontsize=16)
plt.ylabel("Count",fontsize=16)
plt.show()

In [None]:
# Author histogram for cluster id 3, saved as 4.png.
sns.countplot(x="author", data=df_3)
plt.title("Author Distribution of Cluster 4 ",fontsize=16)
# The x label uses the same font size as the title and y label (it was left at
# the default size, unlike the other saved figures).
plt.xlabel("Author",fontsize=16)
plt.ylabel("Count",fontsize=16)
plt.savefig('4.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Author histogram for cluster id 4, saved as 5.png.
cluster_ax = sns.countplot(data=df_4, x="author")
cluster_ax.set_title("Author Distribution of Cluster 5 ", fontsize=16)
cluster_ax.set_xlabel("Author", fontsize=16)
cluster_ax.set_ylabel("Count", fontsize=16)

cluster_ax.figure.savefig('5.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Author histogram for cluster id 5 (not saved to disk).
# Labels and font sizes match the other cluster plots so the figures are comparable.
sns.countplot(x="author", data=df_5)
plt.title("Author Distribution of Cluster 6 ",fontsize=16)
plt.xlabel("Author",fontsize=16)
plt.ylabel("Count",fontsize=16)
plt.show()

In [None]:
# Author histogram for cluster id 6 (not saved to disk).
# Labels and font sizes match the other cluster plots so the figures are comparable.
sns.countplot(x="author", data=df_6)
plt.title("Author Distribution of Cluster 7 ",fontsize=16)
plt.xlabel("Author",fontsize=16)
plt.ylabel("Count",fontsize=16)
plt.show()

In [None]:
# Author histogram for cluster id 7, saved as 8.png.
cluster_ax = sns.countplot(data=df_7, x="author")
cluster_ax.set_title("Author Distribution of Cluster 8 ", fontsize=16)
cluster_ax.set_xlabel("Author", fontsize=16)
cluster_ax.set_ylabel("Count", fontsize=16)
cluster_ax.figure.savefig('8.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Author histogram for cluster id 8, saved as 9.png.
cluster_ax = sns.countplot(data=df_8, x="author")
cluster_ax.set_title("Author Distribution of Cluster 9", fontsize=16)
cluster_ax.set_xlabel("Author", fontsize=16)
cluster_ax.set_ylabel("Count", fontsize=16)
cluster_ax.figure.savefig('9.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


def train_test(df):
    """Train an MLP author classifier on ``df`` and report held-out accuracy.

    The rows are split 80/20 with a fixed seed. Word counts (unigrams, at
    most 10000 features) and TF-IDF weights are fitted on the training part
    only and then applied to the test part. A classification report for the
    test part is printed.

    Args:
        df: DataFrame with a text column ``'comment'`` and a label column
            ``'author'``.

    Returns:
        Accuracy of the classifier on the 20% test split.
    """
    texts = np.asarray(df['comment'])
    y = np.asarray(df['author'])

    # Split BEFORE fitting any feature extractor: fitting the vocabulary and
    # the IDF weights on all rows would leak test-set information into training.
    texts_train, texts_test, y_train, y_test = train_test_split(texts,
                                                                y,
                                                                test_size=0.2,  # keep 20% for testing
                                                                random_state=2  # fixed seed for a reproducible split
                                                                )

    count_vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=10000)
    X_train = count_vectorizer.fit_transform(texts_train)
    X_test = count_vectorizer.transform(texts_test)
    print("CountVectorizer done")

    # Convert word frequency vectors to tf-idf vectors using TfidfTransformer
    print("Start TfidfTransformer")
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(X_train)
    X_test = tfidf_transformer.transform(X_test)
    print("TfidfTransformer done")

    # NOTE(review): per the scikit-learn docs `learning_rate` is only used
    # with solver='sgd'; it is kept here unchanged but has no effect with 'adam'.
    # random_state fixes weight initialisation so repeated runs are comparable.
    model = MLPClassifier(max_iter=100000, solver='adam', learning_rate='invscaling', hidden_layer_sizes=(172,),
                      alpha=1e-05, activation='logistic', random_state=2)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred,zero_division=0))
    acc = accuracy_score(y_test, y_pred)
    return acc

In [None]:
# The nine per-cluster frames, in cluster-id order (df_0 .. df_8).
df_list =[df_0, df_1,df_2,df_3,df_4,df_5,df_6,df_7,df_8]

In [None]:
# Train and evaluate one classifier per cluster; accuracies are kept in df_list order.
accs = [train_test(cluster_df) for cluster_df in df_list]

In [None]:
# Per-cluster accuracies, in the order of df_list.
print(accs)

In [None]:
# Unweighted mean of the per-cluster accuracies.
# NOTE(review): every cluster counts equally here regardless of how many rows
# it holds - confirm that this is the intended way to summarise them.
print(np.mean(accs))

In [None]:
# Same train/evaluate routine on the full, unclustered frame
# (presumably the baseline for the per-cluster scores above).
acc_all = train_test(data)
print(acc_all)

In [None]:
 # word2vec