In [2]:
import os

# TODO: add a field for trusted users. A user is trusted if they're a moderator or a long-time sub.
# TODO: Discard bot messages. Consider also discarding mod messages since they're often the only ones allowed to send links.

data = {} 
# of the form:
# {"channel1":
#    {
#     "messages": ["messsage1", "message2"...],
#     "bad_messages": [4, 18...], (indices of messages)
#     "viewers": 482 (average)
#    }
#  "channel2":
#   {...}
# }


# look through every data file
for filename in os.listdir("InitialTestData"):
    if os.path.isfile("InitialTestData/" + filename):
        # get channel name
        channel = filename.split("#")[1].split(".")[0]
        if not channel in data:
            data[channel] = {"viewers": [], "messages": [], "bad_messages": []}
        with open("InitialTestData/" + filename) as file:
            lines = []
            for line in file.readlines():
                # only care about timestamped lines. Others are overhead data that we don't mind.
                if line[0] == "[":
                    # get rid of the timestamp, we only want the message itself.
                    line = line[line.find("] ")+2:]
                    # check for standard message sent by a user.
                    if line[0] == "<":
                        lines.append(line)
                    # check for overhead message stating viewer count.
                    elif line[0:8] == "VIEWERS:":
                        # for now add every viewer count to a list for averaging later on.
                        data[channel]["viewers"].append(int(line[9:].replace("\xa0", "")))
                    # check for overhead message stating a user was banned
                    elif line[0:4] == "BAN:":
                        # find most recent message sent by banned user and mark as bad message.
                        for i, msg in reversed(list(enumerate(lines))):
                            if line[5:].split(" ")[0] + ">" in msg:
                                data[channel]["bad_messages"].append(len(data[channel]["messages"]) + i)
                                break
                    # check for overhead message stating a message was deleted.
                    elif line[0:8] == "DELETED:":
                        # find the deleted message and mark it as bad message.
                        for i, msg in reversed(list(enumerate(lines))):
                            if line[9:].split(" (")[0] + "> " + line[line.find(" (")+2:-1] in msg:
                                data[channel]["bad_messages"].append(i)
                                break
            # sort bad message indices.
            data[channel]["bad_messages"] = sorted(set(data[channel]["bad_messages"]))
            # remove names from messages and add to data.
            for temp in lines:
                temp = temp[temp.find(">")+2:]
                # non-functioning attempt at anonomizing @-mentions.
                #index = temp.find("@")
                #while index != -1:
                #    temp.replace(temp[index:temp.find(" ", index)], "@user")
                #    index = temp.find("@", index+1)
                data[channel]["messages"].append(temp)
# average viewer counts by channel.
for channel in data.keys():
    avg_viewers = int(sum(data[channel]["viewers"]) / len(data[channel]["viewers"]))
    data[channel]["viewers"] = avg_viewers




In [3]:
for channel in data.keys():
    print("channel:", channel,
          "average viewers:", data[channel]["viewers"],
          "total messages:", len(data[channel]["messages"]),
          "bad messages:", len(data[channel]["bad_messages"]))


channel: baalorlord average viewers: 1361 total messages: 5173 bad messages: 0
channel: cirno_tv average viewers: 503 total messages: 8819 bad messages: 0
channel: aicandii average viewers: 720 total messages: 10331 bad messages: 2
channel: otzdarva average viewers: 6723 total messages: 11503 bad messages: 10
channel: amouranth average viewers: 8123 total messages: 29611 bad messages: 749
channel: fextralife average viewers: 13202 total messages: 10548 bad messages: 7
channel: xqc average viewers: 51958 total messages: 364611 bad messages: 3682
channel: cohhcarnage average viewers: 12895 total messages: 54824 bad messages: 31


In [4]:
discarded_channels = []
for channel in data.keys():
    if data[channel]["viewers"] >= 10000 or len(data[channel]["bad_messages"]) == 0:
        discarded_channels.append(channel)
        

In [5]:
import pandas as pd
formatted_data = []
for channel in data.keys():
    if channel in discarded_channels:
        continue
    next_bad = 0
    for index in range(len(data[channel]["messages"])):
        row = []
        if next_bad < len(data[channel]["bad_messages"]) and data[channel]["bad_messages"][next_bad] == index:
            print(
                channel,
                data[channel]["messages"][index],
                data[channel]["bad_messages"][next_bad],
                len(data[channel]["messages"])
            )
            row = ["bad", data[channel]["messages"][index], channel]
            next_bad += 1
            if next_bad < len(data[channel]["bad_messages"]):
                print("next:", data[channel]["bad_messages"][next_bad])
        else:
            row = ["good", data[channel]["messages"][index], channel]
        formatted_data.append(row)
df = pd.DataFrame(formatted_data, columns=["status", "message", "channel"])

aicandii igaDuck igaJuice 6.pm sleeping
 163 10331
next: 4848
aicandii https://www.youtube.com/channel/UClLfMpJycPsyR2xFlpWew7A
 4848 10331
otzdarva it's because bhvr are incompetent jackasses that couldn't design a game if the lives of thier loved ones depended on it
 2428 11503
next: 2983
otzdarva can u ready up? i gotta go to work soon
 2983 11503
next: 3513
otzdarva hey otz your eyes remind me of the stars in the sky so beautiful and bright can we kiss now
 3513 11503
next: 3554
otzdarva fucking camper that killer
 3554 11503
next: 3919
otzdarva @REALiNSaNgAMingGODPrODIgY no, we like killing, farming, fishing abd mining. adventure is also good too. but dating/rpgs is very girly. just from my experience and what my friends play.
 3919 11503
next: 4021
otzdarva @macadoww average stupid chatter :)
 4021 11503
next: 5226
otzdarva @SweetieOpaline im an extremely self-confident dominant guy, i enjoy to see the opposite in others.
 5226 11503
next: 9973
otzdarva @Otzdarva otz u should go 

In [6]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)

In [7]:
train.head()

Unnamed: 0,status,message,channel
39931,good,pepeJAMMER takeaway going cold\n,amouranth
31603,good,sorry to say sorry. XD\n,amouranth
27241,good,let's go Pog\n,amouranth
44304,good,Shake shake shake shake shake shake your bottom\n,amouranth
49078,good,maxyyNotes andyNerd jenp3Notes shadow685Lurk ...,amouranth


In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('bayes', MultinomialNB())])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

In [9]:
from sklearn.metrics import classification_report
good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 10288, 'bad': 1}
              precision    recall  f1-score   support

         bad       0.00      0.00      0.00       161
        good       0.98      1.00      0.99     10128

    accuracy                           0.98     10289
   macro avg       0.49      0.50      0.50     10289
weighted avg       0.97      0.98      0.98     10289



In [9]:
# Saker att testa: Support Vector Machines, XGBoost, K Nearest Neighbors, XLNet

import numpy as np
# Undersampling the training data
temp = train.groupby(['status']).size()
print(temp["bad"])

tempest = pd.DataFrame(train)
res = []
res.append(tempest[tempest.status == "bad"])
indices = tempest[tempest.status == "good"].index
random_indices = np.random.choice(indices, temp["bad"], replace=False)
res.append(tempest.loc[random_indices])
undersampled_train = pd.concat(res)
print(undersampled_train.groupby(['status']).size())
undersampled_train.head()


# Naive-Bayes with undersampled training data.

pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('bayes', MultinomialNB())])
pipe.fit(undersampled_train['message'], undersampled_train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

In [10]:
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([('count_vectorizer', CountVectorizer()), 
                 ('tfidf', TfidfTransformer()),
                 ('bayes', MultinomialNB())])
grid = GridSearchCV(pipe, {
    'count_vectorizer__binary':(True, False),
    'count_vectorizer__ngram_range':((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),
    'bayes__alpha': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    'bayes__fit_prior': (True, False)
})
grid.fit(train['message'], train['status'])
predict = grid.predict(test['message'])

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

Best parameter (CV score=0.986):
{'bayes__alpha': 0.1, 'bayes__fit_prior': True, 'count_vectorizer__binary': True, 'count_vectorizer__ngram_range': (1, 1)}


In [11]:
# Naive Bayes with full data and selected parameters

pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=grid.best_params_["count_vectorizer__binary"],
        ngram_range=grid.best_params_["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=grid.best_params_["bayes__alpha"],
        fit_prior=grid.best_params_["bayes__fit_prior"]
    ))
])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 10244, 'bad': 45}
              precision    recall  f1-score   support

         bad       0.67      0.18      0.28       171
        good       0.99      1.00      0.99     10118

    accuracy                           0.98     10289
   macro avg       0.83      0.59      0.64     10289
weighted avg       0.98      0.98      0.98     10289



# Naive Bayes with undersampled data and selected parameters


pipe = Pipeline([('count_vectorizer', CountVectorizer(binary=True, ngram_range=(1, 1))), ('tfidf', TfidfTransformer()),('bayes', MultinomialNB(alpha=0.4, fit_prior=False))])
pipe.fit(undersampled_train['message'], undersampled_train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

In [10]:
from sklearn.svm import SVC

pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('svc', SVC())])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 10267, 'bad': 22}
              precision    recall  f1-score   support

         bad       0.73      0.10      0.17       161
        good       0.99      1.00      0.99     10128

    accuracy                           0.99     10289
   macro avg       0.86      0.55      0.58     10289
weighted avg       0.98      0.99      0.98     10289



pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('svc', SVC())])
pipe.fit(undersampled_train['message'], undersampled_train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

In [None]:
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([('count_vectorizer', CountVectorizer()), 
                 ('tfidf', TfidfTransformer()),
                 ('svc', SVC())])
#grid = GridSearchCV(pipe, {
#    'count_vectorizer__binary':(True, False),
#    'count_vectorizer__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),
#    'svc__C': (0.1, 0.5, 1.0, 5.0, 10.0),
#    'svc__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
#    'svc__degree': (1, 2, 3, 4, 5, 10),
#    'svc__gamma': ("auto", "scale"),
#    'svc__shrinking': (True, False),
#    'svc__probability': (True, False),
#    'svc__tol': (0.001, 0.005, 0.01, 0.05, 0.1),
#    'svc__class_weight': (None, "balanced")})

grid = GridSearchCV(pipe, {
    'count_vectorizer__binary':[True],
    'count_vectorizer__ngram_range': [(1, 1)],
    'svc__C': (0.5, 1.0, 1.5),
    #'svc__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
    'svc__degree': (2, 3, 4),
    #'svc__gamma': ("auto", "scale"),
    #'svc__shrinking': (True, False),
    #'svc__probability': (True, False),
    #'svc__class_weight': (None, "balanced"),
    'svc__tol': (0.009, 0.01, 0.011)
})
grid.fit(train['message'], train['status'])
predict = grid.predict(test['message'])

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

In [None]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=grid.best_params_["count_vectorizer__binary"],
        ngram_range=grid.best_params_["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC(
        C=grid.best_params_["svc__C"],
        kernel=grid.best_params_["svc__kernel"],
        degree=grid.best_params_["svc__degree"],
        gamma=grid.best_params_["svc__gamma"],
        shrinking=grid.best_params_["svc__shrinking"],
        probability=grid.best_params_["svc__probability"],
        tol=grid.best_params_["svc__tol"],
        class_weight=grid.best_params_["svc__class_weight"]
    ))
])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))