In [6]:
import os

# TODO: add a field for trusted users. A user is trusted if they're a moderator or a long-time sub.
# TODO: Discard bot messages. Consider also discarding mod messages since they're often the only ones allowed to send links.

data = {} 
# of the form:
# {"channel1":
#    {
#     "messages": ["messsage1", "message2"...],
#     "bad_messages": [4, 18...], (indices of messages)
#     "viewers": 482 (average)
#    }
#  "channel2":
#   {...}
# }


# look through every data file
for filename in os.listdir("InitialTestData"):
    if os.path.isfile("InitialTestData/" + filename):
        # get channel name
        channel = filename.split("#")[1].split(".")[0]
        if not channel in data:
            data[channel] = {"viewers": [], "messages": [], "bad_messages": []}
        with open("InitialTestData/" + filename, encoding='utf-8') as file:
            lines = []
            for line in file.readlines():
                # only care about timestamped lines. Others are overhead data that we don't mind.
                if line[0] == "[":
                    # get rid of the timestamp, we only want the message itself.
                    line = line[line.find("] ")+2:]
                    # check for standard message sent by a user.
                    if line[0] == "<":
                        lines.append(line)
                    # check for overhead message stating viewer count.
                    elif line[0:8] == "VIEWERS:":
                        # for now add every viewer count to a list for averaging later on.
                        data[channel]["viewers"].append(int(line[9:].replace("\xa0", "")))
                    # check for overhead message stating a user was banned
                    elif line[0:4] == "BAN:":
                        # find most recent message sent by banned user and mark as bad message.
                        for i, msg in reversed(list(enumerate(lines))):
                            if line[5:].split(" ")[0] + ">" in msg:
                                data[channel]["bad_messages"].append(len(data[channel]["messages"]) + i)
                                break
                    # check for overhead message stating a message was deleted.
                    elif line[0:8] == "DELETED:":
                        # find the deleted message and mark it as bad message.
                        for i, msg in reversed(list(enumerate(lines))):
                            if line[9:].split(" (")[0] + "> " + line[line.find(" (")+2:-1] in msg:
                                data[channel]["bad_messages"].append(i)
                                break
            # sort bad message indices.
            data[channel]["bad_messages"] = sorted(set(data[channel]["bad_messages"]))
            # remove names from messages and add to data.
            for temp in lines:
                temp = temp[temp.find(">")+2:]
                # non-functioning attempt at anonomizing @-mentions.
                #index = temp.find("@")
                #while index != -1:
                #    temp.replace(temp[index:temp.find(" ", index)], "@user")
                #    index = temp.find("@", index+1)
                data[channel]["messages"].append(temp)
# average viewer counts by channel.
for channel in data.keys():
    avg_viewers = int(sum(data[channel]["viewers"]) / len(data[channel]["viewers"]))
    data[channel]["viewers"] = avg_viewers




In [7]:
for channel in data.keys():
    print("channel:", channel,
          "average viewers:", data[channel]["viewers"],
          "total messages:", len(data[channel]["messages"]),
          "bad messages:", len(data[channel]["bad_messages"]))


channel: aicandii average viewers: 720 total messages: 10331 bad messages: 2
channel: amouranth average viewers: 8123 total messages: 29611 bad messages: 749
channel: baalorlord average viewers: 1361 total messages: 5173 bad messages: 0
channel: cirno_tv average viewers: 503 total messages: 8819 bad messages: 0
channel: cohhcarnage average viewers: 12895 total messages: 54824 bad messages: 31
channel: fextralife average viewers: 13202 total messages: 10548 bad messages: 7
channel: otzdarva average viewers: 6723 total messages: 11503 bad messages: 10
channel: xqc average viewers: 51958 total messages: 364611 bad messages: 3682


In [8]:
discarded_channels = []
for channel in data.keys():
    if data[channel]["viewers"] >= 10000 or len(data[channel]["bad_messages"]) == 0:
        discarded_channels.append(channel)
        

In [9]:
import pandas as pd
formatted_data = []
for channel in data.keys():
    if channel in discarded_channels:
        continue
    next_bad = 0
    for index in range(len(data[channel]["messages"])):
        row = []
        if next_bad < len(data[channel]["bad_messages"]) and data[channel]["bad_messages"][next_bad] == index:
            print(
                channel,
                data[channel]["messages"][index],
                data[channel]["bad_messages"][next_bad],
                len(data[channel]["messages"])
            )
            row = ["bad", data[channel]["messages"][index], channel]
            next_bad += 1
            if next_bad < len(data[channel]["bad_messages"]):
                print("next:", data[channel]["bad_messages"][next_bad])
        else:
            row = ["good", data[channel]["messages"][index], channel]
        formatted_data.append(row)
df = pd.DataFrame(formatted_data, columns=["status", "message", "channel"])

aicandii igaDuck igaJuice 6.pm sleeping
 4298 10331
next: 8983
aicandii https://www.youtube.com/channel/UClLfMpJycPsyR2xFlpWew7A
 8983 10331
amouranth прувет
 65 29611
next: 71
amouranth можно подрочитиь
 71 29611
next: 76
amouranth Всем похуй на тебя, чучело
 76 29611
next: 132
amouranth folow me sometimes i strem cod need more folowers
 132 29611
next: 253
amouranth BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER BLACK LIVE MATTER
 253 29611
next: 305
amouranth ***
 305 29611
next: 344
amouranth Mluvím česky haha mám právo mluvit svým jazykem!
 344 29611
next: 416
amouran

In [10]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)

In [11]:
train.head()

Unnamed: 0,status,message,channel
47887,good,eruption is fine COPIUM\n,otzdarva
34821,good,birthday pirate peepoClap\n,amouranth
4888,good,A vibe feel\n,aicandii
8323,good,aicGUN\n,aicandii
29947,good,HER SOCIAL MEDIA LINKS! 🍑 💦 https://downbad.co...,amouranth


In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('bayes', MultinomialNB())])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

In [13]:
from sklearn.metrics import classification_report
good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 10289, 'bad': 0}
              precision    recall  f1-score   support

         bad       0.00      0.00      0.00       158
        good       0.98      1.00      0.99     10131

    accuracy                           0.98     10289
   macro avg       0.49      0.50      0.50     10289
weighted avg       0.97      0.98      0.98     10289



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Saker att testa: Support Vector Machines, XGBoost, K Nearest Neighbors, XLNet

import numpy as np
# Undersampling the training data
temp = train.groupby(['status']).size()
print(temp["bad"])

tempest = pd.DataFrame(train)
res = []
res.append(tempest[tempest.status == "bad"])
indices = tempest[tempest.status == "good"].index
random_indices = np.random.choice(indices, temp["bad"], replace=False)
res.append(tempest.loc[random_indices])
undersampled_train = pd.concat(res)
print(undersampled_train.groupby(['status']).size())
undersampled_train.head()


# Naive-Bayes with undersampled training data.

pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('bayes', MultinomialNB())])
pipe.fit(undersampled_train['message'], undersampled_train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

In [14]:
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([('count_vectorizer', CountVectorizer()), 
                 ('tfidf', TfidfTransformer()),
                 ('bayes', MultinomialNB())])
grid = GridSearchCV(pipe, {
    'count_vectorizer__binary':(True, False),
    'count_vectorizer__ngram_range':((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),
    'bayes__alpha': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    'bayes__fit_prior': (True, False)
})
grid.fit(train['message'], train['status'])
predict = grid.predict(test['message'])

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

Best parameter (CV score=0.986):
{'bayes__alpha': 0.1, 'bayes__fit_prior': True, 'count_vectorizer__binary': True, 'count_vectorizer__ngram_range': (1, 1)}


In [15]:
# Naive Bayes with full data and selected parameters

pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=grid.best_params_["count_vectorizer__binary"],
        ngram_range=grid.best_params_["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=grid.best_params_["bayes__alpha"],
        fit_prior=grid.best_params_["bayes__fit_prior"]
    ))
])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 10252, 'bad': 37}
              precision    recall  f1-score   support

         bad       0.65      0.15      0.25       158
        good       0.99      1.00      0.99     10131

    accuracy                           0.99     10289
   macro avg       0.82      0.58      0.62     10289
weighted avg       0.98      0.99      0.98     10289



# Naive Bayes with undersampled data and selected parameters


pipe = Pipeline([('count_vectorizer', CountVectorizer(binary=True, ngram_range=(1, 1))), ('tfidf', TfidfTransformer()),('bayes', MultinomialNB(alpha=0.4, fit_prior=False))])
pipe.fit(undersampled_train['message'], undersampled_train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

In [16]:
from sklearn.svm import SVC

pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('svc', SVC())])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 10277, 'bad': 12}
              precision    recall  f1-score   support

         bad       0.92      0.07      0.13       158
        good       0.99      1.00      0.99     10131

    accuracy                           0.99     10289
   macro avg       0.95      0.53      0.56     10289
weighted avg       0.98      0.99      0.98     10289



pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('svc', SVC())])
pipe.fit(undersampled_train['message'], undersampled_train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

In [17]:
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([('count_vectorizer', CountVectorizer()), 
                 ('tfidf', TfidfTransformer()),
                 ('svc', SVC())])
#grid = GridSearchCV(pipe, {
#    'count_vectorizer__binary':(True, False),
#    'count_vectorizer__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),
#    'svc__C': (0.1, 0.5, 1.0, 5.0, 10.0),
#    'svc__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
#    'svc__degree': (1, 2, 3, 4, 5, 10),
#    'svc__gamma': ("auto", "scale"),
#    'svc__shrinking': (True, False),
#    'svc__probability': (True, False),
#    'svc__tol': (0.001, 0.005, 0.01, 0.05, 0.1),
#    'svc__class_weight': (None, "balanced")})

grid = GridSearchCV(pipe, {
    'count_vectorizer__binary':[True],
    'count_vectorizer__ngram_range': [(1, 1)],
    'svc__C': (0.5, 1.0, 1.5),
    #'svc__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
    'svc__degree': (2, 3, 4),
    #'svc__gamma': ("auto", "scale"),
    #'svc__shrinking': (True, False),
    #'svc__probability': (True, False),
    #'svc__class_weight': (None, "balanced"),
    'svc__tol': (0.009, 0.01, 0.011)
})
grid.fit(train['message'], train['status'])
predict = grid.predict(test['message'])

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

Best parameter (CV score=0.986):
{'count_vectorizer__binary': True, 'count_vectorizer__ngram_range': (1, 1), 'svc__C': 1.5, 'svc__degree': 2, 'svc__tol': 0.011}


In [19]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=grid.best_params_["count_vectorizer__binary"],
        ngram_range=grid.best_params_["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC(
        C=grid.best_params_["svc__C"],
        #kernel=grid.best_params_["svc__kernel"],
        degree=grid.best_params_["svc__degree"],
        #gamma=grid.best_params_["svc__gamma"],
        #shrinking=grid.best_params_["svc__shrinking"],
        #probability=grid.best_params_["svc__probability"],
        #class_weight=grid.best_params_["svc__class_weight"],
        tol=grid.best_params_["svc__tol"]
    ))
])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 10271, 'bad': 18}
              precision    recall  f1-score   support

         bad       0.89      0.10      0.18       158
        good       0.99      1.00      0.99     10131

    accuracy                           0.99     10289
   macro avg       0.94      0.55      0.59     10289
weighted avg       0.98      0.99      0.98     10289



NameError: name 'Pipeline' is not defined