In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import gc

In [2]:
# TODO: add a field for trusted users. A user is trusted if they're a moderator or a long-time sub.
# TODO: Discard bot messages. Consider also discarding mod messages since they're often the only ones allowed to send links.
train = 0
test = 0
data = {} 
# of the form:
# {"channel1":
#    {
#     "messages": ["messsage1", "message2"...],
#     "bad_messages": [4, 18...], (indices of messages)
#     "viewers": 482 (average)
#    }
#  "channel2":
#   {...}
# }


# look through every data file
for filename in os.listdir("FullData"):
    if os.path.isfile("FullData/" + filename):
        # get channel name
        channel = filename.split("#")[1].split(".")[0]
        if not channel in data:
            data[channel] = {"viewers": [], "messages": [], "bad_messages": []}
        with open("FullData/" + filename, encoding='utf-8') as file:
            lines = []
            for line in file.readlines():
                # only care about timestamped lines. Others are overhead data that we don't mind.
                if line[0] == "[":
                    # get rid of the timestamp, we only want the message itself.
                    line = line[line.find("] ")+2:]
                    # check for standard message sent by a user.
                    if line[0] == "<":
                        lines.append(line)
                    # check for overhead message stating viewer count.
                    elif line[0:8] == "VIEWERS:":
                        # for now add every viewer count to a list for averaging later on.
                        data[channel]["viewers"].append(int(line[9:].replace("\xa0", "")))
                    # check for overhead message stating a user was banned
                    elif line[0:4] == "BAN:":
                        # find most recent message sent by banned user and mark as bad message.
                        for i, msg in reversed(list(enumerate(lines))):
                            if line[5:].split(" ")[0] + ">" in msg:
                                data[channel]["bad_messages"].append(len(data[channel]["messages"]) + i)
                                break
                    # check for overhead message stating a message was deleted.
                    elif line[0:8] == "DELETED:":
                        # find the deleted message and mark it as bad message.
                        for i, msg in reversed(list(enumerate(lines))):
                            if line[9:].split(" (")[0] + "> " + line[line.find(" (")+2:-1] in msg:
                                data[channel]["bad_messages"].append(i)
                                break
            # sort bad message indices.
            data[channel]["bad_messages"] = sorted(set(data[channel]["bad_messages"]))
            # remove names from messages and add to data.
            for temp in lines:
                temp = temp[temp.find(">")+2:]
                # non-functioning attempt at anonomizing @-mentions.
                #index = temp.find("@")
                #while index != -1:
                #    temp.replace(temp[index:temp.find(" ", index)], "@user")
                #    index = temp.find("@", index+1)
                data[channel]["messages"].append(temp)
# average viewer counts by channel and remove channels without viewer data.
removals = []
for channel in data.keys():
    if len(data[channel]["viewers"]) == 0:
        removals.append(channel)
    else:
        avg_viewers = int(sum(data[channel]["viewers"]) / len(data[channel]["viewers"]))
        data[channel]["viewers"] = avg_viewers
for channel in removals:
    data.pop(channel)
discarded_channels = []
for channel in data.keys():
    if data[channel]["viewers"] >= 10000 or len(data[channel]["bad_messages"]) == 0:
        discarded_channels.append(channel)

formatted_data = []
for channel in data.keys():
    if channel in discarded_channels:
        continue
    next_bad = 0
    for index in range(len(data[channel]["messages"])):
        row = []
        if next_bad < len(data[channel]["bad_messages"]) and data[channel]["bad_messages"][next_bad] == index:
            row = ["bad", data[channel]["messages"][index], channel]
            next_bad += 1
        else:
            row = ["good", data[channel]["messages"][index], channel]
        formatted_data.append(row)
df = pd.DataFrame(formatted_data, columns=["status", "message", "channel"])
# Undersampling the training data
temp = df.groupby(['status']).size()

res = []
res.append(df[df.status == "bad"])
indices = df[df.status == "good"].index
random_indices = np.random.choice(indices, temp["bad"]*9, replace=False)
res.append(df.loc[random_indices])
undersampled_data = pd.concat(res)
train, test = train_test_split(undersampled_data, test_size=0.2)

In [6]:
del(df)
del(data)
del(lines)
del(formatted_data)
del(temp)
del(res)
del(indices)
gc.collect()

0

In [7]:
train.head()

Unnamed: 0,status,message,channel
21159337,good,do it anyway\n,elajjaz
19300702,good,Happy near year guys\n,dannyaarons
24211179,good,nerf hunters? why ?\n,gingitv
21433948,good,BBoomer\n,elajjaz
11850494,good,LUL\n,39daph


from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('bayes', MultinomialNB())])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

from sklearn.metrics import classification_report
good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

In [9]:
# Saker att testa: Support Vector Machines, XGBoost, K Nearest Neighbors, XLNet

# Naive-Bayes with undersampled training data.

pipe = Pipeline([('count_vectorizer', CountVectorizer(binary=True)), ('tfidf', TfidfTransformer()),('bayes', MultinomialNB())])
pipe.fit(undersampled_train['message'], undersampled_train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

from sklearn.model_selection import GridSearchCV
pipe = Pipeline([('count_vectorizer', CountVectorizer()), 
                 ('tfidf', TfidfTransformer()),
                 ('bayes', MultinomialNB())])
grid = GridSearchCV(pipe, {
    'count_vectorizer__binary':(True, False),
    'count_vectorizer__ngram_range':((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),
    'bayes__alpha': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    'bayes__fit_prior': (True, False)
})
grid.fit(train['message'], train['status'])
predict = grid.predict(test['message'])

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

In [8]:
# Naive Bayes with full data and selected parameters


pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=True
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        #alpha=grid.best_params_["bayes__alpha"],
        #fit_prior=grid.best_params_["bayes__fit_prior"]
    ))
])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 103199, 'bad': 2229}
              precision    recall  f1-score   support

         bad       0.94      0.20      0.33     10347
        good       0.92      1.00      0.96     95081

    accuracy                           0.92    105428
   macro avg       0.93      0.60      0.65    105428
weighted avg       0.92      0.92      0.90    105428



# Naive Bayes with undersampled data and selected parameters


pipe = Pipeline([('count_vectorizer', CountVectorizer(binary=True, ngram_range=(1, 1))), ('tfidf', TfidfTransformer()),('bayes', MultinomialNB(alpha=0.4, fit_prior=False))])
pipe.fit(undersampled_train['message'], undersampled_train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

In [9]:
pipe = Pipeline([('count_vectorizer', CountVectorizer(binary=True)), ('tfidf', TfidfTransformer()), ('scaler', StandardScaler(with_mean=False)), ('svc', LinearSVC())])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))



{'good': 95677, 'bad': 9751}
              precision    recall  f1-score   support

         bad       0.39      0.37      0.38     10347
        good       0.93      0.94      0.93     95081

    accuracy                           0.88    105428
   macro avg       0.66      0.65      0.66    105428
weighted avg       0.88      0.88      0.88    105428



In [10]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=True)),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 50000, dual = False))
])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 99829, 'bad': 5599}
              precision    recall  f1-score   support

         bad       0.62      0.34      0.44     10347
        good       0.93      0.98      0.95     95081

    accuracy                           0.91    105428
   macro avg       0.78      0.66      0.70    105428
weighted avg       0.90      0.91      0.90    105428



In [11]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=True)),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 50000, dual = False, class_weight='balanced'))
])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

{'good': 78929, 'bad': 26499}
              precision    recall  f1-score   support

         bad       0.26      0.68      0.38     10347
        good       0.96      0.80      0.87     95081

    accuracy                           0.78    105428
   macro avg       0.61      0.74      0.62    105428
weighted avg       0.89      0.78      0.82    105428



pipe = Pipeline([('count_vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()),('svc', SVC())])
pipe.fit(undersampled_train['message'], undersampled_train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))

In [29]:

pipe = Pipeline([('count_vectorizer', CountVectorizer(binary=True)), 
                 ('tfidf', TfidfTransformer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('svc', LinearSVC(dual = False))])
#grid = GridSearchCV(pipe, {
#    'count_vectorizer__binary':(True, False),
#    'count_vectorizer__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),
#    'svc__C': (0.1, 0.5, 1.0, 5.0, 10.0),
#    'svc__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
#    'svc__degree': (1, 2, 3, 4, 5, 10),
#    'svc__gamma': ("auto", "scale"),
#    'svc__shrinking': (True, False),
#    'svc__probability': (True, False),
#    'svc__tol': (0.001, 0.005, 0.01, 0.05, 0.1),
#    'svc__class_weight': (None, "balanced")})

grid = GridSearchCV(pipe, {
    'svc__C': (0.8, 0.9, 1.0, 1.1),
    #'svc__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
    #'svc__degree': (2, 3, 4),
    #'svc__gamma': ("auto", "scale"),
    #'svc__shrinking': (True, False),
    #'svc__probability': (True, False),
    'svc__class_weight': (None, "balanced"),
    #'svc__tol': (0.009, 0.01, 0.011)
})
grid.fit(train['message'], train['status'])
predict = grid.predict(test['message'])

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)



Best parameter (CV score=0.973):
{'svc__C': 0.8, 'svc__class_weight': None}




In [31]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=True
    )),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(
        C=grid.best_params_["svc__C"],
        #kernel=grid.best_params_["svc__kernel"],
        #degree=grid.best_params_["svc__degree"],
        #gamma=grid.best_params_["svc__gamma"],
        #shrinking=grid.best_params_["svc__shrinking"],
        #probability=grid.best_params_["svc__probability"],
        class_weight=grid.best_params_["svc__class_weight"],
        #tol=grid.best_params_["svc__tol"],
        dual=False,
        max_iter=10000
    ))
])
pipe.fit(train['message'], train['status'])

predict = pipe.predict(test['message'])

good_bad_count = {"good": 0, "bad": 0}
for guess in predict:
        good_bad_count[guess] += 1
print(good_bad_count)
print(classification_report(test['status'], predict))



{'good': 10090, 'bad': 199}
              precision    recall  f1-score   support

         bad       0.18      0.23      0.20       149
        good       0.99      0.98      0.99     10140

    accuracy                           0.97     10289
   macro avg       0.58      0.61      0.59     10289
weighted avg       0.98      0.97      0.97     10289



NameError: name 'Pipeline' is not defined