In [2]:
import os

DATA_DIR = "InitialTestData"

data = {}
# of the form:
# {"channel1":
#    {
#     "messages": ["message1", "message2"...],
#     "bad_messages": [4, 18...], (indices into "messages")
#     "viewers": 482 (average)
#    }
#  "channel2":
#   {...}
# }


def _parse_chat_log(log_lines, message_offset=0):
    """Parse the lines of one chat log file.

    Only timestamped lines (those starting with "[") are considered. After the
    timestamp is removed, a line is treated as a user message ("<..."), a
    viewer count ("VIEWERS:"), a ban notice ("BAN:") or a deletion notice
    ("DELETED:"); anything else is ignored.

    Parameters:
        log_lines: iterable of raw log lines (an open text file works).
        message_offset: number of messages already stored for the channel.
            It is added to every bad-message index so the indices refer to
            the channel-wide message list rather than to this file alone.

    Returns:
        (messages, bad_indices, viewer_counts) where `messages` are the user
        messages with the "<name> " prefix removed (the trailing newline of
        the log line is kept), `bad_indices` are channel-wide indices of
        messages whose author was banned or that were deleted, and
        `viewer_counts` are the integer viewer counts found in the file.
    """
    user_lines = []
    bad_indices = []
    viewer_counts = []

    for line in log_lines:
        # only care about timestamped lines. Others are overhead data that we don't mind.
        if not line.startswith("["):
            continue
        # get rid of the timestamp, we only want the message itself.
        line = line[line.find("] ") + 2:]

        if line.startswith("<"):
            # standard message sent by a user.
            user_lines.append(line)
        elif line.startswith("VIEWERS:"):
            # "\xa0" (non-breaking space) is removed so int() can parse the count.
            viewer_counts.append(int(line[9:].replace("\xa0", "")))
        elif line.startswith("BAN:"):
            # Whitespace split so that a ban line with nothing after the
            # user name does not leave a trailing newline glued to the name.
            fields = line[5:].split()
            if not fields:
                continue
            pattern = fields[0] + ">"
            # find most recent message sent by banned user and mark as bad message.
            for i in range(len(user_lines) - 1, -1, -1):
                if pattern in user_lines[i]:
                    bad_indices.append(message_offset + i)
                    break
        elif line.startswith("DELETED:"):
            # NOTE(review): the [:-1] slice drops only the final character of
            # the line; if log lines end with ")\n" the closing parenthesis
            # stays in the pattern - confirm against a real log line.
            pattern = line[9:].split(" (")[0] + "> " + line[line.find(" (") + 2:-1]
            # find the deleted message and mark it as bad message.
            for i in range(len(user_lines) - 1, -1, -1):
                if pattern in user_lines[i]:
                    # Same channel-wide offset as the BAN branch.
                    bad_indices.append(message_offset + i)
                    break

    # remove names from messages.
    # TODO: anonymize @-mentions (e.g. replace "@name" with "@user"). An earlier
    # attempt had no effect because str.replace returns a new string and its
    # result was discarded.
    messages = [text[text.find(">") + 2:] for text in user_lines]
    return messages, bad_indices, viewer_counts


# look through every data file (sorted so message order is reproducible;
# os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(DATA_DIR)):
    path = os.path.join(DATA_DIR, filename)
    # Skip directories and files whose name has no "#": the channel name is
    # taken from the text between "#" and the first following ".".
    if not os.path.isfile(path) or "#" not in filename:
        continue
    # get channel name
    channel = filename.split("#")[1].split(".")[0]
    entry = data.setdefault(channel, {"viewers": [], "messages": [], "bad_messages": []})
    with open(path) as file:
        messages, bad_indices, viewer_counts = _parse_chat_log(file, len(entry["messages"]))
    entry["messages"].extend(messages)
    entry["bad_messages"].extend(bad_indices)
    # for now keep every viewer count; they are averaged below.
    entry["viewers"].extend(viewer_counts)

# average viewer counts by channel.
for channel in data:
    counts = data[channel]["viewers"]
    # A channel without any VIEWERS line gets 0 instead of a ZeroDivisionError.
    data[channel]["viewers"] = int(sum(counts) / len(counts)) if counts else 0




In [15]:
# One summary line per channel: average viewers, message count, flagged count.
for name, stats in data.items():
    message_count = len(stats["messages"])
    bad_count = len(stats["bad_messages"])
    print("channel:", name,
          "average viewers:", stats["viewers"],
          "total messages:", message_count,
          "bad messages:", bad_count)


channel: baalorlord average viewers: 1361 total messages: 5173 bad messages: 0
channel: cirno_tv average viewers: 503 total messages: 8819 bad messages: 0
channel: aicandii average viewers: 720 total messages: 10331 bad messages: 2
channel: otzdarva average viewers: 6723 total messages: 11503 bad messages: 10
channel: amouranth average viewers: 8123 total messages: 29611 bad messages: 790
channel: fextralife average viewers: 13202 total messages: 10548 bad messages: 8
channel: xqc average viewers: 51958 total messages: 364611 bad messages: 3818
channel: cohhcarnage average viewers: 12895 total messages: 54824 bad messages: 45


In [16]:
# A channel is discarded when its average viewer count is 10000 or more,
# or when it has no flagged (bad) messages at all.
discarded_channels = [
    name
    for name, stats in data.items()
    if stats["viewers"] >= 10000 or not stats["bad_messages"]
]
        

In [17]:
import pandas as pd
# Flatten the per-channel data into one [status, message, channel] row per
# message, skipping every channel listed in discarded_channels.
formatted_data = []
for channel, channel_data in data.items():
    if channel in discarded_channels:
        continue
    # Membership in a set labels every flagged index. The previous cursor-based
    # lookup never advanced, so it could match at most the first entry of
    # "bad_messages"; a set also copes with unsorted or repeated indices.
    bad_indices = set(channel_data["bad_messages"])
    for index, message in enumerate(channel_data["messages"]):
        status = "bad" if index in bad_indices else "good"
        formatted_data.append([status, message, channel])
df = pd.DataFrame(formatted_data, columns=["status", "message", "channel"])

In [18]:
# Preview the first five labelled rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,status,message,channel
0,good,!slots 10000\n,aicandii
1,good,@feiashtear you got OpieOP | 4Head | Kappa and...,aicandii
2,good,!gamble 20000\n,aicandii
3,good,feiashtear lost 20000 pp in roulette and now ...,aicandii
4,good,!gamble 30000\n,aicandii
