In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [2]:
channeltags = {
    "rdulive": {"Vtuber": False, "Male": True},
    "beardageddon": {"Vtuber": False, "Male": True},
    "giantwaffle": {"Vtuber": False, "Male": True},
    "aicandii": {"Vtuber": True, "Male": False},
    "clauvio": {"Vtuber": True, "Male": False},
    "enviosity": {"Vtuber": False, "Male": True},
    "amnotasadist": {"Vtuber": True, "Male": True},
    "bateson87": {"Vtuber": False, "Male": True},
    "itsjstn": {"Vtuber": False, "Male": True},
    "cirno_tv": {"Vtuber": True, "Male": True},
    "robraven": {"Vtuber": False, "Male": True},
    "maxylobes": {"Vtuber": False, "Male": True},
    "asianguystream": {"Vtuber": False, "Male": True},
    "zentreya": {"Vtuber": True, "Male": False},
    "iahfy": {"Vtuber": True, "Male": False},
    "alicesawyer": {"Vtuber": True, "Male": False},
    "singsing": {"Vtuber": False, "Male": True},
    "burkeblack": {"Vtuber": False, "Male": True},
    "fextralife": {"Vtuber": False, "Male": True},
    "benfruit": {"Vtuber": False, "Male": True},
    "blinkx_": {"Vtuber": False, "Male": False},
    "aavak": {"Vtuber": False, "Male": True},
    "gothicbunni": {"Vtuber": True, "Male": False},
    "anniefuchsia": {"Vtuber": False, "Male": False},
    "hera": {"Vtuber": False, "Male": True},
    "therunningmanz": {"Vtuber": False, "Male": True},
    "vanessa_lopez_official": {"Vtuber": False, "Male": False},
    "tokki": {"Vtuber": False, "Male": False},
    "preachlfw": {"Vtuber": False, "Male": True},
    "sharonqueen": {"Vtuber": False, "Male": False},
    "ballindani": {"Vtuber": True, "Male": False},
    "javathecuptv": {"Vtuber": True, "Male": True},
    "lewpac": {"Vtuber": False, "Male": True},
    "sunshinevrc": {"Vtuber": True, "Male": False},
    "danehearth": {"Vtuber": False, "Male": True},
    "barbarousking": {"Vtuber": False, "Male": True},
    "tenha": {"Vtuber": False, "Male": True},
    "naguura": {"Vtuber": False, "Male": False},
    "veibae": {"Vtuber": True, "Male": False},
    "dexbonus": {"Vtuber": False, "Male": False},
    "retrogaijin": {"Vtuber": False, "Male": True},
    "faide": {"Vtuber": False, "Male": True},
    "dangheesling": {"Vtuber": False, "Male": True},
    "harukakaribu": {"Vtuber": True, "Male": False},
    "ds_lily": {"Vtuber": False, "Male": False},
    "orzanel": {"Vtuber": False, "Male": True},
    "peachmilky": {"Vtuber": True, "Male": False},
    "kandyland": {"Vtuber": False, "Male": False},
    "teepee": {"Vtuber": False, "Male": True},
    "xqc": {"Vtuber": False, "Male": True},
    "atk": {"Vtuber": False, "Male": True},
    "aiyanya": {"Vtuber": True, "Male": False},
    "faxuty": {"Vtuber": False, "Male": True},
    "pengu": {"Vtuber": False, "Male": True},
    "lyasyaa": {"Vtuber": False, "Male": False},
    "arnie": {"Vtuber": False, "Male": True},
    "fatalseductions": {"Vtuber": False, "Male": False},
    "cr1tdota": {"Vtuber": False, "Male": True},
    "missmikkaa": {"Vtuber": False, "Male": False},
    "thejrm_": {"Vtuber": False, "Male": True},
    "bestdadtuber": {"Vtuber": True, "Male": True},
    "elosanta": {"Vtuber": False, "Male": True},
    "imls": {"Vtuber": False, "Male": True},
    "ariasaki": {"Vtuber": False, "Male": False},
    "starsmitten": {"Vtuber": True, "Male": False},
    "basilwoof": {"Vtuber": True, "Male": True},
    "cerianmusic": {"Vtuber": False, "Male": False},
    "dyanna": {"Vtuber": False, "Male": False},
    "abdulhd": {"Vtuber": False, "Male": True},
    "spear_shot": {"Vtuber": False, "Male": True},
    "flats": {"Vtuber": False, "Male": True},
    "boraslegend": {"Vtuber": False, "Male": True},
    "trickywi": {"Vtuber": True, "Male": False},
    "gingitv": {"Vtuber": False, "Male": True},
    "maggiekarp_": {"Vtuber": False, "Male": False},
    "axel_tv": {"Vtuber": False, "Male": True},
    "72hrs": {"Vtuber": False, "Male": True},
    "mitchjones": {"Vtuber": False, "Male": True},
    "gateoo": {"Vtuber": True, "Male": True},
    "katarinafps": {"Vtuber": False, "Male": False},
    "kurumii": {"Vtuber": True, "Male": False},
    "jokerdtv": {"Vtuber": False, "Male": True},
    "yuzupyon": {"Vtuber": True, "Male": False},
    "bunniejin": {"Vtuber": False, "Male": False},
    "thestockguy": {"Vtuber": False, "Male": True},
    "vombuz": {"Vtuber": False, "Male": True},
    "oroboro": {"Vtuber": False, "Male": True},
    "limealicious": {"Vtuber": True, "Male": False},
    "aleks": {"Vtuber": False, "Male": True},
    "ikumi": {"Vtuber": True, "Male": False},
    "saintvicious": {"Vtuber": False, "Male": True},
    "kragiee": {"Vtuber": False, "Male": True},
    "maven": {"Vtuber": False, "Male": True},
    "meatihs": {"Vtuber": False, "Male": True},
    "alkaizerx": {"Vtuber": False, "Male": True},
    "hambinooo": {"Vtuber": False, "Male": True},
    "robcdee": {"Vtuber": False, "Male": True},
    "emilyywang": {"Vtuber": False, "Male": False},
    "baker": {"Vtuber": False, "Male": True},
    "travpiper": {"Vtuber": True, "Male": False},
    "nalithea": {"Vtuber": False, "Male": True},
    "lhcloudy27-1": {"Vtuber": False, "Male": True},
    "carmilllia": {"Vtuber": True, "Male": False},
    "chrisnxtdoor": {"Vtuber": False, "Male": True},
    "jhinxx_": {"Vtuber": True, "Male": False},
    "ainrun": {"Vtuber": False, "Male": True},
    "greekgodx": {"Vtuber": False, "Male": True},
    "martyguyz": {"Vtuber": False, "Male": True},
    "akiwoo": {"Vtuber": True, "Male": False},
    "gammainkk": {"Vtuber": True, "Male": False},
    "dunkorslam": {"Vtuber": False, "Male": True},
    "liqudwifi": {"Vtuber": False, "Male": True},
    "merk": {"Vtuber": False, "Male": True},
    "yagurlshelly": {"Vtuber": True, "Male": False},
    "antielitz": {"Vtuber": False, "Male": True},
    "shaiquera": {"Vtuber": False, "Male": False},
    "nepenthez": {"Vtuber": False, "Male": True},
    "kxpture": {"Vtuber": False, "Male": True},
    "spyro_za": {"Vtuber": False, "Male": True},
    "thatpunchkid": {"Vtuber": False, "Male": True},
    "leopard": {"Vtuber": False, "Male": True},
    "babynikki": {"Vtuber": False, "Male": False},
    "dailydasher": {"Vtuber": False, "Male": True},
    "klean": {"Vtuber": False, "Male": True},
    "snuffy": {"Vtuber": True, "Male": False},
    "unknownxarmy": {"Vtuber": False, "Male": True},
    "chrisheroes": {"Vtuber": False, "Male": True},
    "rubee": {"Vtuber": True, "Male": False},
    "apricot": {"Vtuber": True, "Male": False},
    "break": {"Vtuber": False, "Male": True},
    "xaeela": {"Vtuber": True, "Male": False},
    "ariannafoxton": {"Vtuber": True, "Male": False},
    "sukidingels": {"Vtuber": True, "Male": False},
    "shuusakurai": {"Vtuber": True, "Male": True},
    "ninaninin": {"Vtuber": True, "Male": False},
    "niamocha": {"Vtuber": True, "Male": False},
    "dyarikku": {"Vtuber": True, "Male": False},
    "thijs": {"Vtuber": False, "Male": True},
    "p4wnyhof": {"Vtuber": False, "Male": True},
    "lilsimsie": {"Vtuber": False, "Male": False},
    "redbeard": {"Vtuber": False, "Male": True},
    "sleepy_project": {"Vtuber": True, "Male": False},
    "husher_x": {"Vtuber": True, "Male": False},
    "deltiasgaming": {"Vtuber": False, "Male": True},
    "39daph": {"Vtuber": True, "Male": False},
    "skill4ltu": {"Vtuber": False, "Male": True},
    "otzdarva": {"Vtuber": False, "Male": True},
    "oliviamonroe": {"Vtuber": True, "Male": False},
    "murdercrumpet": {"Vtuber": True, "Male": False},
    "jingggxd": {"Vtuber": False, "Male": True},
    "prod": {"Vtuber": False, "Male": True},
    "dinossindgeil": {"Vtuber": False, "Male": True},
    "grimmivt": {"Vtuber": True, "Male": False},
    "beast1k": {"Vtuber": False, "Male": True},
    "ksonsouchou": {"Vtuber": True, "Male": False},
    "mary": {"Vtuber": False, "Male": False},
    "girl_dm_": {"Vtuber": True, "Male": False},
    "nyanners": {"Vtuber": True, "Male": False},
    "hiswattson": {"Vtuber": False, "Male": True},
    "shadeless": {"Vtuber": False, "Male": True},
    "ironmouse": {"Vtuber": True, "Male": False},
    "aspiringspike": {"Vtuber": False, "Male": True},
    "lol_nemesis": {"Vtuber": False, "Male": True},
    "zethiann": {"Vtuber": False, "Male": True},
    "baalorlord": {"Vtuber": False, "Male": True},
    "curvyelephant": {"Vtuber": False, "Male": True},
    "pokelawls": {"Vtuber": False, "Male": True},
    "emongg": {"Vtuber": False, "Male": True},
    "dezignful": {"Vtuber": False, "Male": True},
    "kyedae": {"Vtuber": False, "Male": False},
    "incon": {"Vtuber": False, "Male": True},
    "silvervale": {"Vtuber": True, "Male": False},
    "zachmazer": {"Vtuber": False, "Male": True},
    "amaz": {"Vtuber": False, "Male": True},
    "ambiguousamphibian": {"Vtuber": False, "Male": True},
    "a2guapo": {"Vtuber": False, "Male": True},
    "ohmwrecker": {"Vtuber": False, "Male": True},
    "faith": {"Vtuber": False, "Male": False},
    "willerz": {"Vtuber": False, "Male": True},
    "cringer": {"Vtuber": False, "Male": True},
    "salt": {"Vtuber": False, "Male": True},
    "olofmeister": {"Vtuber": False, "Male": True},
    "broyouwack": {"Vtuber": False, "Male": True},
    "stal": {"Vtuber": True, "Male": False},
    "feer": {"Vtuber": False, "Male": True},
    "lhcloudy27": {"Vtuber": False, "Male": True},
    "sevensins": {"Vtuber": False, "Male": True},
    "gmhikaru": {"Vtuber": False, "Male": True},
    "scrapie": {"Vtuber": False, "Male": True},
    "jonbams": {"Vtuber": False, "Male": True},
    "abe": {"Vtuber": True, "Male": True},
    "aceofspadesow": {"Vtuber": False, "Male": True},
    "bogur": {"Vtuber": False, "Male": True},
    "komemos": {"Vtuber": False, "Male": True},
    "spicyuuu": {"Vtuber": False, "Male": False},
    "goldbridgetv": {"Vtuber": False, "Male": True},
    "noodlewolfvt": {"Vtuber": True, "Male": False},
    "cohhcarnage": {"Vtuber": False, "Male": True},
    "hiko": {"Vtuber": False, "Male": True},
    "ac7ionman": {"Vtuber": False, "Male": True},
    "moosebrother": {"Vtuber": False, "Male": True},
    "maudado": {"Vtuber": False, "Male": True},
    "nediavr": {"Vtuber": True, "Male": False},
    "sorry": {"Vtuber": True, "Male": False},
    "sp4zie": {"Vtuber": False, "Male": True},
    "kyokeru": {"Vtuber": True, "Male": False},
    "shrodingerlee": {"Vtuber": False, "Male": True},
    "overezeggs": {"Vtuber": True, "Male": False},
    "rennslyaer": {"Vtuber": True, "Male": False},
    "dannyaarons": {"Vtuber": False, "Male": True},
    "nohandsgamer": {"Vtuber": False, "Male": True},
    "okcode": {"Vtuber": True, "Male": True},
    "hashtag_stokes": {"Vtuber": False, "Male": True},
    "trey24k": {"Vtuber": False, "Male": True},
    "hens333": {"Vtuber": False, "Male": True},
    "kuugels": {"Vtuber": False, "Male": False},
    "admiralbulldog": {"Vtuber": False, "Male": True},
    "naowh": {"Vtuber": False, "Male": True},
    "taxi2g": {"Vtuber": False, "Male": True},
    "peeve": {"Vtuber": False, "Male": True},
    "elajjaz": {"Vtuber": False, "Male": True},
    "longlivequebec": {"Vtuber": False, "Male": True},
    "zealsambitious": {"Vtuber": False, "Male": True},
    "kyundere": {"Vtuber": True, "Male": False},
    "negnasu": {"Vtuber": True, "Male": False},
    "shadder2k": {"Vtuber": False, "Male": True},
    "nugiyen": {"Vtuber": False, "Male": True},
    "projektmelody": {"Vtuber": True, "Male": False},
    "kawaiigrin": {"Vtuber": True, "Male": False},
    "nymn": {"Vtuber": False, "Male": True},
    "zyblol": {"Vtuber": False, "Male": True},
    "gorgc": {"Vtuber": False, "Male": True},
    "circon": {"Vtuber": False, "Male": True},
    "castro_1021": {"Vtuber": False, "Male": True},
    "sinatraa": {"Vtuber": False, "Male": True},
    "tobii": {"Vtuber": False, "Male": True},
    "manvsgame": {"Vtuber": False, "Male": True},
    "florryworry": {"Vtuber": False, "Male": True},
    "2dkiri": {"Vtuber": False, "Male": False},
    "chey": {"Vtuber": True, "Male": False},
    "tobs": {"Vtuber": True, "Male": False},
    "tanetenshi": {"Vtuber": True, "Male": False},
    "midbeast": {"Vtuber": False, "Male": True},
    "onikanavt": {"Vtuber": True, "Male": False},
    "guru": {"Vtuber": False, "Male": True},
    "memorizer92": {"Vtuber": False, "Male": True},
    "meowmoonified": {"Vtuber": True, "Male": False},
    "thecrimsonblur": {"Vtuber": False, "Male": True},
    "jenazad": {"Vtuber": False, "Male": True},
    "diegosaurs": {"Vtuber": False, "Male": True},
    "unrational": {"Vtuber": False, "Male": True},
    "tsm_viss": {"Vtuber": False, "Male": True},
    "ceremor": {"Vtuber": False, "Male": True},
    "hirona": {"Vtuber": False, "Male": False},
    "zetalot": {"Vtuber": False, "Male": True},
    "momo": {"Vtuber": True, "Male": False},
    "fragnance": {"Vtuber": False, "Male": True},
    "iddqd": {"Vtuber": False, "Male": True},
    "munchking": {"Vtuber": False, "Male": True},
    "solorenektononly": {"Vtuber": False, "Male": True},
    "danucd": {"Vtuber": False, "Male": False},
    "hutchmf": {"Vtuber": False, "Male": True},
    "butters": {"Vtuber": False, "Male": True},
    "develique": {"Vtuber": False, "Male": False},
    "auteru": {"Vtuber": True, "Male": False},
    "mande": {"Vtuber": False, "Male": True},
    "tgh_sr": {"Vtuber": False, "Male": True},
    "keshaeuw": {"Vtuber": False, "Male": True},
    "uhsnow": {"Vtuber": False, "Male": True},
    "canniny": {"Vtuber": False, "Male": False},
    "richard_hammer": {"Vtuber": False, "Male": True},
    "foxplushy": {"Vtuber": True, "Male": False},
    "sequisha": {"Vtuber": False, "Male": True},
    "heyzeusherestoast": {"Vtuber": False, "Male": True},
    "tommykaylive": {"Vtuber": False, "Male": True},
    "kyliebitkin": {"Vtuber": False, "Male": False},
    "nidas": {"Vtuber": False, "Male": True},
    "jackcashew": {"Vtuber": False, "Male": True},
    "pohx": {"Vtuber": False, "Male": True},
    "amouranth": {"Vtuber": False, "Male": False},
    "sakuratsubasa": {"Vtuber": True, "Male": False},
    "vividlyvivi": {"Vtuber": True, "Male": False},
    "mmorpg": {"Vtuber": False, "Male": True},
    "tenma": {"Vtuber": True, "Male": False},
    "gingy": {"Vtuber": False, "Male": False},
    "tenz": {"Vtuber": False, "Male": True},
    "omie": {"Vtuber": False, "Male": True},
    "sick_nerd": {"Vtuber": False, "Male": True},
    "ming": {"Vtuber": False, "Male": True},
    "premiertwo": {"Vtuber": False, "Male": True},
    "cookielolxx": {"Vtuber": False, "Male": True},
    "bobbypoffgaming": {"Vtuber": False, "Male": True},
    "sco": {"Vtuber": False, "Male": True},
    "polysypher": {"Vtuber": False, "Male": True},
    "spammiej": {"Vtuber": False, "Male": True},
    "fenohs": {"Vtuber": False, "Male": True},
    "timo_redbeard": {"Vtuber": False, "Male": True},
    "loeya": {"Vtuber": False, "Male": False},
    "birdieboba": {"Vtuber": True, "Male": False},
    "jpf_14-1": {"Vtuber": False, "Male": True},
    "nats": {"Vtuber": False, "Male": True},
    "donkoalachannel": {"Vtuber": True, "Male": True},
    "cerisevt": {"Vtuber": True, "Male": False},
    "jpf_14": {"Vtuber": False, "Male": True},
    "lumituber": {"Vtuber": True, "Male": False},
    "deadlyslob": {"Vtuber": False, "Male": True},
    "marimari_en": {"Vtuber": True, "Male": False}
}

In [3]:
# TODO: add a field for trusted users. A user is trusted if they're a moderator or a long-time sub.
# TODO: Discard bot messages. Consider also discarding mod messages since they're often the only ones allowed to send links.
data = {} 
# of the form:
# {"channel1":
#    {
#     "messages": ["messsage1", "message2"...],
#     "bad_messages": [4, 18...], (indices of messages)
#     "viewers": 482 (average)
#    }
#  "channel2":
#   {...}
# }


# look through every data file
for filename in os.listdir("FullData"):
    if os.path.isfile("FullData/" + filename):
        # get channel name
        channel = filename.split("#")[1].split(".")[0]
        if not channel in data:
            data[channel] = {"viewers": [], "messages": [], "bad_messages": []}
        with open("FullData/" + filename, encoding='utf-8') as file:
            lines = []
            for line in file.readlines():
                # only care about timestamped lines. Others are overhead data that we don't mind.
                if line[0] == "[":
                    # get rid of the timestamp, we only want the message itself.
                    line = line[line.find("] ")+2:]
                    # check for standard message sent by a user.
                    if line[0] == "<":
                        lines.append(line)
                    # check for overhead message stating viewer count.
                    elif line[0:8] == "VIEWERS:":
                        # for now add every viewer count to a list for averaging later on.
                        data[channel]["viewers"].append(int(line[9:].replace("\xa0", "")))
                    # check for overhead message stating a user was banned
                    elif line[0:4] == "BAN:":
                        # find most recent message sent by banned user and mark as bad message.
                        for i, msg in reversed(list(enumerate(lines))):
                            if line[5:].split(" ")[0] + ">" in msg:
                                data[channel]["bad_messages"].append(len(data[channel]["messages"]) + i)
                                break
                    # check for overhead message stating a message was deleted.
                    elif line[0:8] == "DELETED:":
                        # find the deleted message and mark it as bad message.
                        for i, msg in reversed(list(enumerate(lines))):
                            if line[9:].split(" (")[0] + "> " + line[line.find(" (")+2:-1] in msg:
                                data[channel]["bad_messages"].append(i)
                                break
            # sort bad message indices.
            data[channel]["bad_messages"] = sorted(set(data[channel]["bad_messages"]))
            # remove names from messages and add to data.
            for temp in lines:
                temp = temp[temp.find(">")+2:]
                # non-functioning attempt at anonomizing @-mentions.
                #index = temp.find("@")
                #while index != -1:
                #    temp.replace(temp[index:temp.find(" ", index)], "@user")
                #    index = temp.find("@", index+1)
                data[channel]["messages"].append(temp)
# average viewer counts by channel and remove channels without viewer data.
removals = ["liqudwifi", "chrisheroes", "maudado", "ac7ionman", "shrodingerlee", "zealsambitious", "jpf_14-1", "jpf_14"]
for channel in data.keys():
    if len(data[channel]["viewers"]) == 0:
        if channel not in removals:
            removals.append(channel)
    else:
        avg_viewers = int(sum(data[channel]["viewers"]) / len(data[channel]["viewers"]))
        data[channel]["viewers"] = avg_viewers
for channel in removals:
    data.pop(channel)
discarded_channels = []
for channel in data.keys():
    if data[channel]["viewers"] >= 10000 or len(data[channel]["bad_messages"]) == 0:
        discarded_channels.append(channel)

formatted_data = []
# 293 channels
for channel in data.keys():
    if channel in discarded_channels:
        continue
    next_bad = 0
    for index in range(len(data[channel]["messages"])):
        row = []
        if next_bad < len(data[channel]["bad_messages"]) and data[channel]["bad_messages"][next_bad] == index:
            row = ["bad", data[channel]["messages"][index], channel, channeltags[channel]["Vtuber"], channeltags[channel]["Male"]]
            next_bad += 1
        else:
            row = ["good", data[channel]["messages"][index], channel, channeltags[channel]["Vtuber"], channeltags[channel]["Male"]]
        formatted_data.append(row)
del(data)
df = pd.DataFrame(formatted_data, columns=["status", "message", "channel", "Vtuber", "Male"])
del(formatted_data)

# Undersampling the training data
temp = df.groupby(['status']).size()
res = []
res.append(df[df.status == "bad"])
indices = df[df.status == "good"].index
random_indices = np.random.choice(indices, temp["bad"]*9, replace=False)
res.append(df.loc[random_indices])
undersampled_data = pd.concat(res)
full_train, full_test = train_test_split(undersampled_data, test_size=0.2)

# create smaller dataset for grid search
res = []
indices = df.index
random_indices = np.random.choice(indices, 10000, replace=False)
res.append(df.loc[random_indices])
grid_searchable_data = pd.concat(res)
grid_train, grid_test = train_test_split(grid_searchable_data, test_size=0.2)

In [4]:
#print("df size: " + str(df.shape[0]) + ", usdata size: " + str(undersampled_data.shape[0]) + ", gsdata size: " + str(grid_searchable_data.shape[0]))
#print(str(df[df.channel == "cirno_tv"].shape[0]))
#print(str(df[df.channel == "39daph"].shape[0]))
#for channel in undersampled_data.channel.unique():
#    print(channel + ": " + str(undersampled_data[undersampled_data.channel == channel]["status"].value_counts()["bad"]) + " / " + str(undersampled_data[undersampled_data.channel == channel].shape[0]))# + "; total: " + str(df[df.channel == channel]["status"].value_counts()["bad"]) + " / " + str(df[df.channel == channel].shape[0]))

In [5]:
del(df)
del(lines)
del(temp)
del(res)
del(indices)
del(channeltags)

In [6]:
#Vtubers, non-Vtubers, Men, Women, Vtuber men, Vtuber Women, non-Vtuber Men, non-Vtuber Women

vtubers_train = full_train[full_train["Vtuber"] == True]
nonvtubers_train = full_train[full_train["Vtuber"] == False]
men_train = full_train[full_train["Male"] == True]
women_train = full_train[full_train["Male"] == False]
vtubermen_train = full_train[(full_train["Vtuber"] == True) & (full_train["Male"] == True)]
vtuberwomen_train = full_train[(full_train["Vtuber"] == True) & (full_train["Male"] == False)]
nonvtubermen_train = full_train[(full_train["Vtuber"] == False) & (full_train["Male"] == True)]
nonvtuberwomen_train = full_train[(full_train["Vtuber"] == False) & (full_train["Male"] == False)]

vtubers_test = full_test[full_test["Vtuber"] == True]
nonvtubers_test = full_test[full_test["Vtuber"] == False]
men_test = full_test[full_test["Male"] == True]
women_test = full_test[full_test["Male"] == False]
vtubermen_test = full_test[(full_test["Vtuber"] == True) & (full_test["Male"] == True)]
vtuberwomen_test = full_test[(full_test["Vtuber"] == True) & (full_test["Male"] == False)]
nonvtubermen_test = full_test[(full_test["Vtuber"] == False) & (full_test["Male"] == True)]
nonvtuberwomen_test = full_test[(full_test["Vtuber"] == False) & (full_test["Male"] == False)]

In [7]:
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([('count_vectorizer', CountVectorizer()), 
                 ('tfidf', TfidfTransformer()),
                 ('bayes', MultinomialNB())])
grid = GridSearchCV(pipe, {
    'count_vectorizer__binary':(True, False),
    'count_vectorizer__ngram_range':((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),
    'bayes__alpha': (0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    'bayes__fit_prior': (True, False)
})
grid.fit(grid_train['message'], grid_train['status'])
grid.predict(grid_test['message'])
MNB_best = grid.best_params_

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(MNB_best)

Best parameter (CV score=1.000):
{'bayes__alpha': 0.01, 'bayes__fit_prior': True, 'count_vectorizer__binary': True, 'count_vectorizer__ngram_range': (1, 1)}


In [8]:
# grid search of linearSVC without 'balanced'
pipe = Pipeline([('count_vectorizer', CountVectorizer()), 
                 ('tfidf', TfidfTransformer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('svc', LinearSVC(dual = False, max_iter=200000))])
grid = GridSearchCV(pipe, {
    'count_vectorizer__binary': (True, False),
    'count_vectorizer__ngram_range': ((1, 1), (1, 2), (2, 2)),
    'svc__C': (0.00001, 0.0001)
})

grid.fit(grid_train['message'], grid_train['status'])
predict = grid.predict(grid_test['message'])
LSVC_best = grid.best_params_

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(LSVC_best)

Best parameter (CV score=0.999):
{'count_vectorizer__binary': True, 'count_vectorizer__ngram_range': (1, 1), 'svc__C': 1e-05}


In [9]:
# grid search of linearSVC with 'balanced'
pipe = Pipeline([('count_vectorizer', CountVectorizer()), 
                 ('tfidf', TfidfTransformer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('svc', LinearSVC(dual = False, max_iter=2000000, class_weight='balanced'))])
grid = GridSearchCV(pipe, {
    'count_vectorizer__binary':(True, False),
    'count_vectorizer__ngram_range': ((1, 1), (1, 2), (2, 2)),
    'svc__C': (8.9, 9.0, 9.5)
})

grid.fit(grid_train['message'], grid_train['status'])
predict = grid.predict(grid_test['message'])
BLSVC_best = grid.best_params_

print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(BLSVC_best)



Best parameter (CV score=0.999):
{'count_vectorizer__binary': True, 'count_vectorizer__ngram_range': (1, 2), 'svc__C': 8.9}


In [10]:
predictions = {}
predictions2 = {}

In [11]:
# Naive Bayes with full data and selected parameters

pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=MNB_best["count_vectorizer__binary"],
        ngram_range=MNB_best["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=MNB_best["bayes__alpha"],
        fit_prior=MNB_best["bayes__fit_prior"]
    ))
])
pipe.fit(full_train['message'], full_train['status'])

predictions["MultinomialNB trained on full data predicting on full data"] = {"predict": pipe.predict(full_test['message']), "data": full_test}
predictions["MultinomialNB trained on full data predicting on men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}
predictions["MultinomialNB trained on full data predicting on women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}
predictions["MultinomialNB trained on full data predicting on vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}
predictions["MultinomialNB trained on full data predicting on non vtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}
predictions["MultinomialNB trained on full data predicting on vtuber men"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}
predictions["MultinomialNB trained on full data predicting on vtuber women"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}
predictions["MultinomialNB trained on full data predicting on non vtuber men"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}
predictions["MultinomialNB trained on full data predicting on non vtuber women"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}


predictions2["MNB_full_full"] = {"predict": pipe.predict(full_test['message']), "data": full_test}
predictions2["MNB_full_men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}
predictions2["MNB_full_women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}
predictions2["MNB_full_vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}
predictions2["MNB_full_nonvtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}
predictions2["MNB_full_vtubermen"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}
predictions2["MNB_full_vtuberwomen"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}
predictions2["MNB_full_nonvtubermen"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}
predictions2["MNB_full_nonvtuberwomen"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}


In [12]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 20000, dual = False, C = LSVC_best["svc__C"]))
])
pipe.fit(full_train['message'], full_train['status'])

predictions["LinearSVC trained on full data predicting on full data"] = {"predict": pipe.predict(full_test['message']), "data": full_test}
predictions["LinearSVC trained on full data predicting on men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}
predictions["LinearSVC trained on full data predicting on women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}
predictions["LinearSVC trained on full data predicting on vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}
predictions["LinearSVC trained on full data predicting on non vtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}
predictions["LinearSVC trained on full data predicting on vtuber men"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}
predictions["LinearSVC trained on full data predicting on vtuber women"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}
predictions["LinearSVC trained on full data predicting on non vtuber men"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}
predictions["LinearSVC trained on full data predicting on non vtuber women"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}

predictions2["LSVC_full_full"] = {"predict": pipe.predict(full_test['message']), "data": full_test}
predictions2["LSVC_full_men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}
predictions2["LSVC_full_women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}
predictions2["LSVC_full_vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}
predictions2["LSVC_full_nonvtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}
predictions2["LSVC_full_vtubermen"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}
predictions2["LSVC_full_vtuberwomen"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}
predictions2["LSVC_full_nonvtubermen"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}
predictions2["LSVC_full_nonvtuberwomen"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}

In [13]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 20000, dual = False, class_weight='balanced', C = BLSVC_best["svc__C"]))
])
pipe.fit(full_train['message'], full_train['status'])

predictions["Balanced LinearSVC trained on full data predicting on full data"] = {"predict": pipe.predict(full_test['message']), "data": full_test}
predictions["Balanced LinearSVC trained on full data predicting on men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}
predictions["Balanced LinearSVC trained on full data predicting on women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}
predictions["Balanced LinearSVC trained on full data predicting on vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}
predictions["Balanced LinearSVC trained on full data predicting on non vtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}
predictions["Balanced LinearSVC trained on full data predicting on vtuber men"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}
predictions["Balanced LinearSVC trained on full data predicting on vtuber women"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}
predictions["Balanced LinearSVC trained on full data predicting on non vtuber men"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}
predictions["Balanced LinearSVC trained on full data predicting on non vtuber women"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}


predictions2["BLSVC_full_full"] = {"predict": pipe.predict(full_test['message']), "data": full_test}
predictions2["BLSVC_full_men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}
predictions2["bLSVC_full_women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}
predictions2["BLSVC_full_vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}
predictions2["BLSVC_full_nonvtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}
predictions2["BLSVC_full_vtubermen"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}
predictions2["BLSVC_full_vtuberwomen"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}
predictions2["BLSVC_full_nonvtubermen"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}
predictions2["BLSVC_full_nonvtuberwomen"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}

In [14]:
#pipe = Pipeline([
#    ('count_vectorizer', CountVectorizer(
#        binary=True
#    )),
#    ('tfidf', TfidfTransformer()),
#    ('bayes', MultinomialNB(
#        #alpha=grid.best_params_["bayes__alpha"],
#        #fit_prior=grid.best_params_["bayes__fit_prior"]
#    ))
#])
#pipe.fit(vtubers_train['message'], vtubers_train['status'])
#
#predictions["MultinomialNB trained on vtubers predicting on vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}

In [15]:
#

In [16]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=MNB_best["count_vectorizer__binary"],
        ngram_range=MNB_best["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=MNB_best["bayes__alpha"],
        fit_prior=MNB_best["bayes__fit_prior"]
    ))
])
pipe.fit(vtubers_train['message'], vtubers_train['status'])

predictions["MultinomialNB trained on vtubers predicting on vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}

predictions2["MNB_vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}

In [17]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 10000, dual = False, C = LSVC_best["svc__C"]))
])
pipe.fit(vtubers_train['message'], vtubers_train['status'])

predictions["LinearSVC trained on vtubers predicting on vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}

predictions2["LSVC_vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}

In [18]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 15000, dual = False, class_weight='balanced', C = BLSVC_best["svc__C"]))
])
pipe.fit(vtubers_train['message'], vtubers_train['status'])

predictions["Balanced LinearSVC trained on vtubers predicting on vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}

predictions2["BLSVC_vtubers"] = {"predict": pipe.predict(vtubers_test['message']), "data": vtubers_test}

In [19]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=MNB_best["count_vectorizer__binary"],
        ngram_range=MNB_best["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=MNB_best["bayes__alpha"],
        fit_prior=MNB_best["bayes__fit_prior"]
    ))
])
pipe.fit(nonvtubers_train['message'], nonvtubers_train['status'])

predictions["MultinomialNB trained on non vtubers predicting on non vtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}

predictions2["MNB_nonvtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}

In [20]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 10000, dual = False, C = LSVC_best["svc__C"]))
])
pipe.fit(nonvtubers_train['message'], nonvtubers_train['status'])

predictions["LinearSVC trained on non vtubers predicting on non vtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}

predictions2["LSVC_nonvtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}

In [21]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 15000, dual = False, class_weight='balanced', C = BLSVC_best["svc__C"]))
])
pipe.fit(nonvtubers_train['message'], nonvtubers_train['status'])

predictions["Balanced LinearSVC trained on non vtubers predicting on non vtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}

predictions2["BLSVC_nonvtubers"] = {"predict": pipe.predict(nonvtubers_test['message']), "data": nonvtubers_test}

In [22]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=MNB_best["count_vectorizer__binary"],
        ngram_range=MNB_best["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=MNB_best["bayes__alpha"],
        fit_prior=MNB_best["bayes__fit_prior"]
    ))
])
pipe.fit(men_train['message'], men_train['status'])

predictions["MultinomialNB trained on men predicting on men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}

predictions2["MNB_men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}

In [23]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 10000, dual = False, C = LSVC_best["svc__C"]))
])
pipe.fit(men_train['message'], men_train['status'])

predictions["LinearSVC trained on men predicting on men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}

predictions2["LSVC_men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}

In [24]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 15000, dual = False, class_weight='balanced', C = BLSVC_best["svc__C"]))
])
pipe.fit(men_train['message'], men_train['status'])

predictions["Balanced LinearSVC trained on men predicting on men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}

predictions2["BLSVC_men"] = {"predict": pipe.predict(men_test['message']), "data": men_test}

In [25]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=MNB_best["count_vectorizer__binary"],
        ngram_range=MNB_best["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=MNB_best["bayes__alpha"],
        fit_prior=MNB_best["bayes__fit_prior"]
    ))
])
pipe.fit(women_train['message'], women_train['status'])

predictions["MultinomialNB trained on women predicting on women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}

predictions2["MNB_women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}

In [26]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 10000, dual = False, C = LSVC_best["svc__C"]))
])
pipe.fit(women_train['message'], women_train['status'])

predictions["LinearSVC trained on women predicting on women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}

predictions2["LSVC_women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}

In [27]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 15000, dual = False, class_weight='balanced', C = BLSVC_best["svc__C"]))
])
pipe.fit(women_train['message'], women_train['status'])

predictions["Balanced LinearSVC trained on women predicting on women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}

predictions2["BLSVC_women"] = {"predict": pipe.predict(women_test['message']), "data": women_test}

In [28]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=MNB_best["count_vectorizer__binary"],
        ngram_range=MNB_best["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=MNB_best["bayes__alpha"],
        fit_prior=MNB_best["bayes__fit_prior"]
    ))
])
pipe.fit(vtubermen_train['message'], vtubermen_train['status'])

predictions["MultinomialNB trained on vtuber men predicting on vtuber men"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}

predictions2["MNB_vtubermen"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}

In [29]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 30000, dual = False, C = LSVC_best["svc__C"]))
])
pipe.fit(vtubermen_train['message'], vtubermen_train['status'])

predictions["LinearSVC trained on vtuber men predicting on vtuber men"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}

predictions2["LSVC_vtubermen"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}

In [30]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 50000, dual = False, class_weight='balanced', C = BLSVC_best["svc__C"]))
])
pipe.fit(vtubermen_train['message'], vtubermen_train['status'])

predictions["Balanced LinearSVC trained on vtuber men predicting on vtuber men"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}

predictions2["BLSVC_vtubermen"] = {"predict": pipe.predict(vtubermen_test['message']), "data": vtubermen_test}

In [31]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=MNB_best["count_vectorizer__binary"],
        ngram_range=MNB_best["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=MNB_best["bayes__alpha"],
        fit_prior=MNB_best["bayes__fit_prior"]
    ))
])
pipe.fit(vtuberwomen_train['message'], vtuberwomen_train['status'])

predictions["MultinomialNB trained on vtuber women predicting on vtuber women"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}

predictions2["MNB_vtuberwomen"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}

In [32]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 10000, dual = False, C = LSVC_best["svc__C"]))
])
pipe.fit(vtuberwomen_train['message'], vtuberwomen_train['status'])

predictions["LinearSVC trained on vtuber women predicting on vtuber women"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}

predictions2["LSVC_vtuberwomen"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}

In [33]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 15000, dual = False, class_weight='balanced', C = BLSVC_best["svc__C"]))
])
pipe.fit(vtuberwomen_train['message'], vtuberwomen_train['status'])

predictions["Balanced LinearSVC trained on vtuber women predicting on vtuber women"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}

predictions2["BLSVC_vtuberwomen"] = {"predict": pipe.predict(vtuberwomen_test['message']), "data": vtuberwomen_test}

In [34]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=MNB_best["count_vectorizer__binary"],
        ngram_range=MNB_best["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=MNB_best["bayes__alpha"],
        fit_prior=MNB_best["bayes__fit_prior"]
    ))
])
pipe.fit(nonvtubermen_train['message'], nonvtubermen_train['status'])

predictions["MultinomialNB trained on non vtuber men predicting on non vtuber men"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}

predictions2["MNB_nonvtubermen"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}

In [35]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 10000, dual = False, C = LSVC_best["svc__C"]))
])
pipe.fit(nonvtubermen_train['message'], nonvtubermen_train['status'])

predictions["LinearSVC trained on non vtuber men predicting on non vtuber men"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}

predictions2["LSVC_nonvtubermen"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}

In [36]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 15000, dual = False, class_weight='balanced', C = BLSVC_best["svc__C"]))
])
pipe.fit(nonvtubermen_train['message'], nonvtubermen_train['status'])

predictions["Balanced LinearSVC trained on non vtuber men predicting on non vtuber men"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}

predictions2["BLSVC_nonvtubermen"] = {"predict": pipe.predict(nonvtubermen_test['message']), "data": nonvtubermen_test}

In [37]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(
        binary=MNB_best["count_vectorizer__binary"],
        ngram_range=MNB_best["count_vectorizer__ngram_range"]
    )),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB(
        alpha=MNB_best["bayes__alpha"],
        fit_prior=MNB_best["bayes__fit_prior"]
    ))
])
pipe.fit(nonvtuberwomen_train['message'], nonvtuberwomen_train['status'])

predictions["MultinomialNB trained on non vtuber women predicting on non vtuber women"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}

predictions2["MNB_nonvtuberwomen"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}

In [38]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 10000, dual = False, C = LSVC_best["svc__C"]))
])
pipe.fit(nonvtuberwomen_train['message'], nonvtuberwomen_train['status'])

predictions["LinearSVC trained on non vtuber women predicting on non vtuber women"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}

predictions2["LSVC_nonvtuberwomen"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}

In [39]:
pipe = Pipeline([
    ('count_vectorizer', CountVectorizer(binary=BLSVC_best["count_vectorizer__binary"], ngram_range=BLSVC_best["count_vectorizer__ngram_range"])),
    ('tfidf', TfidfTransformer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', LinearSVC(max_iter = 15000, dual = False, class_weight='balanced', C = BLSVC_best["svc__C"]))
])
pipe.fit(nonvtuberwomen_train['message'], nonvtuberwomen_train['status'])

predictions["Balanced LinearSVC trained on non vtuber women predicting on non vtuber women"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}

predictions2["BLSVC_nonvtuberwomen"] = {"predict": pipe.predict(nonvtuberwomen_test['message']), "data": nonvtuberwomen_test}

In [74]:
res = []
keys = list(predictions2.keys())
training_sizes = [len(full_train), len(men_train), len(women_train), len(vtubers_train), len(nonvtubers_train),
                  len(vtubermen_train), len(vtuberwomen_train), len(nonvtubermen_train), len(nonvtuberwomen_train)]
test_sizes = [len(full_test), len(men_test), len(women_test), len(vtubers_test), len(nonvtubers_test),
              len(vtubermen_test), len(vtuberwomen_test), len(nonvtubermen_test), len(nonvtuberwomen_test)]

print("full: ", len(full_train))
print("men: ", len(men_train))
print("women: ", len(women_train))
print("vtubers: ", len(vtubers_train))
print("nonvtubers: ", len(nonvtubers_train))
print("vtubermen: ", len(vtubermen_train))
print("vtuberwomen: ", len(vtuberwomen_train))
print("nonvtubermen: ", len(nonvtubermen_train))
print("nonvtuberwomen: ", len(nonvtuberwomen_train))

keys[27], keys[28], keys[29], keys[33], keys[34], keys[35] = keys[33], keys[34], keys[35], keys[27], keys[28], keys[29]
keys[30], keys[31], keys[32], keys[36], keys[37], keys[38] = keys[36], keys[37], keys[38], keys[30], keys[31], keys[32]


count1 = 0
count2 = 3
phase1 = True
for key in keys:
    train_size=0
    test_size=0
    if phase1:
        train_size = training_sizes[0]
        test_size = test_sizes[count1]
        count1 += 1
        if count1 >= len(training_sizes):
            count1 = 0
            count2 -= 1
            if count2 <= 0:
                phase1 = False
                count2 = 3
                count1 = 1
    else:
        train_size = training_sizes[count1]
        test_size = test_sizes[count1]
        count2 -= 1
        if count2 <= 0:
            count1 += 1
            count2 = 3
    predict = predictions2[key]["predict"]
    data = predictions2[key]["data"]
    class_rep = classification_report(data['status'], predict, output_dict = True)
    res.append([class_rep["bad"]["precision"], class_rep["bad"]["recall"], train_size, test_size])

full:  418616
men:  279156
women:  139460
vtubers:  123485
nonvtubers:  295131
vtubermen:  16198
vtuberwomen:  107287
nonvtubermen:  262958
nonvtuberwomen:  32173


In [75]:


res_df = pd.DataFrame(res, keys, ["precision", "recall", "training data", "test data"])
print(res_df)


                           precision    recall  training data  test data
MNB_full_full               0.778139  0.318787         418616     104654
MNB_full_men                0.783439  0.320352         418616      70065
MNB_full_women              0.760870  0.313646         418616      34589
MNB_full_vtubers            0.772014  0.361174         418616      30808
MNB_full_nonvtubers         0.779598  0.310199         418616      73846
MNB_full_vtubermen          0.563636  0.171271         418616       4137
MNB_full_vtuberwomen        0.786822  0.382778         418616      26671
MNB_full_nonvtubermen       0.787168  0.323776         418616      65928
MNB_full_nonvtuberwomen     0.676471  0.186343         418616       7918
LSVC_full_full              0.728707  0.373360         418616     104654
LSVC_full_men               0.736493  0.371946         418616      70065
LSVC_full_women             0.704632  0.378004         418616      34589
LSVC_full_vtubers           0.688688  0.429458     

In [60]:
for key in predictions2:
    print(key)

MNB_full_full
MNB_full_men
MNB_full_women
MNB_full_vtubers
MNB_full_nonvtubers
MNB_full_vtubermen
MNB_full_vtuberwomen
MNB_full_nonvtubermen
MNB_full_nonvtuberwomen
LSVC_full_full
LSVC_full_men
LSVC_full_women
LSVC_full_vtubers
LSVC_full_nonvtubers
LSVC_full_vtubermen
LSVC_full_vtuberwomen
LSVC_full_nonvtubermen
LSVC_full_nonvtuberwomen
BLSVC_full_full
BLSVC_full_men
bLSVC_full_women
BLSVC_full_vtubers
BLSVC_full_nonvtubers
BLSVC_full_vtubermen
BLSVC_full_vtuberwomen
BLSVC_full_nonvtubermen
BLSVC_full_nonvtuberwomen
MNB_vtubers
LSVC_vtubers
BLSVC_vtubers
MNB_nonvtubers
LSVC_nonvtubers
BLSVC_nonvtubers
MNB_men
LSVC_men
BLSVC_men
MNB_women
LSVC_women
BLSVC_women
MNB_vtubermen
LSVC_vtubermen
BLSVC_vtubermen
MNB_vtuberwomen
LSVC_vtuberwomen
BLSVC_vtuberwomen
MNB_nonvtubermen
LSVC_nonvtubermen
BLSVC_nonvtubermen
MNB_nonvtuberwomen
LSVC_nonvtuberwomen
BLSVC_nonvtuberwomen


In [41]:
#def displayresults(key):
#    predict = predictions[key]["predict"]
#    data = predictions[key]["data"]
#    
#    good_bad_count = {"good": 0, "bad": 0}
#    for guess in predict:
#            good_bad_count[guess] += 1
#    print(key)
#    print(good_bad_count)
#    print(classification_report(data['status'], predict))
#    print()

In [42]:
#for key in predictions:
#    displayresults(key)

In [43]:
# fixa så att jag kan fortsätta träna under körning. Testa detta med ett par kanaler som exkluderas från huvudträningen.