In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from IPython.display import clear_output

# Read Data and Clean

In [2]:
# Load the scraped Discord messages; the CSV's 'Unnamed: 0' column is used as the row index
df = pd.read_csv('../data/scraped data/league_discord.csv', index_col='Unnamed: 0')

In [3]:
# Preview the raw scrape (name, content, timestamp); pandas abbreviates the middle rows of a long frame
df

Unnamed: 0,name,content,timestamp
0,Ziul#2536,<:wutmo:513395072025493505>,2020-11-01T20:36:44.881000+00:00
1,_ItzChris_#7028,<:dogeKek:639726958091173910>,2020-11-01T20:36:59.732000+00:00
2,HeapsOfHamood#3085,https://cdn.discordapp.com/attachments/3660512...,2020-11-01T20:37:00.621000+00:00
3,_ItzChris_#7028,Lmfao,2020-11-01T20:37:09.857000+00:00
4,Ziul#2536,"bruh, the best teemo player from my country is...",2020-11-01T20:37:41.427000+00:00
...,...,...,...
4035,consolidation/p&L/ratio/tax#7495,<@!411507772094414850> https://media.discordap...,2020-10-01T06:13:04.420000+00:00
4036,Strike345#5659,should i pick viktor or ryze?,2020-10-01T06:13:10.040000+00:00
4037,Strike345#5659,i picked viktor <:WEIRD:665202887009501192>,2020-10-01T06:13:25.493000+00:00
4038,Digitally#2636,I mostly get 10/8 and still lose something lik...,2020-10-01T06:05:48.131000+00:00


In [4]:
# Only the message text is needed from here on; the double brackets keep the result a DataFrame
df = df.loc[:, ['content']]

In [5]:
# Keep the first occurrence of each distinct message (only the 'content' column remains at this point)
df = df.drop_duplicates()

In [6]:
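# Disabled: the label columns do not need to be pre-allocated, because each one is
# created when get_predictions assigns df[class_name] further down.
# df = df.reindex(columns=df.columns.to_list() + ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])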

In [7]:
# Clean the message text in three passes. regex=True is passed explicitly because
# pandas >= 2.0 treats the pattern as literal text by default, which would silently
# turn the two pattern-based passes into no-ops.

# Remove Discord emotes/mentions such as <:name:123> or <@!123>.
# The match is non-greedy so that each <...> token is removed on its own; a greedy
# '<.*>' would also delete all of the text between the first and the last token.
df.loc[:, 'content'] = df['content'].str.replace(r'<.*?>', '', regex=True)

# Remove URLs: 'http' followed by everything up to the next whitespace
df.loc[:, 'content'] = df['content'].str.replace(r'http\S+', '', regex=True)

# Remove newline characters, then trim leading/trailing whitespace
df.loc[:, 'content'] = df['content'].str.replace('\n', '', regex=False).str.strip()

In [8]:
# Drop messages that have no text left, including rows whose content is missing.
# A missing value (NaN) compares as "not equal" to "", so without the explicit
# notna() check it would pass this filter and later be handed to the vectorizer
# as a non-string value.
df = df[df['content'].notna() & (df['content'] != "")]

In [9]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column
df = df.reset_index(drop=True)

# Make Predictions Using Logistic Regression

## Setup

In [10]:
# Labels to predict: one saved model is loaded per entry, and each entry
# becomes a 0/1 column in the output frame.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

## Vectorizer

In [11]:
# Read the previously fitted vectorizer from disk.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only load files that were produced by a trusted source.
filename = 'log_reg_vectorizer.sav'
# The context manager closes the file handle even if unpickling fails
with open('../output/' + filename, 'rb') as vectorizer_file:
    word_vectorizer = pickle.load(vectorizer_file)

## Make Predictions

In [12]:
def get_predictions(model, threshold, x_test, class_name, frame=None):
    """Add a 0/1 column to a DataFrame from a model's predicted probabilities.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``. Column 1 of its output is
        used (presumably the positive class of a binary model; confirm against
        how the models were trained).
    threshold : float
        A row is flagged with 1 only when its probability is strictly greater
        than this value; otherwise it gets 0.
    x_test : array-like or sparse matrix
        Features passed straight to ``model.predict_proba``; must yield one
        probability row per row of the target frame.
    class_name : str
        Name of the column to create (or overwrite).
    frame : pandas.DataFrame, optional
        Frame that receives the column. When omitted, the notebook-level ``df``
        is used, which is the original behaviour.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    # Fall back to the notebook's global frame so existing four-argument calls still work
    if frame is None:
        frame = df

    # Keep only the second probability column
    predictions = model.predict_proba(x_test)[:, 1]

    # Binarize: 1 where the probability exceeds the threshold, else 0
    frame[class_name] = np.where(predictions > threshold, 1, 0)

In [13]:
# Vectorize every message once; the same feature matrix is scored by each per-class model
x_test = word_vectorizer.transform(df['content'])

# Cut-off applied to every class instead of the threshold stored with each model:
# we override the optimal threshold as we care more about false positives.
DECISION_THRESHOLD = 0.8

for class_name in classes:
    filename = 'log_reg_' + class_name + '.sav'

    # Each file holds a (model, threshold) pair. The context manager closes the
    # handle after every load instead of leaving one open file per class.
    # NOTE(security): pickle.load can execute arbitrary code; load trusted files only.
    with open('../output/' + filename, 'rb') as model_file:
        model, saved_threshold = pickle.load(model_file)

    # The saved threshold is deliberately ignored in favour of the stricter cut-off
    threshold = DECISION_THRESHOLD

    get_predictions(model, threshold, x_test, class_name)
    

In [14]:
# Spot-check the first rows: cleaned text plus one 0/1 column per class
df.head()

Unnamed: 0,content,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,real faker tweet,0,0,0,0,0,0
1,Lmfao,0,0,0,0,0,0
2,"bruh, the best teemo player from my country is...",0,0,0,0,0,0
3,me and nunu tiered 3 simped for adc,0,0,0,0,0,0
4,yeah but i think since seraphine was released ...,0,0,0,0,0,0


# Convert to CSV

In [16]:
# Write the labelled messages to disk; to_csv also writes the row index as the first (unnamed) column by default
df.to_csv('../data/predicted data/predicted_league_discord.csv')