In [1]:
# Imports
# Load all necesary dependencies
import spacy
import json
import pandas as pd
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from textblob import TextBlob


In [2]:
# Load Files

# Load both files to be able to process them
with open('train-data-prepared.json', 'r') as f:
  raw_train_statements = json.load(f)
#raw_train_statements=pd.read_json('train-data-prepared.json')
with open('val-data-prepared.json', 'r') as f:
  raw_val_statements = json.load(f)

In [3]:
# Clean text function
nlp = spacy.load("en_core_web_sm")
def clean_text(text: str):
    # Parse the text using the English language model
    # The returned object is an iterator over all tokens
    parsed_text = nlp(text)
    # Initialize a list which will later hold the tokens of the text
    tokenized_clean_text = []
    
    # For each token in the text...
    for token in parsed_text:
        # If the token is _not_ one of the following, append it to
        # the final list of tokens; continue otherwise
        if (not token.is_punct and  # Punctuation
                not token.is_space and  # Whitespace of any kind
                not token.like_url and
                not token.is_stop and 
                token.text != ">" and 
                token.text != "<"
                ): # url
            tokenized_clean_text.append(token.text)
    
    # Return the list of clean tokens for this text
    tokenized_clean_text=' '.join(tokenized_clean_text)
    return tokenized_clean_text


In [4]:
# Create DataFram for Training data
trainDF = pd.DataFrame()

# Store preceding posts, ids, and labels,
trainDF['preceding_posts'] = [statement['preceding_posts'] for statement in raw_train_statements]
trainDF['id'] = [statement['id'] for statement in raw_train_statements]
trainDF['label'] = [statement['label'] for statement in raw_train_statements]

preceding_posts_clean = []

# Iterate over post to extract body and clean text
for statement in trainDF['preceding_posts']:
    body_elements = []
    for body in statement:
        body_elements.append(clean_text(body['body']))
    preceding_posts_clean.append(body_elements)

# Create one single string for all the post in one dialog
trainDF['preceding_posts_body_clean'] = preceding_posts_clean 
trainDF['preceding_posts_body_clean'] = trainDF['preceding_posts_body_clean'].apply(lambda x:" ".join(x))

# Store cleaned words for each post
trainDF['preceding_posts_body_clean_sections'] = preceding_posts_clean 



In [5]:
# Create DataFram for Validation data
validDF = pd.DataFrame()

# Store preceding posts, ids, and labels,
validDF['preceding_posts'] = [statement['preceding_posts'] for statement in raw_val_statements]
validDF['id'] = [statement['id'] for statement in raw_val_statements]
validDF['label'] = [statement['label'] for statement in raw_val_statements]

preceding_posts_clean = []

# Iterate over post to extract body and clean text
for statement in validDF['preceding_posts']:
    body_elements = []
    for body in statement:
        body_elements.append(clean_text(body['body']))
    preceding_posts_clean.append(body_elements)

# Create one single string for all the post in one dialog
validDF['preceding_posts_body_clean'] = preceding_posts_clean 
validDF['preceding_posts_body_clean']=validDF['preceding_posts_body_clean'].apply(lambda x:" ".join(x))

# Store cleaned words for each post
validDF['preceding_posts_body_clean_sections'] = preceding_posts_clean 


In [6]:
# Pre-defined bad words list
bad_words_list = ["4r5e", "5h1t", "5hit", "a55", "anal", "anus", "ar5e", "arrse", "arse", "ass", "ass-fucker", "asses", "assfucker", "assfukka", "asshole", "assholes",  "asswhole", "a_s_s", "b!tch", "b00bs", "b17ch", "b1tch", "ballbag", "balls", "ballsack", "bastard", "beastial", "beastiality", "bellend", "bestial", "bestiality", "bi+ch", "biatch", "bitch", "bitcher", "bitchers", "bitches", "bitchin", "bitching", "bloody", "blow job", "blowjob", "blowjobs", "boiolas", "bollock", "bollok", "boner", "boob", "boobs", "booobs", "boooobs", "booooobs", "booooooobs", "breasts", "buceta", "bugger", "bum", "bunny fucker", "butt", "butthole", "buttmuch", "buttplug", "c0ck", "c0cksucker", "carpet muncher", "cawk", "chink", "cipa", "cl1t", "clit", "clitoris", "clits", "cnut", "cock", "cock-sucker", "cockface", "cockhead", "cockmunch", "cockmuncher", "cocks", "cocksuck", "cocksucked", "cocksucker", "cocksucking", "cocksucks", "cocksuka", "cocksukka", "cok", "cokmuncher", "coksucka", "coon", "cox", "crap", "cum", "cummer", "cumming", "cums", "cumshot", "cunilingus", "cunillingus", "cunnilingus", "cunt", "cuntlick", "cuntlicker", "cuntlicking", "cunts", "cyalis", "cyberfuc", "cyberfuck", "cyberfucked", "cyberfucker", "cyberfuckers", "cyberfucking", "d1ck", "damn", "dick", "dickhead", "dildo", "dildos", "dink", "dinks", "dirsa", "dlck", "dog-fucker", "doggin", "dogging", "donkeyribber", "doosh", "duche", "dyke", "ejaculate", "ejaculated", "ejaculates", "ejaculating", "ejaculatings", "ejaculation", "ejakulate", "f u c k", "f u c k e r", "f4nny", "fag", "fagging", "faggitt", "faggot", "faggs", "fagot", "fagots", "fags", "fanny", "fannyflaps", "fannyfucker", "fanyy", "fatass", "fcuk", "fcuker", "fcuking", "feck", "fecker", "felching", "fellate", "fellatio", "fingerfuck", "fingerfucked", "fingerfucker", "fingerfuckers", "fingerfucking", "fingerfucks", "fistfuck", "fistfucked", "fistfucker", "fistfuckers", "fistfucking", "fistfuckings", "fistfucks", "flange", "fook", "fooker", "fuck", "fucka", "fucked", "fucker", "fuckers", "fuckhead", "fuckheads", "fuckin", "fucking", "fuckings", "fuckingshitmotherfucker", "fuckme", "fucks", "fuckwhit", "fuckwit", "fudge packer", "fudgepacker", "fuk", "fuker", "fukker", "fukkin", "fuks", "fukwhit", "fukwit", "fux", "fux0r", "f_u_c_k", "gangbang", "gangbanged", "gangbangs", "gaylord", "gaysex", "goatse", "God", "god-dam", "god-damned", "goddamn", "goddamned", "hardcoresex", "hell", "heshe", "hoar", "hoare", "hoer", "homo", "hore", "horniest", "horny", "hotsex", "jack-off", "jackoff", "jap", "jerk-off", "jism", "jiz", "jizm", "jizz", "kawk", "knob", "knobead", "knobed", "knobend", "knobhead", "knobjocky", "knobjokey", "kock", "kondum", "kondums", "kum", "kummer", "kumming", "kums", "kunilingus", "l3i+ch", "l3itch", "labia", "lust", "lusting", "m0f0", "m0fo", "m45terbate", "ma5terb8", "ma5terbate", "masochist", "master-bate", "masterb8", "masterbat*", "masterbat3", "masterbate", "masterbation", "masterbations", "masturbate", "mo-fo", "mof0", "mofo", "mothafuck", "mothafucka", "mothafuckas", "mothafuckaz", "mothafucked", "mothafucker", "mothafuckers", "mothafuckin", "mothafucking", "mothafuckings", "mothafucks", "mother fucker", "motherfuck", "motherfucked", "motherfucker", "motherfuckers", "motherfuckin", "motherfucking", "motherfuckings", "motherfuckka", "motherfucks", "muff", "mutha", "muthafecker", "muthafuckker", "muther", "mutherfucker", "n1gga", "n1gger", "nazi", "nigg3r", "nigg4h", "nigga", "niggah", "niggas", "niggaz", "nigger", "niggers", "nob", "nob jokey", "nobhead", "nobjocky", "nobjokey", "numbnuts", "nutsack", "orgasim", "orgasims", "orgasm", "orgasms", "p0rn", "pawn", "pecker", "penis", "penisfucker", "phonesex", "phuck", "phuk", "phuked", "phuking", "phukked", "phukking", "phuks", "phuq", "pigfucker", "pimpis", "piss", "pissed", "pisser", "pissers", "pisses", "pissflaps", "pissin", "pissing", "pissoff", "poop", "porn", "porno", "pornography", "pornos", "prick", "pricks", "pron", "pube", "pusse", "pussi", "pussies", "pussy", "pussys", "rectum", "retard", "rimjaw", "rimming", "s hit", "s.o.b.", "sadist", "schlong", "screwing", "scroat", "scrote", "scrotum", "semen", "sex", "sh!+", "sh!t", "sh1t", "shag", "shagger", "shaggin", "shagging", "shemale", "shi+", "shit", "shitdick", "shite", "shited", "shitey", "shitfuck", "shitfull", "shithead", "shiting", "shitings", "shits", "shitted", "shitter", "shitters", "shitting", "shittings", "shitty", "skank", "slut", "sluts", "smegma", "smut", "snatch", "son-of-a-bitch", "spac", "spunk", "s_h_i_t", "t1tt1e5", "t1tties", "teets", "teez", "testical", "testicle", "tit", "titfuck", "tits", "titt", "tittie5", "tittiefucker", "titties", "tittyfuck", "tittywank", "titwank", "tosser", "turd", "tw4t", "twat", "twathead", "twatty", "twunt", "twunter", "v14gra", "v1gra", "vagina", "viagra", "vulva", "w00se", "wang", "wank", "wanker", "wanky", "whoar", "whore", "willies", "willy", "xrated", "xxx"]

# This will return if the bad words are increasing among the posts
# It will return 1 if this is the case, 0 if not.
def bad_word_counter(group_post):

    bad_words_counter = []
    for post in group_post:
        counter = 0
        
        for token in post.split():
            if (token.lower() in bad_words_list):
                counter = counter + 1
        bad_words_counter.append(counter)
    
    # if negative increased send 1, otherwise send false
    if (len(bad_words_counter) >= 2 ):
        return 1 if bad_words_counter[1] > bad_words_counter[0] else 0.0
    else:
        return 0.0

# This will count upper case words for the dialog
# Avoiding words with length <= 1 and some banned words
def upper_counter(sentence):
    counter = 0
    banned_list = ['TV','EU','LA','A.','I.','OP','SF','OC']
    for token in sentence.split():
        if (token.isupper() and len(token) > 1 and token not in banned_list):
            counter = counter + 1
    return counter 

# This will tell if the dialog has any controversiality
def controversiality_feature(preceding_posts) -> int:
    found = False
    for post_details in preceding_posts:
        if post_details['controversiality']:
            found = True
    return 1 if found else 0.0

# This will tell if the post has any controversiality attribute
# This will return 1 if it has it, 0 if it does not have it.
def violated_rule_feature(preceding_posts) -> int:
    counter = 0
    for post_details in preceding_posts:
        if post_details['violated_rule'] > 0 :
            counter = counter + post_details['violated_rule']
    return counter

# This will provide sentiment if sentiments is negative on the following post it will provide 1
# def sentiment_feature(preceding_posts, label) -> int:
def sentiment_feature(preceding_posts) -> int:
    counter = 0
    neg_records = [0 for _ in range(len(preceding_posts))]
    index = 0
    for post_details in preceding_posts:
        neg_phrase_count = 0

        paragraphs = post_details['body'].split('\n')
        for phrase in paragraphs:
            if(len(phrase) < 1 ):
                continue
            phrase_blob = TextBlob(phrase)
            
            # for this object Sentiment(polarity=-0.6, subjectivity=1.0) we use only polarity 
            sentiment_score = phrase_blob.sentiment[0]
            # If score is negative, count it as negative sentiment.
            if ( sentiment_score < 0.0 ):
                neg_phrase_count = neg_phrase_count + 1

        # Save total of negative polarities on the index post
        neg_records[index] = neg_phrase_count
        index = index + 1

    # if negative increased send 1, otherwise send false
    if (len(neg_records) >= 2 ):
        return 1 if neg_records[1] > neg_records[0] else 0.0
    else:
        return 0.0

In [7]:
# FEATURES SECCION

# Tfidf
v = TfidfVectorizer( ngram_range = (1,1), max_features=300)
v.fit(trainDF['preceding_posts_body_clean'].values.astype('U'))
x_train = v.transform(trainDF['preceding_posts_body_clean'].values.astype('U'))
x_test = v.transform(validDF['preceding_posts_body_clean'].values.astype('U'))

# Get Data Frame to add more features
df_train = pd.DataFrame(x_train.toarray(),columns=v.get_feature_names())
df_val = pd.DataFrame(x_test.toarray(),columns=v.get_feature_names())

# Bad Words
df_train['bad_words'] = trainDF['preceding_posts_body_clean_sections'].apply(bad_word_counter)
df_val['bad_words'] = validDF['preceding_posts_body_clean_sections'].apply(bad_word_counter)

# Sentiment     
df_train['sentiment'] = trainDF['preceding_posts'].apply(sentiment_feature)
df_val['sentiment'] = validDF['preceding_posts'].apply(sentiment_feature)

# UPPER
df_train['upper'] = trainDF['preceding_posts_body_clean'].apply(upper_counter)
df_val['upper'] = validDF['preceding_posts_body_clean'].apply(upper_counter)

# controversiality
df_train['controversiality'] = trainDF['preceding_posts'].apply(controversiality_feature)
df_val['controversiality'] = validDF['preceding_posts'].apply(controversiality_feature)

# violated_rule
df_train['violated_rule'] = trainDF['preceding_posts'].apply(violated_rule_feature)
df_val['violated_rule'] = validDF['preceding_posts'].apply(violated_rule_feature)



In [8]:
# Fit and predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
# fit
clf = LinearSVC().fit(df_train, trainDF["label"])

# predict and save values.
predicted = clf.predict(df_val)




In [9]:
# Score predictions
from sklearn.metrics import precision_score, f1_score, accuracy_score
p = precision_score(validDF["label"], predicted)
f1 = f1_score(validDF["label"], predicted)
a = accuracy_score(validDF["label"], predicted)
print("A={} P={} F1={}".format(a, p,f1))


A=0.6976744186046512 P=0.7073170731707317 F1=0.6904761904761904


In [10]:
# Json output
    # Generates data.json output.

output_dictionary={}
data = validDF['id'].tolist()
predicted_label = [str(x) for x in predicted]

for id , label in zip( data , predicted_label ):
    output_dictionary[id] = label

# This will be tha output file, data.json
with open('data.json', 'w') as json_file:
    json.dump(output_dictionary, json_file)