In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls

drive  sample_data


In [None]:
!python -m spacy download en_core_web_lg #after running that cell restart the runtime and run everything but this cell

In [1]:
%cd drive/MyDrive/DataMining/Data_mining/src/models

[Errno 2] No such file or directory: 'drive/MyDrive/DataMining/Data_mining/src/models'
/content


In [None]:
currentdir =  "/content/drive/MyDrive/DataMining"

In [None]:
!pip install ray
!pip install datasets
!pip install demoji

In [None]:
from sklearn import preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn import BCEWithLogitsLoss
import pickle
from collections import defaultdict
from torch.optim import AdamW
import itertools
import ray
from sklearn.utils import resample
import sklearn.model_selection as ms
from sklearn.metrics import classification_report
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
import csv
from sklearn.metrics import f1_score, accuracy_score
import os.path
from tqdm import tqdm
from functools import partial
import psutil
import numpy as np
import pandas
from ast import literal_eval
import sys
import os
import copy

'\nmodule_path = os.path.abspath(os.path.join(\'..\'))\nmodule_path = os.path.join(module_path, "data")\nif module_path not in sys.path:\n    sys.path.append(module_path)\nprint(sys.path)\nimport preprocessing\n'

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
ray._private.utils.get_system_memory = lambda: psutil.virtual_memory().total

## ***Basic Neural Network Model***

In [None]:
class Net(nn.Module):

    def __init__(self, input_size, num_layers, max_layer_size, drop_out):
        super(Net, self).__init__()
        self.input_size = input_size
        self.num_layers = num_layers
        self.max_layer_size = max_layer_size
        in_size = max_layer_size

        self.layers = nn.ModuleList([nn.Linear(input_size, in_size)])
        for layer in range(1, num_layers-1):
            self.layers.append(nn.Linear(in_size, int(in_size/2)))
            in_size = int(in_size/2)
        self.layers.append(nn.Linear(in_size, 1))
        self.drop_out = nn.Dropout(drop_out)


    def forward(self, x):
        for layer in self.layers[:-1]:
            x = F.relu(self.drop_out(layer(x)))
        x = self.layers[-1](x)
        x = torch.sigmoid(x)
        return x

In [None]:
class NetEnsemble(nn.Module):
    #INFERENCE ONLY!
    def __init__(self, nets):
        super(NetEnsemble, self).__init__()
        self.nets = nn.ModuleList(nets)
    def forward(self, x):
        outputs = []
        for net in self.nets:
            outputs.append(net(x))
        outputs = torch.concat(outputs, dim=1)
        outputs = torch.mean(outputs, dim=1)     
        return outputs

## ***DataSet Loader***

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import math
import numpy as np

def transform_reduce(transformer, reducer, data):
        result = reducer.transform(transformer.transform(data.values.tolist()))
        #result = transformer.transform(data.values.tolist()).toarray() #Only 600 words in vocabulary... reduction not really necessary ..
        return pd.Series(list(result)).values

def tfidf_transform_data(source_column, target_column, df_train, df_test, vectorizer, compress_to=300):

        #Fit SVD to training data
        tfidf_train_data = vectorizer.transform(df_train[source_column].values.tolist())
        svd = TruncatedSVD(n_components=compress_to)
        svd.fit(tfidf_train_data)

        #Transform Data:
        df_train[target_column] = transform_reduce(vectorizer, svd, df_train[source_column])
        df_test[target_column] = transform_reduce(vectorizer, svd, df_test[source_column])

        return df_train, df_test

In [None]:
import string
from xmlrpc.client import Boolean
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np
from datasets import load_dataset
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.model_selection as ms
from sklearn.utils import resample
import demoji
import re
import spacy
from typing import Tuple

demoji.download_codes()

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

#!python -m spacy download en_core_web_lg

def setup(rem_stop=True, do_stem=True, do_lem=False, split=True, upsample=True, do_emojis=True):
    df = load_data()

    df['preprocessed'] = preprocess(
        df['tweet'], rem_stop=rem_stop, do_stem=do_stem, do_lem=do_lem, do_emojis=do_emojis)

    if split is True:
        df_train, df_test = split_data(df)
        tfidf = train_tfidf(df_train['preprocessed'])
        if upsample is True:
            df_train = upsampling(df_train)
        return tfidf, df_train, df_test
    else:
        tfidf = train_tfidf(df['preprocessed'])
        return tfidf, df


def load_data():
    dataset = load_dataset("tweets_hate_speech_detection")
    df = pd.DataFrame.from_dict(dataset['train'])
    return df


def preprocess(data, rem_stop=True, do_stem=True, do_lem=False, do_emojis=True):
    assert do_stem != do_lem
    preprocessed = []
    for tweet in data:
        if do_emojis is True:
            tweet, _ = convert_emoji(tweet)
        tokens = tokenization(remove_punctuation(tweet))
        if rem_stop is True:
            tokens = remove_stopwords(tokens)
        if do_stem is True and do_lem is False:
            tokens = stemming(tokens)
        if do_lem is True and do_stem is False:
            tokens = lemmatization(tokens)
        preprocessed.append(np.array(tokens))

    return preprocessed


def train_tfidf(data):
    def dummy(text):
        return text

    tf = TfidfVectorizer(
        analyzer='word',
        tokenizer=dummy,
        preprocessor=dummy,
        token_pattern=None)

    return tf.fit(data)


def split_data(df: pd.DataFrame, test_size=0.2, random_state=17):

    df_train, df_test = ms.train_test_split(df, test_size=test_size, random_state=random_state, stratify=df["label"])

    return df_train, df_test


def upsampling(df: pd.DataFrame, replace=True, random_state=55):
    data_minority = df[df.label == 1]
    data_majority = df[df.label == 0]
    data_minority = resample(
        data_minority, replace=replace, n_samples=len(data_majority), random_state=random_state)

    return pd.concat([data_majority, data_minority])


def tokenization(text: str):
    return pd.Series(nltk.word_tokenize(text.lower()))


def remove_punctuation(tokens: pd.Series):
    return "".join([i for i in tokens if i not in punctuation])


def remove_stopwords(tokens: pd.Series):
    stopwords_list = stopwords.words("english")
    return tokens.apply(lambda token: token if token not in stopwords_list and token != '' else None).dropna()


def stemming(tokens: pd.Series):
    stemmer = PorterStemmer()

    return tokens.apply(lambda token: stemmer.stem(token))


def lemmatization(tokens: pd.Series):
    lemmatizer = WordNetLemmatizer()

    return tokens.apply(lambda token: lemmatizer.lemmatize(token))


def convert_emoji(text: str):
    # convert string to binary representation
    binary = ' '.join(format(ord(x), 'b') for x in text)

    # convert binary representation to utf8 representation
    listRes = list(binary.split(" "))
    try:
        text_with_emoji = bytes([int(x, 2) for x in listRes]).decode('utf-8')
    except UnicodeDecodeError:
        return text, []

    # get all emojis
    dictionary = demoji.findall(text_with_emoji)

    # replace emojis with text representation
    emojis = []
    for key in dictionary.keys():
        if key in text_with_emoji: emojis.append(dictionary[key])
        text_with_emoji = text_with_emoji.replace(key, dictionary[key] + " ")

    return text_with_emoji, emojis

def emb_data(data):
    nlp = spacy.load("en_core_web_lg") #If you are using colab and this buggs out: Restart runtime but DO NOT install the "en_core_web_lg" again.
    tweets = data.values.tolist()
    nlp.disable_pipes("parser", "ner") #remove pipe we do not need
    embeddings = [sum([word.vector for word in item])/len(item) for item in nlp.pipe(tweets)] #Takes some time...
    return pd.Series(embeddings).values

def get_features(df: pd.DataFrame):
    df["n_mentions"] = df["tweet"].apply(lambda x: count_user_mentions(x))
    df["hashtags"] = df["tweet"].apply(lambda x: identify_hashtags(x))
    df["emojis"] = df["tweet"].apply(lambda x: convert_emoji(x)[1])
    df["emb"] = emb_data(df["tweet"])
    return df

def count_user_mentions(text:str) ->int:
    return text.count("@user")

def identify_hashtags(text:str) -> list:
    pattern = re.compile(r"#(\w+)")
    return pattern.findall(text)






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
from torch.utils import data
class HatespeechDataset(Dataset):
    #def __init__(self, dataset="train", rep="emb", emojis=False, hashtags=False):
    def __init__(self, dataset, rep="emb", emojis=False, hashtags=False):
    
        features = ["id", "label"]
        features.append(rep)
        if emojis: features.append("tfidf_emojis")
        if hashtags: features.append("tfidf_hashtags")
       
        self.data = dataset[features].values

        cleaned = []
        for dp in self.data:
            point = []
            point.extend(dp[:2])
            point.extend([np.zeros(self.data[0][i+2].shape) if type(d)==float else np.array(d, dtype=float) for i, d in enumerate(dp[2:])])
            cleaned.append(point)

        self.data = cleaned


    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        ids = self.data[idx][0]
        reps = list(self.data[idx][2:])
        label = self.data[idx][1]
        return ids, reps, label


In [None]:
TF_IDF, DF_TRAIN, DF_TEST = setup(upsample=False)
DF_TRAIN = get_features(DF_TRAIN)
DF_TEST = get_features(DF_TEST)
DF_TRAIN, DF_TEST = tfidf_transform_data("preprocessed", "tfidf", DF_TRAIN, DF_TEST, TF_IDF)
DF_TRAIN, DF_TEST = tfidf_transform_data("hashtags", "tfidf_hashtags", DF_TRAIN, DF_TEST, TF_IDF, compress_to=100)
DF_TRAIN, DF_TEST = tfidf_transform_data("emojis", "tfidf_emojis", DF_TRAIN, DF_TEST, TF_IDF, compress_to=50)
DF_TRAIN["id"] = DF_TRAIN.index
DF_TEST["id"] = DF_TEST.index

Downloading builder script:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/881 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset tweets_hate_speech_detection/default (download: 2.96 MiB, generated: 3.04 MiB, post-processed: Unknown size, total: 6.00 MiB) to /root/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2...


Downloading data:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31962 [00:00<?, ? examples/s]

Dataset tweets_hate_speech_detection downloaded and prepared to /root/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

## ***Hyperparameter Optimisation***

In [None]:
K = 5

kfd = ms.StratifiedKFold(K)

train_sets = []
dev_sets = []
for train_set, dev_set in kfd.split(DF_TRAIN, DF_TRAIN.label):
    train_sets.append(train_set)
    dev_sets.append(dev_set)
print([np.mean(DF_TRAIN.iloc[subset]["label"].values) for subset in train_sets])
print([np.mean(DF_TRAIN.iloc[subset]["label"].values) for subset in dev_sets])
print(np.mean(DF_TEST["label"].values))
train_sets = [HatespeechDataset(dataset=upsampling(DF_TRAIN.iloc[subset]), rep="emb", emojis=False, hashtags=True) for subset in train_sets]
dev_sets = [HatespeechDataset(dataset=DF_TRAIN.iloc[subset], rep="emb", emojis=False, hashtags=True) for subset in dev_sets]

vector_size = sum([vec.shape[-1] for vec in train_sets[0].data[0][2:]])

test_dataset = HatespeechDataset(dataset=DF_TEST, rep="emb", emojis=False, hashtags=True)

checkpoint_dir = currentdir + "/checkpoints"
data_dir = currentdir + "/Data_mining/data/"

config = {
    "lr": tune.loguniform(1e-3, 1e-1),
    "batch_size": tune.choice([8, 16, 32, 64]),
    "num_layers": tune.choice([2, 3, 4]),
    "drop_out" : tune.loguniform(0.1, 0.8),
    "max_layer_size": tune.choice(list(range(200, 800, 100))),
}

scheduler = ASHAScheduler(
    metric="f1",
    mode="max",
    max_t=10000, #No time restrictions
    grace_period=4, 
    reduction_factor=2) 

reporter = CLIReporter(
    parameter_columns=["lr", "batch_size", "num_layers", "drop_out", "max_layer_size"],
    metric_columns=["f1", "training_iteration"])

result = tune.run(
        #tune.with_parameters(train, checkpoint_dir=checkpoint_dir, train_dataset = train_dataset, vector_size=vector_size, K=K),
        tune.with_parameters(train, checkpoint_dir=checkpoint_dir, train_sets = train_sets, dev_sets = dev_sets, vector_size=vector_size, K=K),
        resources_per_trial={"cpu": 1, "gpu": 1},
        config=config,
        num_samples=15, 
        scheduler=scheduler,
        progress_reporter=reporter)
    
best_trial = result.get_best_trial("f1", "max", "all")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation: {}".format(
    best_trial.last_result["f1"]))

#best_trained_model = Net(vector_size, best_trial.config["num_layers"], best_trial.config["max_layer_size"], best_trial.config["drop_out"])
nets = [Net(vector_size, best_trial.config["num_layers"], best_trial.config["max_layer_size"], best_trial.config["drop_out"]) for i in range(K)]
best_trained_model = NetEnsemble(nets)    

best_trained_model.to(device)

best_checkpoint_dir = best_trial.checkpoint.value
model_state = torch.load(os.path.join(
    best_checkpoint_dir, "checkpoint"))
torch.save(model_state, './nn.model')
torch.save(best_trial.config, './nn.config')

best_trained_model.load_state_dict(model_state)
test(best_trained_model, best_trial.config, test_dataset = test_dataset)


In [None]:
K=5
test_dataset = HatespeechDataset(dataset=DF_TEST, rep="emb", emojis=False, hashtags=True)
vector_size = sum([vec.shape[-1] for vec in test_dataset.data[0][2:]])

best_trained_model_state = torch.load("./nn.model")
best_trial_config = torch.load("./nn.config")
nets = [Net(vector_size, best_trial_config["num_layers"], best_trial_config["max_layer_size"], best_trial_config["drop_out"]) for i in range(K)]
best_trained_model = NetEnsemble(nets)
best_trained_model.load_state_dict(best_trained_model_state)
best_trained_model = best_trained_model.to(device)
label_list, preds, id_list = test(best_trained_model, best_trial_config, test_dataset = test_dataset)

  del sys.path[0]


[31897, 23894, 28409, 3203, 17994, 22185, 30945, 3225, 12242, 25182, 9724, 8877, 6891, 8127, 16993, 11772, 15077, 28402, 19151, 28300, 14185, 14967, 16165, 10892, 20142, 5945, 2770, 19120, 2776, 9198, 10092, 16744, 31709, 21628, 5530, 28827, 17081, 20189, 29862, 24399, 20710, 26026, 4192, 15818, 24205, 15821, 17201, 21670, 1292, 7313, 18175, 21081, 5333, 14749, 23046, 7999, 10690, 10236, 21540, 30571, 17700, 5471, 20610, 27192, 3772, 31083, 6210, 18894, 13162, 14455, 17826, 22641, 17476, 15722, 31431, 26170, 3064, 19567, 29435, 13712, 19106, 19444, 21351, 8467, 1091, 24730, 15806, 27354, 13175, 189, 15521, 7337, 14849, 4249, 21828, 24117, 5214, 4623, 1520, 30213, 26944, 28132, 20713, 26629, 26351, 25161, 24463, 16643, 16400, 283, 7178, 20324, 30245, 23889, 7190, 5808, 29070, 17324, 31177, 17744, 23324, 156, 25966, 15919, 27467, 2753, 5083, 16088, 3056, 24953, 12742, 13079, 10077, 10131, 23198, 16366, 14843, 30512, 10150, 16798, 7958, 6302, 30587, 2710, 6057, 26417, 190, 27796, 4554, 29

## ***Train***

In [None]:
def train(config, checkpoint_dir, train_sets, dev_sets, vector_size, K):
    net = Net(vector_size, config["num_layers"], config["max_layer_size"], config["drop_out"]).to(device)

    criterion = nn.BCELoss() 
    optimizer = AdamW(net.parameters(), lr=config["lr"])

    nets = [Net(vector_size, config["num_layers"], config["max_layer_size"], config["drop_out"]).to(device) for i in range(K)]
    optimizers = [AdamW(net.parameters(), lr=config["lr"]) for net in nets]

    criterion = nn.BCELoss() 
   
    for epoch in range(12):  # loop over the dataset multiple times
        eval_score = 0
        for k in range(K):
            net = nets[k]#.to(device)
            optimizer = optimizers[k]
            train_dataloader = DataLoader(train_sets[k], batch_size=config["batch_size"], shuffle=True)
            print("LEN TRAIN: ", len(train_dataloader))
            dev_dataloader = DataLoader(dev_sets[k], batch_size=config["batch_size"], shuffle=True)
            print("LEN DEV: ", len(dev_dataloader))

            net.train()
            running_loss = 0.0
            for i, data in enumerate(tqdm(train_dataloader)):
                ids, tfidfs, labels = data
                tfidfs = torch.concat((tfidfs), dim=1).to(device)
                labels = torch.tensor(labels).to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                outputs = net(tfidfs.float())

                loss = criterion(outputs.squeeze(), labels.float())
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                if i % 2000 == 1999:
                    print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                    running_loss = 0.0
            net.eval()

            preds = []
            label_list = []
            with torch.no_grad():
                for i, data in enumerate(dev_dataloader):
                    ids, tfidfs, labels = data
                    tfidfs = torch.concat((tfidfs), dim=1).to(device)
                    labels = torch.tensor(labels).to(device)

                    outputs = net(tfidfs.float())
                    outputs = outputs.squeeze()

                    outputs = outputs.tolist()
                    labels = labels.tolist()

                    if type(outputs) is float:
                        preds.append(bool(round(outputs)))
                    else:
                        preds.extend([bool(round(output)) for output in outputs])

                    if type(labels) is float:
                        label_list.append(labels)
                    else:
                        label_list.extend([label for label in labels])
            print("SUBMODEL F1: ", f1_score(label_list, preds))
            eval_score += f1_score(label_list, preds)/K
     
        print("ENSEMBLE F1: ", eval_score)
        tune.report(f1=eval_score)
        
        ensemble = NetEnsemble(nets)   
        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((ensemble.state_dict()), path)



## ***Test***

In [None]:
def test(net, config, test_dataset):

    test_dataloader = DataLoader(test_dataset, batch_size=config["batch_size"])

    net.eval()
    id_list = []
    preds = []
    label_list = []
    with torch.no_grad():
        for i, data in enumerate(test_dataloader):
            ids, tfidfs, labels = data
            tfidfs = torch.concat((tfidfs), dim=1).to(device)
            labels = torch.tensor(labels).to(device)

            outputs = net(tfidfs.float())
            outputs = outputs.squeeze()

            outputs = outputs.tolist()
            labels = labels.tolist()
            ids = ids.tolist()            
            if type(outputs) is float:
                preds.append(bool(round(outputs)))
            else:
                preds.extend([bool(round(output)) for output in outputs])

            if type(labels) is float:
                label_list.append(labels)
            else:
                label_list.extend([label for label in labels])
            if type(ids) is float:
                id_list.append(ids)
            else:
                id_list.extend([id for id in ids])
    print(id_list)
        
    f1 = f1_score(label_list, preds)
    print("TEST SET F1 SCORE:", f1)
    acc = accuracy_score(label_list, preds)
    print("TEST SET ACCURACY:", acc)

    print(classification_report(label_list, preds, digits=3))

    return label_list, preds, id_list


#Error Analyse

In [None]:
def bias(word):
    a = DF_TRAIN[['label','tweet']].values.tolist()
    a = [item for item in a if word in item[1]]
    try:
        return sum([item[0] for item in a])/len(a)
    except Exception:
        return None


FPs = []
for i in range(len(preds)):
    if label_list[i] == 0 and preds[i] == 1:
        FPs.append(DF_TEST.loc[id_list[i],["tweet", "preprocessed"]])

TPs = []
for i in range(len(preds)):
    if label_list[i] == 1 and preds[i] == 1:
        TPs.append(DF_TEST.loc[id_list[i],["tweet", "preprocessed"]])


FNs = []
for i in range(len(preds)):
    if label_list[i] == 1 and preds[i] == 0:
        FNs.append(DF_TEST.loc[id_list[i],["tweet", "preprocessed"]])

TNs = []
for i in range(len(preds)):
    if label_list[i] == 0 and preds[i] == 0:
        TNs.append(DF_TEST.loc[id_list[i],["tweet", "preprocessed"]])

fp_tokens = []
print("\n\n\nFALSE POSITIVES:\n")
for fp in FPs:
    print("========FP=======")            
    print(fp["tweet"])
    fp_tokens.extend(set(fp["preprocessed"].tolist()))

tp_tokens = []
for tp in TPs:
    tp_tokens.extend(set(tp["preprocessed"].tolist()))

a = sorted([[item, fp_tokens.count(item)-tp_tokens.count(item)] for item in set(fp_tokens)], key=lambda x: x[1], reverse=True)
print([[item[0], item[1], bias(item[0])] for item in a[:100]])


fn_tokens = []
print("\n\n\nFALSE NEGATIVES:\n")
for fn in FNs:
    print("========FN=======")
    print(fn["tweet"])
    fn_tokens.extend(set(fn["preprocessed"].tolist()))


tn_tokens = []
for tn in TNs:
    tn_tokens.extend(set(tn["preprocessed"].tolist()))


a = sorted([[item, fn_tokens.count(item)-tn_tokens.count(item)] for item in set(fn_tokens)], key=lambda x: x[1], reverse=True)
print([[item[0], item[1], bias(item[0])] for item in a[:100]])







FALSE POSITIVES:

bihday sexy girl anal granny movie  
 @user #schools should produce  , healthy and productive people. forget #exams, #handwriting and facts. @user @user #fantâ¦
a good old fashioned public shaming is in order. 
@user @user @user  from #bbuk have same outlook on a relationship  
@user plz don't u dare forget @user majority @user vote against #gunsense bills 4 @user long $$! @user @user 
#orlandonightclubshooting disgusting   #shocked 
@user @user @user i find this rather disappointing, liked the black suit.  
@user woke 2horrific orlando news.i don't give a fuck if you r straight or gay orâ¤ï¸goblins with blu hair. no 1 deserves this #orlaâ¦
xbox one reverses drm! angry rant pt.3 #video #xbox #one #reverses #drm   #rant #sverigesweden
father lying through his ass. #terrorism #homophobia #orlando #killings   #bad #evil #religitards #lie  
please note this story is fake! (read the comments) "africans are lazy good at sex theft" - #hillarydesperation   
slime invas

In [None]:
a = DF_TRAIN[['label','tweet']].values.tolist()
a = [item for item in a if 'trump' in item[1]]
print(sum([item[0] for item in a])/len(a))
#a.loc[DF_TRAIN['label'] == 0]
#a.loc[DF_TRAIN['label'] == 1]

0.515625
