In [1]:
from collections import Counter
import emoji
import math
from nltk.corpus import stopwords
import numpy as np
import os
import pandas as pd
from pprint import pprint
from random import randint
import re
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef, precision_score, 
                             precision_recall_fscore_support, recall_score, roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import statistics as st

In [2]:
###############################
# generate file paths to data #
###############################

hydrated_tweet_folder = "data"
tweet_ids_folder = "data"

path_5g_json = "5g_corona_conspiracy.json"
path_other_json = "other_conspiracy.json"
path_non_consp_json = "non_conspiracy.json"
path_test_json = "test_tweets.json"
path_test_ids_txt = "test_tweet_ids.json"

path_5g = os.path.join(hydrated_tweet_folder, path_5g_json)
path_other = os.path.join(hydrated_tweet_folder, path_other_json)
path_non = os.path.join(hydrated_tweet_folder, path_non_consp_json)
path_test = os.path.join(hydrated_tweet_folder, path_test_json)
path_test_ids = os.path.join(tweet_ids_folder, path_test_ids_txt)

assert(os.path.isfile(path_5g))
assert(os.path.isfile(path_other))
assert(os.path.isfile(path_non))
assert(os.path.isfile(path_test))
assert(os.path.isfile(path_test_ids))

In [3]:
################
# read in data #
################

fiveg_df = pd.read_json(path_5g)
other_df = pd.read_json(path_other)
nocon_df = pd.read_json(path_non)
test_df = pd.read_json(path_test)

# we will need to submit predictions for all tweet ids
# test_ids_df = pd.read_csv(path_test_ids, names=['id'])
test_ids_df = pd.read_json(path_test_ids)
test_ids_df.rename(columns = {0: 'id'}, inplace = True)

test_id_set = set(test_ids_df['id'])
retreived_test_set = set(test_df['id'])

# find missing tweets from test set
missing_test_tweets = test_id_set.difference(retreived_test_set)

# mark as real tweets, because we're going to add fake tweets later
fiveg_df['actual_tweet'] = True
other_df['actual_tweet'] = True
nocon_df['actual_tweet'] = True

In [4]:
#set(test_df['id']).intersection(set(df['id']))
#set(df['id']).intersection(set(test_df['id']))
print(len(set(test_df['id']).difference(test_id_set)))
print(len(set(test_id_set).difference(test_df['id'])))

0
263


In [5]:
####################
# train eval split #
####################

train_ratio = 0.8

def mark_train(df, train_ratio=0.8, test_ids=None):
    
    if test_ids:
        df['test'] = df.apply(lambda row:(str(row['id']) in test_ids) and row['actual_tweet'], axis=1)
    else:
        df['test'] = df.apply(lambda row: (randint(1,100) > int(train_ratio*100) and row['actual_tweet']), axis=1)            
        
    return df

fiveg_df = mark_train(fiveg_df, train_ratio=train_ratio)
other_df = mark_train(other_df, train_ratio=train_ratio)
nocon_df = mark_train(nocon_df, train_ratio=train_ratio)

In [6]:
####################
# label and concat #
####################

fiveg_df['label'] = 1
other_df['label'] = 0
nocon_df['label'] = 0

print(f"\n{'train':>17} {'test':>12} {'train pct':>15}\n")

def display_ratio(df, name):
    eval_df = df[df['test']==True]
    train_df = df[df['test']==False]
    
    print(f'{name}: {len(train_df):>10,} {len(eval_df):>12,} {len(train_df)/len(df):>14.2f}%')
    
    return train_df, eval_df

fiveg_train_df, _ = display_ratio(fiveg_df, 'FIVEG')
other_train_df, _ = display_ratio(other_df, 'OTHER')
nocon_train_df, _ = display_ratio(nocon_df, 'NOCON')

df = pd.concat([fiveg_df, other_df, nocon_df])

train_df, eval_df = display_ratio(df, 'TOTAL')

############
# join ocr #
############

fiveg_ocr = pd.read_csv('output/image_terms_fiveg.csv')
nocon_ocr = pd.read_csv('output/image_terms_nocon.csv')
other_ocr = pd.read_csv('output/image_terms_other.csv')
test_ocr = pd.read_csv('output/image_terms_test.csv')

ocr_df = pd.concat([
    fiveg_ocr,
    nocon_ocr,
    other_ocr,
    test_ocr
])

def add_ocr(row):
    if 'media' in row.entities:
        x = ocr_df[ocr_df['filename'] == str(row.id)+'.png']
        
        try:
            terms = x['terms'].iloc[0]
            if type(terms) == str:
                row['full_text'] += terms
        except:
            pass
        
    return row['full_text']

train_df['full_text'] = train_df.apply(add_ocr, axis=1)
eval_df['full_text'] = eval_df.apply(add_ocr, axis=1)
test_df['full_text'] = test_df.apply(add_ocr, axis=1)

########################
# prepare for training #
########################

X_train = train_df['full_text']
y_train = train_df['label']

X_eval = eval_df['full_text']
y_eval = eval_df['label']


            train         test       train pct

FIVEG:        901          219           0.80%
OTHER:        552          136           0.80%
NOCON:      3,328          810           0.80%
TOTAL:      4,781        1,165           0.80%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['full_text'] = train_df.apply(add_ocr, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['full_text'] = eval_df.apply(add_ocr, axis=1)


In [7]:
no_test = False
if no_test :
    X_train = X_train.append(X_eval)
    y_train = y_train.append(y_eval)

In [8]:
#################
# preprocessing #
#################

class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.re_prog_url = re.compile(r'https://t.co/([a-zA-Z0-9]+)')
    
    def fit( self, X, y=None ):
        return self 
    
    def _process(self, text):
        
        urls = self.re_prog_url.findall(text)
        text = text.lower()\
                .replace('https://t.co/', '')\
                .replace('u.s.', 'us')\
                .replace('u.k.', 'uk')\
                .replace('5 g', '5g')
        for url in urls:
            text = text.replace(url.lower(), 'url')

        return text
    
    def transform(self, X, y=None):
        
        X = X.apply(self._process)
        
        return X

In [9]:
############
# pipeline #
############

class_weights={
    0: 0.4,
    1: 0.6
}

classifier = LogisticRegression(
    # C=0.9,
    class_weight=class_weights,
    # multi_class= 'ovr',
    max_iter=2000,
    solver= 'saga'
)

vectorizer = CountVectorizer(
    strip_accents='unicode'
)

pipeline = Pipeline(
    [
        ('preprocessor', Preprocessor()),
        ('vectorizer', vectorizer),
        ('classifier', classifier)
    ]
)

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_eval)
probabilities = pipeline.predict_proba(X_eval)

accuracy = accuracy_score(y_eval, predictions)*100
precision = precision_score(y_eval, predictions, zero_division=0, average="macro")*100
recall = recall_score(y_eval, predictions, average="macro")*100
f1 = f1_score(y_eval, predictions, average="macro")*100
support = precision_recall_fscore_support(y_eval, predictions, average="macro")
matthews = matthews_corrcoef(y_eval, predictions)*100

header = classifier.__class__.__name__

print(f'\n{header}\n\nAccuracy  Precision  Recall   F1       MCC')
print(f'{accuracy:.2f}%{precision:>9.2f}%{recall:>10.2f}%{f1:>8.2f}%{matthews:>8.2f}%\n')

##############
# submission #
##############

predictions = pipeline.predict(test_df['full_text'])
probabilities = pipeline.predict_proba(test_df['full_text'])

filename = os.path.join('output','ME20FND_DL-TXST_012.txt')
if no_test:
    filename = os.path.join('output','ME20FND_DL-TXST_012b.txt')

with open(filename,'w') as f:
    for tweet_id, prediction, prob in zip(test_df['id'], predictions, probabilities):        
        f.write(f'{tweet_id},{prediction}\n')
    for tweet_id in missing_test_tweets:
        f.write(f'{tweet_id},-1\n')


LogisticRegression

Accuracy  Precision  Recall   F1       MCC
97.51%    96.46%     95.31%   95.87%   91.76%



In [None]:

                                        #####################
                                        # stop here for now #
                                        #####################
            

In [167]:
# m = {1:'FIVEG',2:'OTHER',3:'NOCON'}
m = {0:'OTHER', 1:'FIVEG'}

def print_top100(vectorizer, clf, class_labels):    
    """Prints features with the highest coefficient values, per class"""
    
    feature_names = vectorizer.get_feature_names()
    
    for i, class_label in enumerate(class_labels):
        top100 = np.argsort(clf.coef_[i])[-100:]
        print("%s:\n\n%s\n" % (m[class_label],
              "\n".join(feature_names[j] for j in top100)))
        
print_top100(vectorizer, classifier, [0])

FIVEG:

everywhere
design
map
out
killing
cursing
therefore
harms
28
place
popping
function
whistleblower
spots
loss
alongside
areas
#coronavirus
structure
linked
boss
coverage
testing
discussing
🤦🏽‍♂️
putting
issues
nwo
mentions
flying
created
die
know
@worldtruthtv
aka
turned
project
illness
cv
microwave
frequency
former
chemtrails
ig
connected
cells
version
distancing
kill
#5gtowers
weapon
share
towers
liberty
video
okay
#5gkills
exposure
correlation
london
ain’t
govt
destroy
watch
radiation
21
used
night
side
depopulation
alter
connection
david
#covid
lot
order
play
dying
flu
hemoglobin
city
tell
activating
agenda
emf
electromagnetic
truth
research
link
rolled
body
chris
question
#5g
coincidence
oxygen
immune
5g
symptoms
wuhan

OTHER:

evil
sacrifices
ostensibly
tracked
hold
played
predicted
ppl
chip
confident
ireland
got
eye
kids
right
they're
war
main
digital
censoring
000
frequencies
mike
#qanon
2017
down
lie
steele
studies
funny
getting
military
gates
being
responsible
effects
