In [2]:
from collections import Counter
import emoji
import math
from nltk.corpus import stopwords
import numpy as np
import os
import pandas as pd
from pprint import pprint
from random import randint
import re
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef, precision_score, 
                             precision_recall_fscore_support, recall_score, roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import statistics as st

In [3]:
###############################
# generate file paths to data #
###############################

hydrated_tweet_folder = "data"
tweet_ids_folder = "data"

path_5g_json = "5g_corona_conspiracy.json"
path_other_json = "other_conspiracy.json"
path_non_consp_json = "non_conspiracy.json"
path_test_json = "test_tweets.json"
path_test_ids_txt = "test_tweet_ids.json"

path_5g = os.path.join(hydrated_tweet_folder, path_5g_json)
path_other = os.path.join(hydrated_tweet_folder, path_other_json)
path_non = os.path.join(hydrated_tweet_folder, path_non_consp_json)
path_test = os.path.join(hydrated_tweet_folder, path_test_json)
path_test_ids = os.path.join(tweet_ids_folder, path_test_ids_txt)

assert(os.path.isfile(path_5g))
assert(os.path.isfile(path_other))
assert(os.path.isfile(path_non))
assert(os.path.isfile(path_test))
assert(os.path.isfile(path_test_ids))

In [4]:
################
# read in data #
################

fiveg_df = pd.read_json(path_5g)
other_df = pd.read_json(path_other)
nocon_df = pd.read_json(path_non)
test_df = pd.read_json(path_test)

# we will need to submit predictions for all tweet ids
# test_ids_df = pd.read_csv(path_test_ids, names=['id'])
test_ids_df = pd.read_json(path_test_ids)
test_ids_df.rename(columns = {0: 'id'}, inplace = True)

test_id_set = set(test_ids_df['id'])
retreived_test_set = set(test_df['id'])

# find missing tweets from test set
missing_test_tweets = test_id_set.difference(retreived_test_set)

# mark as real tweets, because we're going to add fake tweets later
fiveg_df['actual_tweet'] = True
other_df['actual_tweet'] = True
nocon_df['actual_tweet'] = True

In [5]:
####################
# train eval split #
####################

train_ratio = 0.8

def mark_train(df, train_ratio=0.8, test_ids=None):
    
    if test_ids:
        df['test'] = df.apply(lambda row:(str(row['id']) in test_ids) and row['actual_tweet'], axis=1)
    else:
        df['test'] = df.apply(lambda row: (randint(1,100) > int(train_ratio*100) and row['actual_tweet']), axis=1)            
        
    return df

fiveg_df = mark_train(fiveg_df, train_ratio=train_ratio)
other_df = mark_train(other_df, train_ratio=train_ratio)
nocon_df = mark_train(nocon_df, train_ratio=train_ratio)

In [6]:
####################
# label and concat #
####################

fiveg_df['label'] = 1
other_df['label'] = 2
nocon_df['label'] = 3

print(f"\n{'train':>17} {'test':>12} {'train pct':>15}\n")

def display_ratio(df, name):
    eval_df = df[df['test']==True]
    train_df = df[df['test']==False]
    
    print(f'{name}: {len(train_df):>10,} {len(eval_df):>12,} {len(train_df)/len(df):>14.2f}%')
    
    return train_df, eval_df

fiveg_train_df, _ = display_ratio(fiveg_df, 'FIVEG')
other_train_df, _ = display_ratio(other_df, 'OTHER')
nocon_train_df, _ = display_ratio(nocon_df, 'NOCON')

df = pd.concat([fiveg_df, other_df, nocon_df])

train_df, eval_df = display_ratio(df, 'TOTAL')

X_train = train_df['full_text']
y_train = train_df['label']

X_eval = eval_df['full_text']
y_eval = eval_df['label']


            train         test       train pct

FIVEG:        868          252           0.78%
OTHER:        529          159           0.77%
NOCON:      3,295          843           0.80%
TOTAL:      4,692        1,254           0.79%


In [7]:
no_test = False
if no_test :
    X_train = X_train.append(X_eval)
    y_train = y_train.append(y_eval)

In [8]:
#################
# preprocessing #
#################

class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.re_prog_url = re.compile(r'https://t.co/([a-zA-Z0-9]+)')
    
    def fit( self, X, y=None ):
        return self 
    
    def _process(self, text):
        
        urls = self.re_prog_url.findall(text)
        text = text.lower()\
                .replace('https://t.co/', '')\
                .replace('u.s.', 'us')\
                .replace('u.k.', 'uk')
        for url in urls:
            text = text.replace(url.lower(), 'url')

        return text
    
    def transform(self, X, y=None):
        
        X = X.apply(self._process)
        
        return X

In [9]:
############
# pipeline #
############

classifier = LogisticRegression(
    C=0.9,
    class_weight={
        1: 0.4,
        2: 0.4,
        3: 0.2
    },
    multi_class= 'ovr',
    max_iter=2000,
    solver= 'saga'
)

vectorizer = CountVectorizer(
    strip_accents='unicode',
)

pipeline = Pipeline(
    [
        ('preprocessor', Preprocessor()),
        ('vectorizer', vectorizer),
        ('classifier', classifier)
    ]
)

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_eval)
probabilities = pipeline.predict_proba(X_eval)

accuracy = accuracy_score(y_eval, predictions)*100
precision = precision_score(y_eval, predictions, zero_division=0, average="macro")*100
recall = recall_score(y_eval, predictions, average="macro")*100
f1 = f1_score(y_eval, predictions, average="macro")*100
support = precision_recall_fscore_support(y_eval, predictions, average="macro")
matthews = matthews_corrcoef(y_eval, predictions)*100

header = classifier.__class__.__name__

print(f'\n{header}\n\nAccuracy  Precision  Recall   F1       MCC')
print(f'{accuracy:.2f}%{precision:>9.2f}%{recall:>10.2f}%{f1:>8.2f}%{matthews:>8.2f}%\n')

##############
# submission #
##############

predictions = pipeline.predict(test_df['full_text'])
probabilities = pipeline.predict_proba(test_df['full_text'])

filename = './output/ME20FND_DL-TXST_001a.txt'
if no_test:
    filename = './output/ME20FND_DL-TXST_001b.txt'

threshold = 0.10
    
with open(filename,'w') as f:
    for tweet_id, prediction, prob in zip(test_df['id'], predictions, probabilities):
        
        diff = prob[prediction-1]-st.median(prob)
        if diff < threshold:
            prediction = 0
        
        f.write(f'{tweet_id},{prediction}\n')
    for tweet_id in missing_test_tweets:
        f.write(f'{tweet_id},-1\n')


LogisticRegression

Accuracy  Precision  Recall   F1       MCC
74.16%    59.04%     53.50%   54.49%   42.38%



In [None]:

                                        #####################
                                        # stop here for now #
                                        #####################
            

In [10]:
m = {1:'FIVEG',2:'OTHER',3:'NOCON'}

def print_top100(vectorizer, clf, class_labels):    
    """Prints features with the highest coefficient values, per class"""
    
    feature_names = vectorizer.get_feature_names()
    
    for i, class_label in enumerate(class_labels):
        top100 = np.argsort(clf.coef_[i])[-100:]
        print("%s:\n\n%s\n" % (m[class_label],
              "\n".join(feature_names[j] for j in top100)))
        
print_top100(vectorizer, classifier, [1,2,3])

FIVEG:

boss
victim
fetched
2009
play
60ghz
provide
humanity
up
shall
activated
warned
follow
areas
chemicals
popping
zoonotic
mention
night
looking
globally
power
hemoglobin
keep
mandatory
deprivation
demolition
cough
spots
controlled
outs
susceptible
5gtowers
surrounding
function
david
disease
harms
chemtrails
video
lying
asking
investigating
whistleblower
worse
map
antennas
out
ain
sheeple
population
destroy
project
cv
structure
5gkills
distract
radiation
waves
nwo
weapon
agenda
realdonaldtrump
distancing
flu
order
question
prove
alter
high
everywhere
21
body
knew
chris
connection
place
dying
former
billgates
emf
electromagnetic
oxygen
link
tell
coincidence
illness
research
city
microwave
watch
turned
depopulation
rolled
5g
kill
share
immune
wuhan
symptoms

OTHER:

sounds
rfid
totally
assertions
hysteria
children
but
lets
diversion
usually
chinese
came
22
doctors
real
communication
wake
abuja
manufactures
shocking
censoring
down
diagnosis
weather
2017
health
combating
fuck
say
discu