In [2]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score
from StopFAIke import data

# Loading data

In [6]:
df = pd.read_csv(f"{os.path.join(os.path.dirname(os.getcwd()),'raw_data', 'poynter_final_condensed.csv')}", index_col='Unnamed: 0')

df.head()

Unnamed: 0,title_list,label_list_transformed
0,There is a call for free vaccination in the Ci...,1.0
1,Video of a girl being forcibly vaccinated agai...,1.0
2,The US CDC has admitted an error in the PCR te...,1.0
3,Children are 50 times more likely to die from ...,1.0
4,Those vaccinated against the COVID-19 cannot d...,1.0


In [9]:
df['label_list_transformed'].value_counts()

0.0    7458
1.0    7439
Name: label_list_transformed, dtype: int64

# Preparing data for modelling

In [17]:
df['lemmatized'] = df['title_list'].apply(data.clean)

df.head()

Unnamed: 0,title_list,label_list_transformed,lemmatized
0,There is a call for free vaccination in the Ci...,1.0,"[call, free, vaccination, ciudad, de, la, luz,..."
1,Video of a girl being forcibly vaccinated agai...,1.0,"[video, girl, forcibly, vaccinated, covid]"
2,The US CDC has admitted an error in the PCR te...,1.0,"[u, cdc, admitted, error, pcr, test, called, w..."
3,Children are 50 times more likely to die from ...,1.0,"[child, time, likely, die, covid, vaccine, vir..."
4,Those vaccinated against the COVID-19 cannot d...,1.0,"[vaccinated, covid, donate, blood]"


In [22]:
df['lemmatized_joined'] = df['lemmatized'].apply(lambda x: ' '.join(word for word in x))

df.head()

Unnamed: 0,title_list,label_list_transformed,lemmatized,lemmatized_joined
0,There is a call for free vaccination in the Ci...,1.0,"[call, free, vaccination, ciudad, de, la, luz,...",call free vaccination ciudad de la luz center ...
1,Video of a girl being forcibly vaccinated agai...,1.0,"[video, girl, forcibly, vaccinated, covid]",video girl forcibly vaccinated covid
2,The US CDC has admitted an error in the PCR te...,1.0,"[u, cdc, admitted, error, pcr, test, called, w...",u cdc admitted error pcr test called withdrawa...
3,Children are 50 times more likely to die from ...,1.0,"[child, time, likely, die, covid, vaccine, vir...",child time likely die covid vaccine virus acco...
4,Those vaccinated against the COVID-19 cannot d...,1.0,"[vaccinated, covid, donate, blood]",vaccinated covid donate blood


In [30]:
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(df['lemmatized_joined'])

In [54]:
y = df['label_list_transformed']

In [56]:
assert X.shape[0] == len(y)

# Creating base model

In [58]:
X, y = shuffle(X, y, random_state= 42)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [62]:
model = MultinomialNB()

cv_results = cross_validate(model, X_train, y_train, scoring='accuracy', cv=5)

cv_results

{'fit_time': array([0.01755738, 0.00938797, 0.0053854 , 0.0044229 , 0.00483227]),
 'score_time': array([0.00464892, 0.00087571, 0.00316405, 0.000983  , 0.00085521]),
 'test_score': array([0.78571429, 0.76605944, 0.77410072, 0.76306954, 0.78513189])}

In [63]:
model.fit(X_train, y_train)

model.score(X_test, y_test)

0.778076062639821

In [65]:
preds = model.predict(X_test)

print(accuracy_score(y_test, preds))
print(recall_score(y_test, preds))
print(precision_score(y_test, preds))

0.778076062639821
0.741003998223012
0.8030813673567646


# Checking vectorizer's accuracy

In [36]:
vectorizer.get_feature_names()

['aa',
 'aaj',
 'aap',
 'aardvark',
 'aardvarkmobiletours',
 'aarhus',
 'aarogya',
 'aaron',
 'ab',
 'abandoned',
 'abbott',
 'abbreviation',
 'abc',
 'abdel',
 'abdo',
 'abdomen',
 'abdominal',
 'abdullah',
 'abe',
 'abeyasinghe',
 'abhigya',
 'abide',
 'ability',
 'abillity',
 'abitibi',
 'ablation',
 'able',
 'abnormal',
 'abnormality',
 'abolished',
 'abolishes',
 'abolition',
 'aboout',
 'aborted',
 'abortion',
 'abou',
 'abril',
 'abroad',
 'abrogated',
 'absence',
 'absolute',
 'absolutelly',
 'absolutely',
 'absorb',
 'absorbed',
 'absorption',
 'abstain',
 'absurd',
 'abtracetogether',
 'abuse',
 'abused',
 'abusing',
 'abusive',
 'ac',
 'academic',
 'academy',
 'accelerate',
 'accelerated',
 'accelerates',
 'acceleration',
 'accept',
 'acceptability',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'access',
 'accessed',
 'accessible',
 'accessing',
 'accessory',
 'accident',
 'accidental',
 'accidentally',
 'accompanied',
 'accompanies',
 'accompany',
 'accompanyin

In [44]:
start_string = ''

for item in df.lemmatized_joined:
    start_string = start_string + ' ' + item

{' ',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'ß',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'ç',
 'è',
 'é',
 'ë',
 'í',
 'î',
 'ï',
 'ñ',
 'ó',
 'ô',
 'ö',
 'ú',
 'ü',
 'ğ',
 'ı',
 'ł',
 'œ',
 'ş',
 'š',
 'а',
 'в',
 'е',
 'и',
 'к',
 'л',
 'о',
 'р',
 'с',
 'т',
 'у',
 'х',
 'ь',
 'ю'}

In [48]:
start_string = start_string.strip()

unique_words = set(start_string.split(sep= ' '))

unique_words

{'cemetery',
 'veil',
 'vinegar',
 'garling',
 'gamaleya',
 'dispersing',
 'gear',
 'antioquia',
 'balela',
 'fuellmich',
 'legislative',
 'banner',
 'concludes',
 'deemed',
 'ambulance',
 'recovery',
 'adenine',
 'careful',
 'madness',
 'aquino',
 'rehashed',
 'rarely',
 'favorite',
 'anxious',
 'thestonkmarket',
 'quaratine',
 'convenience',
 'rican',
 'isabel',
 'photo',
 'clarify',
 'hung',
 'infiltrator',
 'migrant',
 'zinc',
 'liked',
 'adequately',
 'abolishes',
 'dismantled',
 'charged',
 'bended',
 'clusted',
 'newborn',
 'jihad',
 'azithromycin',
 'inspection',
 'escaped',
 'analogy',
 'communicated',
 'dna',
 'heart',
 'mention',
 'guillén',
 'expanding',
 'lula',
 'drug',
 'natal',
 'unversity',
 'happen',
 'unicenter',
 'powerlessness',
 'eudravigillance',
 'latvia',
 'busted',
 'applicant',
 'passenger',
 'psoriasis',
 'extolling',
 'lose',
 'snipping',
 'serf',
 'inefficacy',
 'build',
 'install',
 'seychelles',
 'earky',
 'fasting',
 'exiting',
 'infectious',
 'spliced'

In [49]:
len(unique_words)

13145

In [52]:
'aaj' in unique_words

True