<a href="https://colab.research.google.com/github/yannick5000/Fake-News-Detector-Covid-19-Vaccine/blob/main/Kopie_von_ML_App_FakeNews_Techlabs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Health-related fake news detector – Project Work – Machine Learning Group**



Getting started

In [1]:
#Mounting Google Drive at /content/drive so the data files can be read from it
from google.colab import drive
#force_remount=True is passed so the mount is redone even when the cell is run again
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/ColabNotebooks/data/

In [4]:
#Loading in data as dataframe
import pandas as pd
import string
import numpy as np

#Reading statements.csv from the current working directory; its "Index" column becomes the row index
df = pd.read_csv("statements.csv",index_col="Index")

In [5]:
#Removing punctuation from data frame
#Punctuation and special symbols to strip (string.punctuation plus extra characters)
list_punctuation = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '–', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '®', '„', '“', '≥', '≤', '€']
def remove_punctuation_df(input):
  """Remove punctuation characters from every statement of a column.

  Parameters:
    input: iterable of strings, e.g. the pandas Series that DataFrame.apply
           passes for a column (a plain list works as well).

  Returns:
    A list of strings in input order, with every character found in
    string.punctuation or list_punctuation removed. Whitespace, including
    tabs and newlines, is kept so that words stay separated.
  """
  #One set lookup replaces the former three passes. The earlier
  #Series.replace("[^\w\s]", "") call had no effect without regex=True, and
  #str.translate(string.punctuation) used the string as a lookup table, which
  #turned control characters (tab, newline) into punctuation that was then deleted.
  unwanted = set(string.punctuation) | set(list_punctuation)
  return ["".join(letter for letter in text if letter not in unwanted) for text in input]

In [6]:
#Stripping punctuation from the "Statement" column; the double brackets select a one-column DataFrame, so the function receives the whole column at once
df["Statement"] = df[["Statement"]].apply(remove_punctuation_df)

In [7]:
#Removing punctuation from user's sentence
#Extra symbols that string.punctuation does not contain
other_punctuation = '–®„“≥≤€'
our_punctuation = string.punctuation + other_punctuation
def remove_punctuation_user(input):
  """Return the sentence with every character listed in our_punctuation removed.

  Parameters:
    input: the user's sentence as a string.

  Returns:
    The sentence without punctuation; all other characters keep their order.
  """
  kept_characters = [character for character in input if character not in our_punctuation]
  return ''.join(kept_characters)

In [8]:
#Example sentence that is passed through each user-side preprocessing step in the following cells
example = "Der COVID-19-Impfstoff Spikevax® von Moderna ist jetzt in Europa für die Anwendung bei Kindern ab 6 Jahren zugelassen worden."
example = remove_punctuation_user(example)
print(example)

Der COVID19Impfstoff Spikevax von Moderna ist jetzt in Europa für die Anwendung bei Kindern ab 6 Jahren zugelassen worden


In [9]:
#Setting target variable
#One-hot encoding of the first column of df (presumably "Statement" — TODO confirm against statements.csv)
df_drop = pd.get_dummies(df.iloc[:,0])
#Putting the one-hot columns next to the "True/False" label column
frames = [df_drop,df["True/False"]]
df_final = pd.concat(frames, axis=1)       
#NOTE(review): the next two lines are bare expressions that are not the last statement of the cell, so they display nothing
df_final 
df_final.tail()  
#Per distinct statement: share of rows whose "True/False" value equals 1
df_target = df["True/False"].eq(1).groupby(df["Statement"]).mean().reset_index()

In [10]:
#Displaying the share of "True/False" == 1 rows per distinct statement
df_target

Unnamed: 0,Statement,True/False
0,An COVID würden fast nur diejenigen Menschen ...,0.0
1,Azithromycin kann das CovidVirus abtöten,0.0
2,Bleichmittel zu trinken um das Virus im Körpe...,0.0
3,Chloroquin kann das CovidVirus abtöten,0.0
4,Das Virus wurde künstlich hergestellt und die...,0.0
...,...,...
986,mRNAImpfstoffe können unser Erbgut verändern,0.0
987,mRNAImpfstoffe verändern die DNA im Körper der...,0.0
988,okalreaktionen Schmerzen an der Einstichstelle...,1.0
989,Ähnlich wie bei Erwachsenen ist der Schutz vor...,1.0


Preprocessing

In [11]:
#Preprocessing for dataframe
def preprocess_df(input):
  """Lower-case every statement of a column and trim its surrounding whitespace.

  Parameters:
    input: pandas Series of strings (one column, as passed by DataFrame.apply).

  Returns:
    A Series of the normalised strings.
  """
  def _normalise(text):
    lowered = text.lower()
    return lowered.strip()

  return input.apply(_normalise)

In [12]:
#Lower-casing and trimming the "Statement" column; the function receives the whole column at once
df["Statement"] = df[["Statement"]].apply(preprocess_df)

In [13]:
#Preprocessing for user's sentence
def preprocess_user(input):
  """Normalise the user's sentence the same way as the training statements.

  Parameters:
    input: the user's sentence as a string.

  Returns:
    The sentence in lower case with leading and trailing whitespace removed.
  """
  input = input.lower()   #lower case
  #The stripped text used to be assigned to a misspelled name ("inout"), so the whitespace was never removed
  input = input.strip()   #remove whitespace left and right
  return input

In [14]:
#Normalising the example sentence
example = preprocess_user(example)
print(example)

der covid19impfstoff spikevax von moderna ist jetzt in europa für die anwendung bei kindern ab 6 jahren zugelassen worden


In [15]:
#Tokenization: splitting sentences into words
import nltk
#"punkt" provides the tokenizer models used by nltk.word_tokenize
nltk.download("punkt")
#NOTE(review): no use of wordnet is visible in this notebook — confirm whether this download is needed
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [16]:
#Tokenization for dataframe
def tokenize_df(input):
  """Split every statement of a column into a list of word tokens.

  Parameters:
    input: pandas Series of strings (one column, as passed by DataFrame.apply).

  Returns:
    A Series holding one token list per statement.
  """
  tokenized = input.apply(nltk.word_tokenize)
  return tokenized

In [17]:
#Tokenizing the "Statement" column; each entry becomes a list of words
df["Statement"] = df[["Statement"]].apply(tokenize_df)

In [18]:
#Tokenization for user's sentence
def tokenize_user(input):
  """Split the user's sentence into a list of word tokens with nltk.word_tokenize."""
  return nltk.word_tokenize(input)

In [19]:
#Tokenizing the example sentence; example is a list of words from here on
example = tokenize_user(example)
print(example)

['der', 'covid19impfstoff', 'spikevax', 'von', 'moderna', 'ist', 'jetzt', 'in', 'europa', 'für', 'die', 'anwendung', 'bei', 'kindern', 'ab', '6', 'jahren', 'zugelassen', 'worden']


In [None]:
#Importing the stop words
#German stop word library from nltk
#Downloading the nltk stop word corpus before it is imported
nltk.download("stopwords")
from nltk.corpus import stopwords

#German stop word list used by the stop word removal functions and by the TF-IDF vectorizer
stopWordsGerman = stopwords.words("german")
len(stopWordsGerman)   #232 words

In [21]:
#Removing stop words in dataframe
def remove_stopwords_df(input):
  """Drop German stop words from every tokenized statement and re-join the rest.

  Parameters:
    input: pandas Series of token lists (one column, as passed by DataFrame.apply).

  Returns:
    A Series of strings: the remaining words of each statement joined by single spaces.
  """
  def _without_stopwords(tokens):
    remaining = []
    for token in tokens:
      if token not in stopWordsGerman:
        remaining.append(token)
    #Joining the kept words turns the token list back into a sentence
    return ' '.join(remaining)

  return input.apply(_without_stopwords)

In [22]:
#Removing stop words from the "Statement" column; each entry becomes a plain string again
df["Statement"] = df[["Statement"]].apply(remove_stopwords_df)

In [23]:
#Removing stop words in user's sentence
def remove_stopwords_user(input):
  """Drop German stop words from the user's token list and join the rest into a sentence.

  Parameters:
    input: list of word tokens.

  Returns:
    A string with the remaining words separated by single spaces.
  """
  return ' '.join(token for token in input if token not in stopWordsGerman)

In [24]:
#Removing stop words from the example; example is a string again afterwards
example = remove_stopwords_user(example)
print(example)

covid19impfstoff spikevax moderna europa anwendung kindern ab 6 jahren zugelassen worden


In [None]:
#Saving data after preprocessing part 1 as "cleanedstatements.csv"
from google.colab import files
#Writing the preprocessed frame (including its index) into the current working directory
df.to_csv('cleanedstatements.csv')
#Passing the written file to Colab's download helper
files.download('cleanedstatements.csv')

Lemmatization

In [None]:
#Lemmatization
#Preparation
!pip install -U spacy                       #to install spacy-lemmatizer
!python -m spacy download de_core_news_sm   #to download the German language module

import spacy
nlp = spacy.load("de_core_news_sm")

#NOTE(review): bare expression in the middle of the cell, so the pipeline component names are not displayed
nlp.pipe_names

#The attribute ruler is used below to define specific lemmas ourselves
ar = nlp.get_pipe('attribute_ruler')

#Each ar.add call maps every listed token text to the given lemma.
#The patterns are written in lower case, like the lower-cased statements.

#Phrases related to covid
ar.add([[{"TEXT":"vakzinen"}],[{"TEXT":"vakzine"}],[{"TEXT":"vaccination"}],[{"TEXT":"covid19impfung"}],[{"TEXT":"covidimpfung"}],[{"TEXT":"covid19schutzimpfung"}],[{"TEXT":"covidschutzimpfung"}],[{"TEXT":"coronaschutzimpfung"}]],{"LEMMA":"impfung"})
ar.add([[{"TEXT":"impfstoffe"}],[{"TEXT":"impfstoffen"}]],{"LEMMA":"impfstoff"})
ar.add([[{"TEXT":"covid19"}],[{"TEXT":"sarscov2"}]],{"LEMMA":"corona"})
ar.add([[{"TEXT":"coronaerkrankung"}],[{"TEXT":"coronainfektion"}],[{"TEXT":"covid19infektion"}],[{"TEXT":"sarscov2infektion"}]],{"LEMMA":"corona"})
ar.add([[{"TEXT":"covid19virus"}],[{"TEXT":"sarscov2virus"}], [{"TEXT":"coronaviren"}]], {"LEMMA":"coronavirus"})
#NOTE(review): "covidimpfstoffe" is listed twice below; one entry was possibly meant to be a different spelling — confirm
ar.add([[{"TEXT":"covid19impfstoff"}],[{"TEXT":"covidimpfstoffe"}],[{"TEXT":"covidimpfstoffe"}]],{"LEMMA":"coronaimpfstoff"})
#NOTE(review): "covidiauffrischungsimpfung" looks like a typo for "covidauffrischungsimpfung" — confirm
ar.add([[{"TEXT":"covid19auffrischimpfung"}],[{"TEXT":"covidauffrischimpfung"}],[{"TEXT":"coronaauffrischimpfung"}],[{"TEXT":"auffrischungsimpfung"}],[{"TEXT":"covid19auffrischungsimpfung"}],[{"TEXT":"covidiauffrischungsimpfung"}],[{"TEXT":"coronaauffrischungsimpfung"}]],{"LEMMA":"auffrischimpfung"})

#Other phrases
ar.add([[{"TEXT":"altersgruppe"}]],{"LEMMA":"altersklasse"})
ar.add([[{"TEXT":"myokarditis"}],[{"TEXT":"myokarditiden"}]],{"LEMMA":"herzmuskelentzündung"})
ar.add([[{"TEXT":"todesfälle"}],[{"TEXT":"sterbefall"}],[{"TEXT":"sterbefälle"}],[{"TEXT":"sterberate"}],[{"TEXT":"todesrate"}]],{"LEMMA":"todesfall"})
ar.add([[{"TEXT":"fehlgeburten"}],[{"TEXT":"spontanabort"}],[{"TEXT":"fehlgeburt"}],[{"TEXT":"stillgeburt"}],[{"TEXT":"totgeburt"}]],{"LEMMA":"fehlgeburt"})


In [27]:
#Re-loading the preprocessed statements saved earlier; "Index" becomes the row index again
df = pd.read_csv("cleanedstatements.csv", index_col='Index')

In [28]:
def lemmatize_df(input):
  """Replace every word of one statement by its lemma.

  Parameters:
    input: one statement as a string (the caller uses Series.apply, so the
           function is called once per statement).

  Returns:
    The lemmas joined by single spaces, in lower case.
  """
  doc = nlp(input)
  lemmas = [token.lemma_ for token in doc]
  #Through lemmatization, words with capital letters reappeared, so make everything lower case again
  return ' '.join(lemmas).lower()

In [29]:
#Lemmatizing the "Statement" column; single brackets select a Series, so lemmatize_df is called once per statement
df["Statement"] = df["Statement"].apply(lemmatize_df)

In [30]:
def lemmatize_user(input):
  """Replace every word of the user's sentence by its lemma.

  Parameters:
    input: the preprocessed sentence as a string.

  Returns:
    The lemmas joined by single spaces, in lower case.
  """
  doc = nlp(input)
  lemmas = [token.lemma_ for token in doc]
  #Through lemmatization, words with capital letters reappeared, so make everything lower case again
  return ' '.join(lemmas).lower()

In [31]:
#Lemmatizing the example sentence
example = lemmatize_user(example)
print(example)

coronaimpfstoff spikevax moderna europa anwendung kind ab 6 jahr zulassen werden


In [None]:
#Saving data after lemmatization part 1 as "lemmatizedstatements.csv"
from google.colab import files
#Writing the lemmatized frame (including its index) into the current working directory
df.to_csv('lemmatizedstatements.csv')
#Passing the written file to Colab's download helper
files.download('lemmatizedstatements.csv')

Machine Learning Model: Logistic Regression

In [33]:
#Preparing fake news detector: training model with data

In [None]:
#Following code is from: https://www.kaggle.com/code/shahkan/text-classification-using-logistic-regression/data
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sqlite3 import Error
from sklearn.ensemble import RandomForestClassifier
import sqlite3
import pickle
import nltk
nltk.download('stopwords')
%matplotlib inline

In [35]:
#Re-loading the lemmatized statements; no index_col is given here, so a default integer index is created
df = pd.read_csv('lemmatizedstatements.csv')

In [36]:
#Stopwords are the same as above
#TF-IDF over unigrams and bigrams (ngram_range=(1, 2)); min_df=3 drops terms that occur in fewer than 3 statements
vectorizer = TfidfVectorizer(min_df= 3, stop_words=stopWordsGerman, sublinear_tf=True, norm='l2', ngram_range=(1, 2))
#Fitting on all statements to inspect the feature matrix; final_features is not used again in the visible notebook
final_features = vectorizer.fit_transform(df['Statement']).toarray()
final_features.shape

(999, 899)

In [37]:
#Alternative Logistic Regression model with code from: https://www.analyticsvidhya.com/blog/2021/04/beginners-guide-to-logistic-regression-using-python/ and: https://www.kaggle.com/code/shahkan/text-classification-using-logistic-regression/notebook

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

X = df['Statement']   #input
Y = df['True/False']  #output
#70/30 split, stratified so both classes keep their proportions in train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)

#Pipeline: TF-IDF features -> chi2 selection of the 600 best features -> logistic regression
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=600)),
                     ('clf', LogisticRegression(random_state=42))])
#The fitted pipeline is the model used by predict_app
model = pipeline.fit(X_train, y_train)
predictions = model.predict(X_test)
#Probabilities are computed for ALL statements, i.e. including the training rows
probability = model.predict_proba(X)
print(probability)   #left column of matrix: probability of predicted output being 0, right column: probability of predicted output being 1; rows: per each of our statements

print(classification_report(y_test, predictions))

print(confusion_matrix(y_test, predictions))

[[0.64113865 0.35886135]
 [0.38188611 0.61811389]
 [0.84097471 0.15902529]
 ...
 [0.23272025 0.76727975]
 [0.72531773 0.27468227]
 [0.74294346 0.25705654]]
              precision    recall  f1-score   support

       False       0.81      0.81      0.81       149
        True       0.81      0.81      0.81       151

    accuracy                           0.81       300
   macro avg       0.81      0.81      0.81       300
weighted avg       0.81      0.81      0.81       300

[[120  29]
 [ 28 123]]


Creating Fake News Detector with Anvil

In [None]:
#Installing the packages for the Anvil connection
#NOTE(review): fastai/fastcore are installed here, but only load_learner is imported from fastai and it is not called in the visible notebook — confirm whether they are needed
!pip install -Uqq fastai --upgrade
!pip install -Uqq fastcore --upgrade
#NOTE(review): pandas is pinned to 1.1.0 although pandas was already imported and used above; presumably the pin only takes effect after a runtime restart — confirm
!pip install -Uqq pandas==1.1.0
!pip install -Uqq anvil-uplink

In [39]:
from fastai.text.all import load_learner
import anvil.server

In [None]:
#Connecting this notebook to the Anvil app.
#The uplink key is a secret: it is read from the ANVIL_UPLINK_KEY environment variable or asked for
#interactively instead of being stored in the notebook. A key that was ever committed should be rotated in Anvil.
import os
from getpass import getpass

anvil.server.connect(os.environ.get("ANVIL_UPLINK_KEY") or getpass("Anvil uplink key: "))

In [41]:
@anvil.server.callable
def predict_app(input):
  """Classify a user's statement as correct or false (callable from the Anvil app).

  The sentence goes through the same preprocessing steps as the training
  statements; otherwise the model could not match it against the features it
  has learned.

  Parameters:
    input: the user's sentence as a string.

  Returns:
    "The statement is correct" or "The statement is false".
  """
  cleaned = remove_punctuation_user(input)
  cleaned = preprocess_user(cleaned)
  tokens = tokenize_user(cleaned)
  cleaned = remove_stopwords_user(tokens)
  cleaned = lemmatize_user(cleaned)
  #The pipeline expects an iterable of documents, hence the one-element list
  prediction = model.predict([cleaned])
  #In the training data "True/False" == 1 marks a correct statement and 0 a false one
  #(see the df_target table). The former mapping returned "correct" for 0, i.e. the answers were swapped.
  if prediction[0] == 1:
    return "The statement is correct"
  return "The statement is false"

In [42]:
#For demo:
#URL to our application: https://XGIH7W2J3XZSQW46.anvil.app/3KGXEUN7XDU6WSMTXUMIEWAZ
  #Correct statement #1: "mRNA-Impfstoffe schützen vor schweren Verläufen."
  #Correct statement #2: "Die COVID-19-Impfstoffe schützen gut vor COVID-19."
  #Correct statement #3: "Es sind keine Mikrochips in Impfstoffen enthalten."
  #False statement #1: "Von Coronaimpfstoffen geht eine hohe Todesgefahr aus."
  #False statement #2: "Chloroquin kann das Covid-Virus abtöten. Impfungen sind unnötig."
  #False statement #3: "Personen mit besonders schwerem Verlauf weisen viele Antikörper gegen SARS-CoV-2 auf."