#### Imports

In [None]:
import os
import logging
from dotenv import find_dotenv, load_dotenv
import pandas as pd
from newspaper import Article


#### Scrape GermanFakeNC Titles and Bodies
This uses the GermanFakeNC and crawls through its entries to receive title and body from each samples URL

In [10]:
# Functions
def extract_title(url):
    article = Article(url)
    try:
        article.download()
        logger.info('Article title downloaded from %s' % url)
        article.parse()
    except:
        article.title = 'No title'

    return article.title


def extract_text(url):
    article = Article(url)
    try:
        article.download()
        logger.info('Article text downloaded from %s' % url)
        article.parse()
    except:
        article.text = 'No text'

    return article.text


log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)
logger = logging.getLogger()

# Load .env file
load_dotenv(find_dotenv())

# INPUTFILE = os.path.join(os.getenv('PROJECT_DIR'),
#                          'data',
#                          'raw',
#                          'GermanFakeNC.json')

INTERIMFILE = os.path.join('GermanFakeNC_interim_titel_or_text_existing.csv')

df = pd.read_json("germanfakenc.json")
logger.info('Head of dataframe: \n%s' % df.head())

# %% We only take News with an overall rating of at least 0.5
#overall_rating_mask = df['Overall_Rating'] >= 0.5
##ratio_mask = df['Ratio_of_Fake_Statements'].isin([3, 4])
#df_fake = df[overall_rating_mask & ratio_mask].reset_index()

df['titel'] = df['URL'].apply(extract_title)
df['text'] = df['URL'].apply(extract_text)

logger.info('Head of dataframe after parsing: \n%s' % df.head())

# Create mask to filter rows with no information (titel or text)
no_info_mask = (df['titel'] != 'No title') & (df['text'] != 'No text')
df_final = df[no_info_mask]

logger.info('Shape of final dataframe: %s' % str(df_final.shape))
logger.info('dtypes: \n%s' % str(df_final.dtypes))
logger.info('Rows with null values: \n%s' % df_final.isnull().sum())

# Save as csv
try:
    df_final.to_csv(INTERIMFILE, index=False)
    logger.info("CSV was saved to disk")
except Exception:
    logger.exception("Couldn't save CSV to disc \n", exc_info=True)

2021-12-07 08:24:36,220 - root - INFO - Head of dataframe: 
        Date                                                URL  \
0 2017-08-30  https://schluesselkindblog.com/2017/08/30/proz...   
1 2017-12-18  http://blauerbote.com/2017/12/18/bild-journali...   
2 2017-06-02  http://blauerbote.com/2017/06/02/angela-merkel...   
3 2017-09-25  http://smopo.ch/deutschlands-neonazis-waehlen-...   
4 2018-02-17  http://www.truth24.net/gruppenvergewaltigung-s...   

  False_Statement_1_Location False_Statement_1_Index  \
0                       Text                 213-237   
1                       Text                   13-36   
2                      Title                     1-7   
3                      Title                     1-5   
4                      Title                     1-1   

  False_Statement_2_Location False_Statement_2_Index  \
0                                                      
1                       Text                   52-81   
2                       Text    

2021-12-07 08:25:11,218 - root - INFO - Article title downloaded from https://opposition24.com/schamlos-antidemokratisch-unionsfraktion-verteidigt-netzdg/392739
2021-12-07 08:25:11,449 - root - INFO - Article title downloaded from https://philosophia-perennis.com/2018/01/29/kandel-demo/
2021-12-07 08:25:11,745 - root - INFO - Article title downloaded from https://philosophia-perennis.com/2017/10/24/anti-afd-demo/
2021-12-07 08:25:12,671 - root - INFO - Article title downloaded from http://www.truth24.net/armutsmigrant-onaniert-vor-junger-frau-mitten-in-deren-wohnheim-essen/
2021-12-07 08:25:13,477 - root - INFO - Article title downloaded from http://noch.info/2017/11/merkel-co-versuchen-mit-allen-moeglichen-maassnahmen-an-der-macht-zu-bleiben/
2021-12-07 08:25:14,925 - root - INFO - Article title downloaded from http://www.noislam.de/bundesarbeitsagentur-faelscht-statistik-62-aller-hartz-iv-empfaenger-als-nicht-arbeitslos-ausgewiesen/
2021-12-07 08:25:15,399 - root - INFO - Article tit

2021-12-07 08:26:00,255 - root - INFO - Article title downloaded from http://noch.info/2017/10/tabuthema-die-fluechtlingskosten-in-deutschland-sind-astronomisch-hoch/
2021-12-07 08:26:02,143 - root - INFO - Article title downloaded from http://zuerst.de/2017/12/15/auslaendergewalt-einzelfaelle-von-mord-ueber-versuchten-toschlag-bis-hin-zu-massenschlaegerei/
2021-12-07 08:26:02,990 - root - INFO - Article title downloaded from http://www.anonymousnews.ru/2017/10/31/merkel-regime-im-jahr-1849-war-in-deutschland-mehr-meinungsfreiheit-garantiert-als-heute/
2021-12-07 08:26:03,091 - root - INFO - Article title downloaded from http://smopo.ch/rassismus-eklat-bei-nato-uebung/
2021-12-07 08:26:03,098 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/hund-rettet-oma-60-bei-ueberfall-durch-schwarzhaeutigen-fluechtling-vergewaltigung-vereitelt/
2021-12-07 08:26:03,100 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/rentnerin-vs-refugee-kruecke-als

2021-12-07 08:26:29,400 - root - INFO - Article title downloaded from https://www.compact-online.de/forscher-rechnen-volksaustausch-vor-75-millionen-muslime-in-eu-bis-2050-grenzschliessung-zwecklos/
2021-12-07 08:26:29,649 - root - INFO - Article title downloaded from https://philosophia-perennis.com/2017/10/24/deutschland-zensur/
2021-12-07 08:26:29,786 - root - INFO - Article title downloaded from http://smopo.ch/juncker-plant-ehrung-fuer-hitler/
2021-12-07 08:26:31,014 - root - INFO - Article title downloaded from http://www.journalistenwatch.com/2017/12/13/abgeschobener-afghane-landet-am-donnerstag-wieder-in-deutschland/
2021-12-07 08:26:31,890 - root - INFO - Article title downloaded from http://www.truth24.net/migrantenhorde-lockt-maedchen-mit-drogen-und-versucht-es-zu-vergewaltigen/
2021-12-07 08:26:32,928 - root - INFO - Article title downloaded from http://www.rapefugees.net/wirtschaftsfluechtling-vergewaltigt-frau-wirft-sie-aus-fenster-und-vergeht-sich-an-kind-bensheim-hessen

2021-12-07 08:27:01,868 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/genozid-wieder-obdachloser-durch-fluechtlinge-gesteinigt-und-lebendig-begraben/
2021-12-07 08:27:01,923 - root - INFO - Article title downloaded from http://smopo.ch/mossul-fuer-frauen-sicherer-als-stockholm/
2021-12-07 08:27:01,928 - root - INFO - Article title downloaded from http://www.noislam.de/sex-dschihad-in-schweden-frauen-werden-beim-joggen-von-bewaffneten-polizisten-eskortiert/
2021-12-07 08:27:02,918 - root - INFO - Article title downloaded from https://dieunbestechlichen.com/2018/02/spd-bundestagsabgeordnete-und-schmarotzer-freund-karl-lauterbach-verhoehnt-deutsche-arbeiter/
2021-12-07 08:27:04,688 - root - INFO - Article title downloaded from https://www.compact-online.de/wann-kommt-die-asyl-welle-aus-dem-jemen-amis-und-saudis-haben-das-land-verwuestet-und-weitere-topmeldungen-vom-15-august/
2021-12-07 08:27:04,690 - root - INFO - Article title downloaded from https://schluess

2021-12-07 08:27:36,975 - root - INFO - Article title downloaded from http://info-direkt.eu/2017/12/23/erstmals-migranten-aus-afrika-nach-italien-eingeflogen/
2021-12-07 08:27:37,948 - root - INFO - Article title downloaded from http://www.truth24.net/schwarzfahrender-armutsmigrant-in-bomberjacke-bedroht-polizistin-mit-kuechenmesser/
2021-12-07 08:27:38,429 - root - INFO - Article title downloaded from https://opposition24.com/heulsusen-alarm-wenn-staatsfunker/366551
2021-12-07 08:27:38,435 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/projekt-todesborn-fluechtlinge-sollen-mit-deutschen-schuelerinnen-verpaart-werden/
2021-12-07 08:27:38,725 - root - INFO - Article title downloaded from http://zuerst.de/2017/11/27/schwedische-protestanten-im-gender-rausch-heiliger-geist-wird-weiblich-und-gott-geschlechtsneutral/
2021-12-07 08:27:39,189 - root - INFO - Article title downloaded from https://opposition24.com/afd-politiker-berg-studie/393198
2021-12-07 08:27:42,0

2021-12-07 08:28:07,297 - root - INFO - Article title downloaded from https://dieunbestechlichen.com/2017/12/die-ideologie-industrie-kampf-gegen-rechts-bringt-linken-organisationen-100-mio-aus-steuergeldern/
2021-12-07 08:28:07,445 - root - INFO - Article title downloaded from http://smopo.ch/frankreich-keine-post-fuer-migranten/
2021-12-07 08:28:08,263 - root - INFO - Article title downloaded from http://info-direkt.eu/2018/01/20/fp-politiker-kontern-asyl-vorwuerfe/
2021-12-07 08:28:08,316 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/jeden-tag-werden-in-deutschland-10-menschen-von-fluechtlingen-abgestochen/
2021-12-07 08:28:08,781 - root - INFO - Article title downloaded from https://opposition24.com/der-marsch-wenn-staat/410354
2021-12-07 08:28:09,912 - root - INFO - Article title downloaded from http://www.truth24.net/suedlaender-entbloesst-sich-vor-2-maedchen-13-und-onaniert-schon-wieder-in-essen/
2021-12-07 08:28:10,150 - root - INFO - Article title do

2021-12-07 08:28:40,434 - root - INFO - Article title downloaded from http://new.euro-med.dk/20170511-schweden-kriminelle-pharisaische-herrenvolks-juden-erpressen-erfolgreich-schwedische-regierung-mittels-einer-gelogenen-holocaust-mitschuld-anschuldigungfur-geld-fur-ihre-nwo-vernichtung-westlicher-k.php
2021-12-07 08:28:40,473 - root - INFO - Article title downloaded from http://smopo.ch/jetzt-dreht-bruessel-voellig-durch/
2021-12-07 08:28:41,455 - root - INFO - Article title downloaded from http://www.rapefugees.net/fluechtling-zerschneidet-omi-gesicht-vergewaltigt-und-beraubt-sie-die-medien-schweigen-heilbronn/
2021-12-07 08:28:42,040 - root - INFO - Article title downloaded from http://www.guidograndt.de/2018/01/10/kollegenbeitrag-vorsicht-bitcoins/
2021-12-07 08:28:49,097 - root - INFO - Article title downloaded from http://new.euro-med.dk/20171223-weihnachten-wird-in-interreligiosen-kirchen-zur-feier-der-geburt-des-mohammed.php
2021-12-07 08:28:49,103 - root - INFO - Article title

2021-12-07 08:29:31,289 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/1000-fallzunahme-fluechtlinge-vergraetzen-ganze-landstriche-medikamente-alle/
2021-12-07 08:29:31,747 - root - INFO - Article title downloaded from https://opposition24.com/peinliches-ablenkungsmanoever-netzgemeinde-causa/409389
2021-12-07 08:29:31,750 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/fluechtlingshelferin-fluechtet-vor-vergewaltigungen-durch-fluechtlinge-nach-polen/
2021-12-07 08:29:34,863 - root - INFO - Article title downloaded from http://noch.info/2016/05/ausserordenliche-nachricht-kurden-brachten-mit-einer-russischen-rakete-ein-tuerkisches-f-16-nach-unten/
2021-12-07 08:29:37,448 - root - INFO - Article title downloaded from https://dieunbestechlichen.com/2017/12/steigende-butterpreise-machen-weihnachtsgebaeck-teuer-wie-nie-schuld-ist-die-eu/
2021-12-07 08:29:38,581 - root - INFO - Article title downloaded from http://www.truth24.net/offenbar-p

2021-12-07 08:30:01,599 - root - INFO - Article title downloaded from http://www.journalistenwatch.com/2017/12/20/weihnachten-2017-mer-bewache-dae-dom-en-koelle/
2021-12-07 08:30:02,774 - root - INFO - Article title downloaded from http://www.rapefugees.net/sex-jihadist-zieht-maedchen-10-vom-fahrrad-und-vergewaltigt-es-im-gebuesch-brutal-leipzig/
2021-12-07 08:30:03,388 - root - INFO - Article title downloaded from http://www.guidograndt.de/2018/02/08/kollegenbeitrag-die-grosse-familiennachzugs-verarsche/
2021-12-07 08:30:03,456 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/mainz-kind-in-lebensgefahr-nach-gruppenvergewaltigung-durch-drei-islamisten/
2021-12-07 08:30:03,763 - root - INFO - Article title downloaded from http://www.epochtimes.de/politik/deutschland/lueneburg-gymnasium-verschiebt-weihnachtsfeier-muslimin-stoerten-die-lieder-a2299536.html
2021-12-07 08:30:03,869 - root - INFO - Article title downloaded from http://smopo.ch/deutschland-macht-wiede

2021-12-07 08:30:31,721 - root - INFO - Article title downloaded from https://philosophia-perennis.com/2017/10/25/linksextremer-terror-in-halle/
2021-12-07 08:30:32,787 - root - INFO - Article title downloaded from http://www.journalistenwatch.com/2018/01/08/kika-der-kinderschaender-kanal/
2021-12-07 08:30:32,841 - root - INFO - Article title downloaded from http://smopo.ch/deutschland-streicht-die-rede-und-meinungsfreiheit-aus-dem-grundgesetz/
2021-12-07 08:30:33,831 - root - INFO - Article title downloaded from https://dieunbestechlichen.com/2017/12/ohne-auffanglager-in-afrika-wird-europa-bald-kollabieren/
2021-12-07 08:30:33,903 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/wegen-dumm-und-faulheit-fluechtlingsversagen-auf-arbeitsmarkt/
2021-12-07 08:30:33,913 - root - INFO - Article title downloaded from https://blog.halle-leaks.de/araber-reisst-18jaehrige-in-hamburg-am-montag-demonstriert-die-antifa-fuer-mehr-vergewaltigungen/
2021-12-07 08:30:34,853 - r

2021-12-07 08:30:55,415 - root - INFO - Article text downloaded from https://www.bayern-depesche.de/medien/zdf-heute-show-die-unertr%C3%A4gliche-hetze-des-oliver-welke.html
2021-12-07 08:30:56,716 - root - INFO - Article text downloaded from http://www.rapefugees.net/ekelhaft-schwarzafrikaner-vergewaltigt-frau-vor-den-augen-des-partners-auf-zeltplatz/
2021-12-07 08:31:03,786 - root - INFO - Article text downloaded from http://new.euro-med.dk/20180103-eu-luftbrucke-fur-tausende-von-niedrig-iq-muslimen-aus-nordafrika-nach-europa-skandinavische-iq-sinkt-nun-um-6-5-pro-generation.php
2021-12-07 08:31:04,314 - root - INFO - Article text downloaded from http://blauerbote.com/2017/04/02/pfefferspray-gegen-sitzblockade-polizeigewalt-in-thueringen/
2021-12-07 08:31:04,745 - root - INFO - Article text downloaded from http://www.allesroger.at/artikel/armut-in-oesterreich
2021-12-07 08:31:05,791 - root - INFO - Article text downloaded from http://www.journalistenwatch.com/2018/01/11/sondierungen-o

2021-12-07 08:31:40,236 - root - INFO - Article text downloaded from https://www.unzensuriert.at/content/0025635-Fregatte-Mecklenburg-Vorpommern-kooperiert-mit-NGO-Schiffen-als-Migranten-Taxi
2021-12-07 08:31:41,179 - root - INFO - Article text downloaded from http://www.anonymousnews.ru/2017/10/13/gefaengnisse-abschaffen-gruene-wollen-90-prozent-der-haeftlinge-in-offenen-vollzug-entlassen/
2021-12-07 08:31:41,462 - root - INFO - Article text downloaded from http://www.epochtimes.de/politik/europa/polizeichef-von-schweden-warnt-die-regierung-hat-die-kontrolle-ueber-das-land-verloren-integration-gescheitert-a2156608.html
2021-12-07 08:31:41,503 - root - INFO - Article text downloaded from https://blog.halle-leaks.de/bestialischer-ueberfall-fluechtling-enthauptet-oma-98-beim-klau-der-halskette-fast/
2021-12-07 08:31:41,518 - root - INFO - Article text downloaded from https://blog.halle-leaks.de/rezession-beendet-keine-arbeitslosigkeit-mehr-betteln-ist-arbeit/
2021-12-07 08:31:42,436 - ro

2021-12-07 08:32:11,007 - root - INFO - Article text downloaded from http://www.truth24.net/asylant-aus-schwarzafrika-vergewaltigt-frankenthalerin-27-am-neujahrsmorgen/
2021-12-07 08:32:11,104 - root - INFO - Article text downloaded from http://smopo.ch/israel-atomare-vernichtung-deutschlands-gefordert/
2021-12-07 08:32:11,109 - root - INFO - Article text downloaded from https://blog.halle-leaks.de/seit-oeffnung-der-bundeswehr-fuer-islamisten-vergewaltigungen-sprunghaft-gestiegen/
2021-12-07 08:32:11,970 - root - INFO - Article text downloaded from http://noch.info/2015/12/deutschland-schickt-taeglich-zig-busse-nach-griechenland-und-bringt-fluechtlinge-auf-einer-wenig-befahrenen-route-reisebuero-merkel-faymann/
2021-12-07 08:32:13,981 - root - INFO - Article text downloaded from http://www.epochtimes.de/gesundheit/bevoelkerungswachstum-kontrollieren-kenianische-aerzte-entdecken-sterilisationsmittel-in-impfstoffen-a1337657.html
2021-12-07 08:32:14,037 - root - INFO - Article text downlo

2021-12-07 08:32:42,145 - root - INFO - Article text downloaded from http://blauerbote.com/2017/09/21/spd-politiker-bringt-blogger-in-den-knast/
2021-12-07 08:32:43,485 - root - INFO - Article text downloaded from http://noch.info/2016/05/der-cia-agent-bin-laden-ist-lebendig-und-laesst-es-sich-auf-den-bahamas-gut-gehen-edward-snowden/
2021-12-07 08:32:45,279 - root - INFO - Article text downloaded from http://www.epochtimes.de/politik/deutschland/migranten-als-bevoelkerungsersatz-fuer-europaeer-bereits-seit-2000-offiziell-in-planung-a2276366.html
2021-12-07 08:32:46,202 - root - INFO - Article text downloaded from http://www.truth24.net/silvesterattacke-armutsfluechtling-versucht-frau-zu-vergewaltigen-schwabach/
2021-12-07 08:32:46,259 - root - INFO - Article text downloaded from http://www.noislam.de/deutschland-untergang-oder-daemmerung/
2021-12-07 08:32:47,233 - root - INFO - Article text downloaded from https://www.journalistenwatch.com/2018/02/02/familiennachzug-warum-1000-angehoe

2021-12-07 08:33:16,563 - root - INFO - Article text downloaded from https://blog.halle-leaks.de/polizei-keine-zeit-fuer-gruenen-nonsens-in-deutschland-wird-gemessert-und-vergewaltigt-durch-merkel-gaeste/
2021-12-07 08:33:16,791 - root - INFO - Article text downloaded from https://philosophia-perennis.com/2017/11/04/aegypten-nationale-pflicht-frauen-zu-vergewaltigen/
2021-12-07 08:33:17,744 - root - INFO - Article text downloaded from https://dieunbestechlichen.com/2017/12/gefaehrliche-fuehrerin-und-sogar-der-spiegel-will-nun-ein-ende-der-merkel-aera/
2021-12-07 08:33:18,606 - root - INFO - Article text downloaded from http://www.truth24.net/afrikanischer-islamist-zuendet-an-heilig-abend-hund-an-und-verbrennt-ihn-zu-tode/
2021-12-07 08:33:18,648 - root - INFO - Article text downloaded from https://blog.halle-leaks.de/abgelehnter-fluechtling-schlitzt-waerter-im-knast-mit-cuttermesser-auf/
2021-12-07 08:33:18,651 - root - INFO - Article text downloaded from https://blog.halle-leaks.de/zi

2021-12-07 08:33:45,574 - root - INFO - Article text downloaded from http://www.epochtimes.de/politik/europa/40-jahre-multikulti-in-schweden-300-mehr-gewaltverbrechen-und-1472-mehr-vergewaltigungen-a2035485.html
2021-12-07 08:33:46,025 - root - INFO - Article text downloaded from https://opposition24.com/staatsfunk-fantasiert-ueber-rechte-trollfabriken/408637
2021-12-07 08:33:46,818 - root - INFO - Article text downloaded from https://www.unzensuriert.at/content/0025783-Neue-Regierung-auf-Leistungskurs-im-Bildungswesen
2021-12-07 08:33:47,105 - root - INFO - Article text downloaded from http://www.labournet.de/politik/wipo/privatisierung/privatbildung/volksinitiative-gegen-schulprivatisierung-berlin-gestartet-schulprivatisierung-droht-2018-ganz-deutschland/?cat=6757
2021-12-07 08:33:47,480 - root - INFO - Article text downloaded from http://smopo.ch/staatengemeinschaft-trotz-mini-eiszeit-fuer-den-klimaschutz/
2021-12-07 08:33:48,474 - root - INFO - Article text downloaded from https://

2021-12-07 08:34:10,858 - root - INFO - Article text downloaded from https://opposition24.com/poggenburg-und-der-verfassungsschutz/408927
2021-12-07 08:34:11,842 - root - INFO - Article text downloaded from http://www.journalistenwatch.com/2017/10/25/zeit-fuer-fake-news/
2021-12-07 08:34:11,852 - root - INFO - Article text downloaded from https://blog.halle-leaks.de/1-jahr-nach-zuzug-von-jugendlichen-fluechtlingen-einbrueche-in-nietleben-vervierfacht/
2021-12-07 08:34:12,059 - root - INFO - Article text downloaded from http://www.epochtimes.de/politik/welt/schweden-verschaerft-sexualstrafrecht-kein-sex-mehr-ohne-genehmigung-a2300030.html
2021-12-07 08:34:12,307 - root - INFO - Article text downloaded from https://de.sott.net/article/31276-US-Panzer-Transportzug-mit-unbekanntem-Ziel-fuhr-durchs-Vogtland-Bahn-schweigt
2021-12-07 08:34:13,302 - root - INFO - Article text downloaded from https://www.journalistenwatch.com/2018/02/02/grossbritannien-nackte-frauen-zensur-hat-museen-erreicht/


2021-12-07 08:34:57,817 - root - INFO - Article text downloaded from http://www.rapefugees.net/schwarzafrikaner-onaniert-vor-polizei-bespuckt-und-tritt-die-beamten-bahnhof-osnabrueck/
2021-12-07 08:34:58,730 - root - INFO - Article text downloaded from http://www.rapefugees.net/gruppenvergewaligung-wie-die-nrw-justiz-einer-lokalzeitung-verbietet-diese-bilder-zu-veroeffentlichen/#comments
2021-12-07 08:34:58,784 - root - INFO - Article text downloaded from https://schluesselkindblog.com/2018/02/17/zwei-deutsche-wegen-behandlungen-durch-unqualifizierte-migranten-aerzte-gestorben/
2021-12-07 08:34:59,068 - root - INFO - Article text downloaded from http://zuerst.de/2018/01/29/nach-putin-jetzt-auch-netanjahu-deutschland-bestenfalls-eingeschraenkt-souveraen/
2021-12-07 08:34:59,179 - root - INFO - Article text downloaded from http://smopo.ch/kanzlerin-merkel-fuer-unbegrenzt-mehr-moslem-migranten/
2021-12-07 08:34:59,223 - root - INFO - Article text downloaded from http://smopo.ch/deutschlan

2021-12-07 08:35:31,509 - root - INFO - Article text downloaded from http://smopo.ch/un-mitarbeiter-fuer-60-000-vergewaltigungen-verantwortlich/
2021-12-07 08:35:32,463 - root - INFO - Article text downloaded from http://www.truth24.net/kiel-nach-monatelanger-vertuschung-nun-bild-eines-paedo-armutsmigranten-veroeffentlicht/
2021-12-07 08:35:33,528 - root - INFO - Article text downloaded from http://www.journalistenwatch.com/2017/12/19/felix-austria-der-juengste-regierungschef-europas-als-merkels-schreckgespenst/
2021-12-07 08:35:33,534 - root - INFO - Article text downloaded from https://schluesselkindblog.com/2017/12/07/fluechtlingsrat-will-kriminellen-migranten-mehr-handlungsspielraum-lassen/
2021-12-07 08:35:34,617 - root - INFO - Article text downloaded from http://www.journalistenwatch.com/2017/12/22/malmoe-demo-gegen-gruppenvergewaltigungen/
2021-12-07 08:35:35,297 - root - INFO - Article text downloaded from http://www.anonymousnews.ru/2017/10/28/schwarze-kassen-merkels-cdu-erhi

2021-12-07 08:36:02,662 - root - INFO - Article text downloaded from http://blauerbote.com/2017/12/09/landgericht-hamburg-wer-fake-news-aufdeckt-macht-sich-strafbar/
2021-12-07 08:36:03,030 - root - INFO - Article text downloaded from http://www.allesroger.at/artikel/der-alternative-jahresrueckblick-von-alles-roger
2021-12-07 08:36:03,183 - root - INFO - Article text downloaded from https://perspektive-online.net/2018/02/donauwoerth-abgeschobene-fluechtlinge-wollen-nach-italien-ausreisen-polizei-hindert-sie/
2021-12-07 08:36:03,890 - root - INFO - Article text downloaded from http://www.guidograndt.de/2017/11/22/horror-innere-sicherheit-ferngesteuerte-fahrzeuge-mit-sprengstoff-neue-attentatsplaene-von-is-terroristen-in-deutschland/
2021-12-07 08:36:03,976 - root - INFO - Article text downloaded from https://blog.halle-leaks.de/rotrotgruenes-schmarotzer-berlin-baut-24-neue-siedlungen-fuer-50-000-goldstuecke/
2021-12-07 08:36:03,983 - root - INFO - Article text downloaded from https://bl

2021-12-07 08:36:21,961 - root - INFO - Shape of final dataframe: (232, 12)
2021-12-07 08:36:21,961 - root - INFO - dtypes: 
Date                          datetime64[ns]
URL                                   object
False_Statement_1_Location            object
False_Statement_1_Index               object
False_Statement_2_Location            object
False_Statement_2_Index               object
False_Statement_3_Location            object
False_Statement_3_Index               object
Ratio_of_Fake_Statements               int64
Overall_Rating                       float64
titel                                 object
text                                  object
dtype: object
2021-12-07 08:36:21,974 - root - INFO - Rows with null values: 
Date                          17
URL                            0
False_Statement_1_Location     0
False_Statement_1_Index        0
False_Statement_2_Location     0
False_Statement_2_Index        0
False_Statement_3_Location     0
False_Statement_3_Index   

#### Prepare the news.csv for individual (not merged with GermanFakeNC) use
This merely drops columns Kategorie, Quelle and Art and removes duplicates from the news.csv

In [9]:
def prepare_news_csv(filepath):
    """ 
    1.) Drop columns -> Kategorie, Quelle, Art
    2.) Check on duplicate Titel and Body and drop the first entry of duplicates
    3.) Rename Columns in order to match it with the other dataset (GermanFakeNC)
    4.) Add column source_name with news_csv to identifiy the source of a row after merging
    """

    # Read news.csv from disk
    _df = pd.read_csv(filepath)
    logger.debug(_df.info())
    # Drop cols
    logger.info('Null values in news.csv: \n%s' % _df.isnull().sum())
    cols_to_drop = ['Kategorie', 'Quelle', 'Art']
    cols_to_drop = ['Art']
    _df.drop(cols_to_drop, axis=1, inplace=True)
    logger.info('Cols %s dropped' % cols_to_drop)


    # Drop duplicates
    logger.info('Percent duplicated Titel and Body: \n%s' % str(
        _df.duplicated(subset=['Titel', 'Body']).value_counts(normalize=True)))
    _df.drop_duplicates(subset=['Titel', 'Body'], inplace=True)
    logger.info('Duplicates in Titel and Body dropped')

    return _df

#     logger.info('Shape: %s\n Columns: %s' % (df_1.shape, df_1.columns))
#     logger.info('Shape: %s\n Columns: %s' % (df_2.shape, df_2.columns))
#     # Check col names
#     sym_diff = set(df_1).symmetric_difference(set(df_2))
#     assert len(sym_diff) == 0 , 'Differences in colnames of the two datasets'
#     return pd.concat([df_1, df_2], axis=0, ignore_index=True)



log_fmt = '%(asctime)s - %(name)s - %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)
logger = logging.getLogger()

# find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables
load_dotenv(find_dotenv())

NEWS_CSV = os.path.join('news.csv')
OUTPUT = os.path.join('news_cleaned.csv')

df_news = prepare_news_csv(NEWS_CSV)

try:
    df_news.to_csv(OUTPUT, sep=';', index=False)
    logger.info('Final dataset prepared and saved to %s' % OUTPUT)
except Exception:
    logger.exception('File could not be daved to disk\n', exc_info=True )

2021-12-07 07:58:31,336 - root - INFO - Null values in news.csv: 
id               0
url              0
Titel            0
Body             0
Kategorie     1322
Datum            0
Quelle           0
Fake             0
Art          40972
dtype: int64
2021-12-07 07:58:31,343 - root - INFO - Cols ['Kategorie', 'Quelle', 'Art'] dropped


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63868 entries, 0 to 63867
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         63868 non-null  int64 
 1   url        63868 non-null  object
 2   Titel      63868 non-null  object
 3   Body       63868 non-null  object
 4   Kategorie  62546 non-null  object
 5   Datum      63868 non-null  object
 6   Quelle     63868 non-null  object
 7   Fake       63868 non-null  int64 
 8   Art        22896 non-null  object
dtypes: int64(2), object(7)
memory usage: 4.4+ MB


2021-12-07 07:58:31,783 - root - INFO - Percent duplicated Titel and Body: 
False    0.980444
True     0.019556
dtype: float64
2021-12-07 07:58:32,076 - root - INFO - Duplicates in Titel and Body dropped
2021-12-07 07:58:34,189 - root - INFO - Final dataset prepared and saved to news_cleaned.csv


#### Combine GermanFakeNC_interim.csv with news.csv
This unifies both datasets and merges them.
All samples from the GermanFakeNC are treated at "fake", despite some of them having low "over all rating" in regards to fakeness

In [8]:
# Functions


def prepare_news_csv(filepath):
    """ 
    1.) Drop columns -> Kategorie, Quelle, Art
    2.) Check on duplicate Titel and Body and drop the first entry of duplicates
    3.) Rename Columns in order to match it with the other dataset (GermanFakeNC)
    4.) Add column source_name with news_csv to identifiy the source of a row after merging
    """

    # Read news.csv from disk
    _df = pd.read_csv(filepath)
    logger.debug(_df.info())
    # Drop cols
    logger.info('Null values in news.csv: \n%s' % _df.isnull().sum())
    cols_to_drop = ['Kategorie', 'Quelle', 'Art']
    _df.drop(cols_to_drop, axis=1, inplace=True)
    logger.info('Cols %s dropped' % cols_to_drop)


    # Drop duplicates
    logger.info('Percent duplicated Titel and Body: \n%s' % str(
        _df.duplicated(subset=['Titel', 'Body']).value_counts(normalize=True)))
    _df.drop_duplicates(subset=['Titel', 'Body'], inplace=True)
    logger.info('Duplicates in Titel and Body dropped')

    # Rename Cols
    new_cols = {'id': 'src_id',
                'Titel': 'title',
                'Body': 'text',
                'Datum': 'date',
                'Fake': 'fake'}
    _df.rename(columns=new_cols, inplace=True)
    logger.info('Cols renamed')

    # Add col source_name
    _df['src_name'] = 'news_csv'

    return _df


def prepare_germanfake(filepath):
    """ 
    1.) Drop columns -> [False_Statement_1_Location,
                         False_Statement_1_Index,
                         False_Statement_2_Location,
                         False_Statement_2_Index,
                         False_Statement_3_Location,
                         False_Statement_3_Index,
                         Ratio_of_Fake_Statements,
                         Overall_Rating]
        We treat all entries as fakenews, eventhough there are some instances
        that have a very low fake overall ratings!!
    2.) Make index source_id
    3.) Check on duplicate titel and text and drop the first entry of duplicates
    4.) Drop rows where titel or text is null 
    5.) Fill Dates for missing values -> From the URL we can see that the Date could
        be 2017/12 
    6.) Rename Columns in order to match it with the other dataset (news.csv)
    7.) Add label col 'fake' = 1 -> all 1; col 'src_name' = 'GermanFakeNC'
    """

    # Read news.csv from disk
    _df = pd.read_csv(filepath)
    logger.debug(_df.info())
    # Drop cols
    logger.info('Null values in GermanFakeNC_interim.csv: \n%s' % _df.isnull().sum())
    cols_to_drop = ['False_Statement_1_Location',
                    'False_Statement_1_Index',
                    'False_Statement_2_Location',
                    'False_Statement_2_Index',
                    'False_Statement_3_Location',
                    'False_Statement_3_Index',
                    'Ratio_of_Fake_Statements',
                    'Overall_Rating']
    _df.drop(cols_to_drop, axis=1, inplace=True)
    logger.info('Cols %s dropped' % cols_to_drop)

    # Set source_id
    _df.reset_index(inplace=True)
    logger.info('Index reset')
    
    # Drop duplicates
    logger.info('Percent duplicated titel and text: \n%s' % str(
        _df.duplicated(subset=['titel', 'text']).value_counts(normalize=True)))
    _df.drop_duplicates(subset=['titel', 'text'], inplace=True)
    logger.info('Duplicates in titel and text dropped')

    # Drop rows where titel or text is null
    _df.dropna(subset=['titel', 'text'], inplace=True)
    logger.info('Null rows for titel and text dropped')

    # Fill the missing dates
    _df['Date'].fillna(pd.to_datetime('01/12/2017'), inplace=True)

    # Rename Cols
    new_cols = {'index': 'src_id',
                'titel': 'title',
                'Date': 'date',
                'URL': 'url'}
    _df.rename(columns=new_cols, inplace=True)
    logger.info('Cols renamed')

    # Add col source_name
    _df['fake'] = 1
    _df['src_name'] = 'GermanFakeNC'

    return _df


def merge_datasets(df_1, df_2):
    logger.info('Shape: %s\n Columns: %s' % (df_1.shape, df_1.columns))
    logger.info('Shape: %s\n Columns: %s' % (df_2.shape, df_2.columns))
    # Check col names
    sym_diff = set(df_1).symmetric_difference(set(df_2))
    assert len(sym_diff) == 0 , 'Differences in colnames of the two datasets'
    return pd.concat([df_1, df_2], axis=0, ignore_index=True)



log_fmt = '%(asctime)s - %(name)s - %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)
logger = logging.getLogger()

# find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables
load_dotenv(find_dotenv())

NEWS_CSV = os.path.join('news.csv')
GERMAN_FAKE_NC = os.path.join('GermanFakeNC_interim.csv')
OUTPUT = os.path.join('datasets_merged.csv')

df_news = prepare_news_csv(NEWS_CSV)
df_gfn = prepare_germanfake(GERMAN_FAKE_NC)
df_merged = merge_datasets(df_news, df_gfn)

try:
    df_merged.to_csv(OUTPUT, sep=';', index=False)
    logger.info('Final dataset prepared and saved to %s' % OUTPUT)
except Exception:
    logger.exception('File could not be daved to disk\n', exc_info=True )

2021-12-07 07:46:55,076 - root - INFO - Null values in news.csv: 
id               0
url              0
Titel            0
Body             0
Kategorie     1322
Datum            0
Quelle           0
Fake             0
Art          40972
dtype: int64
2021-12-07 07:46:55,076 - root - INFO - Cols ['Kategorie', 'Quelle', 'Art'] dropped


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63868 entries, 0 to 63867
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         63868 non-null  int64 
 1   url        63868 non-null  object
 2   Titel      63868 non-null  object
 3   Body       63868 non-null  object
 4   Kategorie  62546 non-null  object
 5   Datum      63868 non-null  object
 6   Quelle     63868 non-null  object
 7   Fake       63868 non-null  int64 
 8   Art        22896 non-null  object
dtypes: int64(2), object(7)
memory usage: 4.4+ MB


2021-12-07 07:46:55,527 - root - INFO - Percent duplicated Titel and Body: 
False    0.980444
True     0.019556
dtype: float64
2021-12-07 07:46:55,834 - root - INFO - Duplicates in Titel and Body dropped
2021-12-07 07:46:55,835 - root - INFO - Cols renamed
2021-12-07 07:46:55,855 - root - INFO - Null values in GermanFakeNC_interim.csv: 
Date                           17
URL                             0
False_Statement_1_Location      0
False_Statement_1_Index        14
False_Statement_2_Location     79
False_Statement_2_Index        83
False_Statement_3_Location    146
False_Statement_3_Index       148
Ratio_of_Fake_Statements        0
Overall_Rating                  0
titel                           0
text                            3
dtype: int64
2021-12-07 07:46:55,856 - root - INFO - Cols ['False_Statement_1_Location', 'False_Statement_1_Index', 'False_Statement_2_Location', 'False_Statement_2_Index', 'False_Statement_3_Location', 'False_Statement_3_Index', 'Ratio_of_Fake_Statemen

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Date                        215 non-null    object 
 1   URL                         232 non-null    object 
 2   False_Statement_1_Location  232 non-null    object 
 3   False_Statement_1_Index     218 non-null    object 
 4   False_Statement_2_Location  153 non-null    object 
 5   False_Statement_2_Index     149 non-null    object 
 6   False_Statement_3_Location  86 non-null     object 
 7   False_Statement_3_Index     84 non-null     object 
 8   Ratio_of_Fake_Statements    232 non-null    int64  
 9   Overall_Rating              232 non-null    float64
 10  titel                       232 non-null    object 
 11  text                        229 non-null    object 
dtypes: float64(1), int64(1), object(10)
memory usage: 21.9+ KB


2021-12-07 07:46:58,072 - root - INFO - Final dataset prepared and saved to datasets_merged.csv
