# Data Preprocessing


In [11]:
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import wordnet
from langdetect import detect
import progressbar

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
tknzr = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/larsmoellerherm/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/larsmoellerherm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Getting to know the Dataset

Es gibt einige Texte die nicht auf Englisch sind und ein paar, die nur leerzeichen, nans,... enthalten.
Diese werden erst gelöscht

In [15]:
news = pd.read_csv('../data/mixed_news/news_dataset.csv')
print("Anzahl an verschiedenen publischern von real news: ",np.unique(news[news.label=="real"].publication).shape[0])

Anzahl an verschiedenen publischern von real news:  10


In [4]:
news = news.drop(["title","Unnamed: 0","publication"],axis=1)
news = news.dropna()

wrong_indexes = []
counter = 0
for text in progressbar.progressbar(news.content):
    try:
        if detect(text) != 'en':
            wrong_indexes.append(counter)
    except:
        wrong_indexes.append(counter)
    counter +=1


100% (28665 of 28665) |##################| Elapsed Time: 0:11:27 Time:  0:11:27


In [10]:
news = news.drop(wrong_indexes)

In [15]:
news.head()

Unnamed: 0,content,label
0,Print They should pay all the back all the mon...,fake
1,Why Did Attorney General Loretta Lynch Plead T...,fake
2,Red State : \nFox News Sunday reported this mo...,fake
3,Email Kayla Mueller was a prisoner and torture...,fake
4,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,fake


In [12]:
print("Fake News %.3f prozent" % (news[news.label=='fake'].shape[0]/news.shape[0]))
print("Real News %.3f prozent" % (news[news.label=='real'].shape[0]/news.shape[0]))
print("Gesamtgröße des Datasets: %i" % news.shape[0])

Fake News 0.437 prozent
Real News 0.563 prozent
Gesamtgröße des Datasets: 27903


Lemmatizing

In [16]:
punctuations="?:!.,;/"
class Splitter(object):
    """
    split the document into sentences and tokenize each sentence
    """
    def __init__(self):
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self,text):
        """
        out : ['What', 'can', 'I', 'say', 'about', 'this', 'place', '.']
        """
        # split into single sentence
        sentences = self.splitter.tokenize(text)
        # tokenization in each sentences
        tokens = [self.tokenizer.tokenize(sent) for sent in sentences]
        return tokens
    
spl = Splitter()

def get_wordnet_pos(treebank_tag):
        """
        return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN

def lemmatizing(s):
    tokens = spl.split(s)
    pos_tokens = [nltk.pos_tag(token) for token in tokens]
    pos_t = [[lemmatizer.lemmatize(word,get_wordnet_pos(pos_tag)) for (word,pos_tag) in pos if word not in punctuations] for pos in pos_tokens]
    sentences = [" ".join(x) for x in pos_t]
    return " ".join(sentences)


In [17]:
news['lem_content'] = news['content'].apply(lambda x: lemmatizing(x))

In [18]:
print(news.lem_content[2])
print(news.content[2])

Red State Fox News Sunday report this morning that Anthony Weiner be cooperate with the FBI which have re-opened ( yes lefty “ re-opened ” ) the investigation into Hillary Clinton ’ s classify email Watch a Chris Wallace report the break news during the panel segment near the end of the show And the news be break while we ’ re on the air Our colleague Bret Baier have just send u an e-mail saying he have two source who say that Anthony Weiner who also have co-ownership of that laptop with his estranged wife Huma Abedin be cooperate with the FBI investigation have give them the laptop so therefore they didn ’ t need a warrant to get in to see the content of say laptop Pretty interesting development Targets of federal investigation will often cooperate hop that they will get consideration from a judge at sentence Given Weiner ’ s well-known penchant for lie it ’ s hard to believe that a prosecutor would give Weiner a deal base on an agreement to testify unless his testimony be very strong

In [19]:
news.to_hdf("../build/preprocessed/lemmatized_news.hdf5",key="data")

## Bag of Words

take dim most used words

In [5]:
dim = 500
news = pd.read_hdf("../build/preprocessed/lemmatized_news.hdf5",key="data")
news.head()

Unnamed: 0,content,label,lem_content
0,Print They should pay all the back all the mon...,fake,Print They should pay all the back all the mon...
1,Why Did Attorney General Loretta Lynch Plead T...,fake,Why Did Attorney General Loretta Lynch Plead T...
2,Red State : \nFox News Sunday reported this mo...,fake,Red State Fox News Sunday report this morning ...
3,Email Kayla Mueller was a prisoner and torture...,fake,Email Kayla Mueller be a prisoner and torture ...
4,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,fake,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...


In [6]:
seed = 42
X_train, X_test, y_train, y_test = train_test_split(news.lem_content,  news.label, test_size=0.3, random_state=seed, shuffle=True, stratify=news.label)

vectorizer = CountVectorizer(max_features=dim, ngram_range=(1,1))
vectorizer.fit(X_train)

used_words = vectorizer.get_feature_names()

X_train_bow = vectorizer.transform(X_train).toarray()
X_test_bow = vectorizer.transform(X_test).toarray()

In [7]:
LE = LabelEncoder()
LE.fit(["fake","real"])
y_train_enc = LE.transform(y_train)
y_test_enc = LE.transform(y_test)

x_train = pd.DataFrame(data=X_train_bow,columns=used_words)
x_train["label"] = y_train_enc
x_test = pd.DataFrame(data=X_test_bow,columns=used_words)
x_test["label"] = y_test_enc

x_train.to_hdf("../build/preprocessed/bow_data_500.hdf5",key="train")
x_test.to_hdf("../build/preprocessed/bow_data_500.hdf5",key="test")