# Analisador de Fake News


///////////////// Introdução
//////////////// Sumário
1. Importações
2. Leitura dos Dados
3. Limpeza e Transformações dos Dados
4. Redução de Dimensionalidade dos Dados
5. Visualizações dos dados
6. Avaliação dos Modelos
7. Conclusão

# 1. Importações

In [1]:

from math import log, pi, sqrt
import pandas as pd
import numpy as np
from scipy.linalg import svd 
from textblob import TextBlob, Word
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
stop = stopwords.words('english')
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import multivariate_normal
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
#import pydotplus -> baixar esse cara
from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
np.random.seed(12345)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Leitura dos Dados

In [3]:
dataset = pd.read_csv("data/data.csv") # será utilizado para computar coisas nas visualizações
dataset['Body'] = dataset['Body'].astype(str) 
data = dataset # será limpado e utilizado nos modelos
data.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


# 3. Limpeza dos Dados

In [4]:
# Deixar tudo em lower case
data['Headline'] = data['Headline'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data['Body'] = data['Body'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
# Removendo pontuação
data['Headline'] = data['Headline'].str.replace('[^\w\s]','')
data['Body'] = data['Body'].str.replace('[^\w\s]','')
# Removendo stopwords
data['Headline'] = data['Headline'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data['Body'] = data['Body'].apply(lambda x: " ".join(x for x in str(x).split() if str(x) not in stop))
# 10 Palavras mais frequentes e menos frequentes
freqHeadline = pd.Series(' '.join(data['Headline']).split()).value_counts()[:10]
freqBody = pd.Series(' '.join(data['Body']).split()).value_counts()[:10]
rareHead = pd.Series(' '.join(data['Headline']).split()).value_counts()[-10:]
rareBody = pd.Series(' '.join(data['Body']).split()).value_counts()[-10:]
# Remoção das palavras raras e frequentes
freqHeadline = list(freqHeadline.index)
freqBody = list(freqBody.index)
data['Headline'] = data['Headline'].apply(lambda x: " ".join(x for x in x.split() if x not in freqHeadline))
data['Body'] = data['Body'].apply(lambda x: " ".join(x for x in x.split() if x not in freqBody))
data['Headline'] = data['Headline'].apply(lambda x: " ".join(x for x in x.split() if x not in rareHead))
data['Body'] = data['Body'].apply(lambda x: " ".join(x for x in x.split() if x not in rareBody))
# Lemmatization. Extrai o núcleo/root da palavra
data['Headline'] = data['Headline'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data['Body'] = data['Body'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,four way bob corker skewered donald,image copyright getty image sunday morning don...,1
1,https://www.reuters.com/article/us-filmfestiva...,linklaters war veteran comedy speaks modern am...,london reuters last flag flying comedydrama vi...,1
2,https://www.nytimes.com/2017/10/09/us/politics...,trump fight corker jeopardizes legislative agenda,feud broke public view last week mr corker mr ...,1
3,https://www.reuters.com/article/us-mexico-oil-...,egypt cheiron win tieup pemex mexican onshore ...,mexico city reuters egypt cheiron holding limi...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,jason aldean open snl tribute,country singer jason aldean performing la vega...,1


In [5]:
vectorizer = CountVectorizer()
vectorizer.fit(data['Headline']) # tokenize and build vocabulary
vector = vectorizer.transform(data['Headline']) # encode document
print("Tamanho: ", vector.shape)
head = [key for key in  vectorizer.vocabulary_]
df = pd.DataFrame(vector.toarray(), columns = head)
df['Label'] = data['Label']
df.head()

Tamanho:  (4009, 6633)


Unnamed: 0,four,way,bob,corker,skewered,donald,linklaters,war,veteran,comedy,...,tragic,hervé,leroux,léger,bandage,dress,30minute,100000,antiamerican,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# 4. Redução da Dimensionalidade

In [12]:
from scipy.linalg import svd 

def fisher_score(x, y):
    mean = np.mean(x, axis=0)
    classes = np.unique(y)
    sS = 0
    sD = 0
    Nk = []
    
    meanD = []
    varD = []
    
    for k in classes:
        elements = []
        s = 0
        for i in range(0, len(y)):
            if(y[i] == k):
                s += 1
                elements.append(x[i])
        meanD.append(np.mean(elements, axis=0))
        varD.append(np.var(elements, axis=0))
        Nk.append(s)
    
    for k in range(0, len(classes)):
        sS += (Nk[k] * ((meanD[k] - mean)**2))
        sD += (Nk[k] * varD[k])
    return sS/sD


def compute(x):
    mean = np.mean(x, axis = 0)
    cov = np.cov(np.transpose(x))

    U, S, V = svd(cov)
    S = np.diag(S)
    M = U @ S @ V
    P = np.transpose(U)
    return {'S': S, 'U': U, 'V': V, 'M': M, 'P': P}

def transform(x, rateVariance):
    pca_result = compute(x)
    
    S = pca_result['S']
    P = pca_result['P']
    
 #   autovalores = np.sort([S[i][i] for i in range(len(S))])[::-1]
    #varianciaExplicadaTot = np.sum(np.array(autovalores))
    #varianceExpl = rateVarience*varianciaExplicadaTot

    #matrizTransform = []
    
    matrizTransform = []
    varianciaExplicadaTot = 0
    varianceExpl = 0
    autoValores = []
    for i in range(0, S.shape[0]):
        varianciaExplicadaTot += S[i,i]
        autoValores.append(S[i,i])
    varianceExpl = rateVariance *varianciaExplicadaTot

    autoValOrdenados = np.sort(autoValores)
    autoValOrdenados = autoValOrdenados[::-1]
    
    varianceAtual = 0
    i = 0
    while(varianceAtual<=varianceExpl and i<autoValOrdenados.shape[0]):
        for j in range (0, len(autoValores)):
            if(autoValOrdenados[i]==autoValores[j]):
                matrizTransform.append(P[j])
        varianceAtual += autoValOrdenados[i]
        i += 1
    print(matrizTransform)
    return {'P': matrizTransform, 'Z':np.transpose(np.dot(matrizTransform, np.transpose(x)))}

In [9]:
teste = transform(df, 0.9)

[array([-5.19070206e-04,  4.03975254e-03,  2.05923478e-03, ...,
       -1.87862052e-04,  4.58526830e-04, -9.74389294e-01]), array([ 7.78241368e-06,  4.91268123e-02,  1.00644981e-02, ...,
       -4.97116379e-04, -3.75757313e-04,  6.91643228e-02]), array([-5.73852348e-05, -1.11270942e-02, -2.21894963e-03, ...,
        5.12885174e-04,  6.93742996e-04, -8.08178000e-02]), array([ 2.54469513e-05, -1.64840211e-02,  1.59958358e-03, ...,
        9.78481211e-04, -2.54073130e-03, -5.12303086e-02]), array([-9.32088847e-06,  4.96865496e-02, -1.44919743e-02, ...,
        1.19782574e-04, -2.62000344e-04, -1.81912913e-02]), array([ 5.18832562e-05, -9.91814820e-03, -1.55060653e-02, ...,
       -2.83999691e-04,  1.12641814e-05,  3.45444517e-02]), array([-8.85469966e-05,  1.30003141e-02, -4.51324767e-04, ...,
        1.71305764e-04,  2.25163849e-04,  2.65949010e-02]), array([ 5.45277636e-05, -5.11432716e-03,  4.34251629e-03, ...,
        2.21227971e-04,  1.64707641e-04, -2.38780822e-02]), array([-7.59477

In [17]:
data_pca = pd.DataFrame(teste['Z'])
data_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1411,1412,1413,1414,1415,1416,1417,1418,1419,1420
0,-0.984136,0.099139,-0.061099,-0.01209,-0.05248,0.014151,0.007415,0.038413,-0.000704,-0.030789,...,-0.001678,0.020232,0.000472,-0.00924,0.001888,0.037035,-0.016372,0.029633,-0.022215,-0.005286
1,-0.997899,0.049864,-0.078091,-0.042231,-0.02181,0.017319,0.017306,-0.01577,0.026304,-0.003228,...,-0.012693,-0.010049,-0.003099,-0.015711,0.032297,-0.031333,-0.022945,-0.019722,-0.013273,0.045392
2,-0.980637,0.041399,-0.039046,0.015389,0.003467,-0.018277,0.056461,-0.000112,-0.015045,-0.105576,...,0.014475,-0.106912,-0.024337,0.004951,-0.01209,-0.090093,-0.036275,-0.032207,-0.009252,0.071011
3,-0.946826,0.820671,-0.232496,-0.163384,0.05171,-0.533316,0.017611,-0.146874,0.209861,-0.264192,...,0.036815,0.026961,-0.000106,-0.078252,0.03377,-0.028676,-0.070362,-0.007647,0.001665,-0.002142
4,-1.008231,0.063513,-0.075693,-0.032435,-0.01898,0.027904,0.067577,-0.072386,-0.040464,0.013274,...,-0.004053,0.003594,0.002697,-0.016925,0.027376,0.007729,0.015099,0.003263,0.006775,-0.002134


In [18]:
fisher_scores = fisher_score(df.iloc[:,:-1].values, df['Label'].values)
qnt = 1000 # quantas colunas quer dropar pelo fisher score
indices = [] # armazena os indices dos menores valores do fisher_score -> que é o que vamos dropar
for i in range(qnt): # faz o processamento de armazenar os indices dos menores valores
    indice = np.argmin(fisher_scores)
    fisher_scores = np.delete(fisher_scores, indice)
    indices.append(indice)
data_fisher = df.drop([df.columns[i] for i in indices] ,  axis='columns')
data_fisher.head()

Unnamed: 0,four,way,bob,corker,skewered,donald,linklaters,war,veteran,comedy,...,tragic,hervé,leroux,léger,bandage,dress,30minute,100000,antiamerican,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# 5. Visuzaliações dos dados

# 6. Aplicando Algoritmos