# Data 준비

In [3]:
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np

In [4]:
df=pd.read_csv('data/fake_or_real_news.csv',usecols=[1,2,3])

# 문장 분리
df.title=df.title.map(lambda x: [x])
df.text=df.text.map(lambda x: [i.strip() for i in x.split('\n')])
df['data']=df.title+df.text

In [5]:
df.head(5)

Unnamed: 0,title,text,label,data
0,[You Can Smell Hillary’s Fear],"[Daniel Greenfield, a Shillman Journalism Fell...",FAKE,"[You Can Smell Hillary’s Fear, Daniel Greenfie..."
1,[Watch The Exact Moment Paul Ryan Committed Po...,[Google Pinterest Digg Linkedin Reddit Stumble...,FAKE,[Watch The Exact Moment Paul Ryan Committed Po...
2,[Kerry to go to Paris in gesture of sympathy],[U.S. Secretary of State John F. Kerry said Mo...,REAL,"[Kerry to go to Paris in gesture of sympathy, ..."
3,[Bernie supporters on Twitter erupt in anger a...,"[— Kaydee King (@KaydeeKing) November 9, 2016 ...",FAKE,[Bernie supporters on Twitter erupt in anger a...
4,[The Battle of New York: Why This Primary Matt...,[It's primary day in New York and front-runner...,REAL,[The Battle of New York: Why This Primary Matt...


In [28]:
def getGlove(dim):
    # https://nlp.stanford.edu/projects/glove/
    # 6B tokens, 400K voca -> 60억 토큰 사용 & 40만 단어 사전
    df = pd.read_csv('data/glove.6B/glove.6B.%sd.txt' %(dim), sep=" ", quoting=3, header=None, index_col=0)
    glove = {key: val.values for key, val in df.T.items()}
    print("- done. {} tokens".format(len(list(glove.keys())))   )
    # format -> word : vector
    return glove

In [29]:
raw_glove=getGlove(glove_dim)

- done. 399998 tokens


In [30]:
type(raw_glove), len(raw_glove)

(dict, 399998)

In [34]:
list(raw_glove.keys())[0], raw_glove[list(raw_glove.keys())[0]]

(nan, array([ 0.17854 ,  0.6914  ,  0.13973 , -0.069674, -0.3969  ,  0.02561 ,
         0.34453 ,  0.41245 ,  0.53813 , -0.1873  ,  0.69401 , -0.2321  ,
        -0.17245 , -0.090935,  0.5075  ,  0.06169 , -0.53494 ,  0.59271 ,
         0.10355 ,  0.19821 ,  0.080418, -0.33788 , -0.5333  , -0.19901 ,
        -0.078666,  0.18881 , -0.33156 ,  0.14503 ,  0.23971 , -0.21635 ,
         0.20574 ,  0.42454 , -0.41191 , -0.39644 ,  0.23624 , -0.26326 ,
        -0.29633 , -0.22775 , -0.36422 ,  0.064039,  0.43746 ,  0.44278 ,
        -0.026173, -0.059415,  0.15829 , -0.87434 , -0.21649 ,  0.4958  ,
        -0.77992 ,  0.36909 ,  0.25808 ,  0.20311 ,  0.010877, -0.47275 ,
        -0.75945 ,  0.82907 ,  0.59164 , -0.1137  , -0.43274 , -0.67147 ,
        -0.43796 , -0.33914 , -0.83799 , -0.58472 ,  0.64412 , -0.12344 ,
         0.13492 , -0.60226 , -0.34824 ,  0.48464 ,  0.60218 ,  0.29932 ,
         0.14631 , -0.15356 ,  0.24187 , -0.24425 ,  0.34556 ,  0.31079 ,
         0.30814 , -1.0065  , -0.

In [None]:
# model에 사용하는 데이터 처리

In [7]:
def load_vocab(filename):
    d = dict()
    with open(filename, encoding='utf-8') as f:
        for idx, word in enumerate(f):
            word = word.strip()
            d[word] = idx
    # format -> dict[word] = mat index
    return d
def load_embedding_mat(filename):
    with np.load(filename) as data:
        return data["embeddings"]

def importProcessedData():
    global vocabs_path
    global glove_mat_path
    global vocabs_path
    global tf_idf_mat_path
    
    vocabs=load_vocab(vocabs_path)
    #  [ num vocabs x glove_dim ] mat
    glove_mat=load_embedding_mat(glove_mat_path)
    
    tf_idf_vocabs=load_vocab(vocabs_path)
    #  [ num all text x tf_idf_vocabs ] mat
    tf_idf_mat=load_embedding_mat(tf_idf_mat_path)
    
    return vocabs, glove_mat, tf_idf_vocabs, tf_idf_mat

def importTextData():
    global train_text_path
    global test_text_path

    train=pd.read_pickle(train_text_path)
    test=pd.read_pickle(test_text_path)
    
    return train,test

In [6]:
vocabs_path='data/processed_data/vocab.txt'
glove_mat_path='data/processed_data/embedding_mat.npz'
tf_idf_vocabs_path='data/processed_data/tf_idf_vocab.txt'
tf_idf_mat_path='data/processed_data/tf_idf_mat.npz'
train_text_path='data/train_test/train.pkl'
test_text_path='data/train_test/test.pkl'

In [8]:
train_text,test_text = importTextData()
vocabs, glove_mat, tf_idf_vocabs, tf_idf_mat = importProcessedData()
trainX,trainY,testX,testY = train_text.data, train_text.label, test_text.data, test_text.label

In [9]:
train_text.head(3)

Unnamed: 0,data,label
3837,"[[kerry, marks, opening, us, embassy, havana, ...",REAL
3256,"[[clinton, struggles, contain, media, barrage,...",REAL
2029,"[[revealed, several, ku, klux, klan, units, ac...",FAKE


In [11]:
type(vocabs), len(vocabs)

(dict, 54648)

In [14]:
type(glove_mat), glove_mat.shape

(numpy.ndarray, (54648, 100))

In [13]:
type(tf_idf_vocabs), len(tf_idf_vocabs)

(dict, 54648)

In [15]:
type(tf_idf_mat), tf_idf_mat.shape

(numpy.ndarray, (6335, 200))

In [19]:
# tf idf 벡터 train, test로 분리
train_tf_idf_mat=tf_idf_mat[train_text.index]
test_tf_idf_mat=tf_idf_mat[test_text.index]

# label 벡터 생성

In [16]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(trainY.unique()) # label  FAKE , REAL -> 0, 1 변경
trainY=le.transform(trainY.tolist()).reshape([-1,1])
testY=le.transform(testY.tolist()).reshape([-1,1])

oe = preprocessing.OneHotEncoder() # one hot encoding
oe.fit(trainY)
trainY=oe.transform(trainY).toarray()
testY=oe.transform(testY).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [17]:
type(trainY), trainY.shape

(numpy.ndarray, (4434, 2))

In [18]:
trainY[:3]

array([[0., 1.],
       [0., 1.],
       [1., 0.]])

# feature 벡터 생성

In [22]:
glove_dim=100

In [24]:
def sentenceToMeanVect(datasets,vocab,embeddingMat):
    # datasets -> multiple news
    # one news format : [ [ word1, word2 ...    ], [ word1, word2 ... ] ]
    global glove_dim
    new_datasets=np.zeros([len(datasets),glove_dim])
    for news_index,dataset in enumerate(datasets):
        one_news=np.zeros([len(dataset),glove_dim])
        for sentence_idx,words in enumerate(dataset):
            words_index_list=[]
            for w in words:
                if w in vocab.keys():
                    words_index_list.append(vocab[w])
            if len(words_index_list)!=0:
                one_news[sentence_idx]=embeddingMat[words_index_list].mean(axis=0)
        new_datasets[news_index]=one_news.mean(axis=0)
            
    return new_datasets

In [25]:
# data ~ ndarray format
print('sentenceToMeanVect using Glove')
trainX_vect=sentenceToMeanVect(trainX.tolist(),vocabs, glove_mat)
testX_vect=sentenceToMeanVect(testX.tolist(),vocabs, glove_mat)

print('glove vect concat tf_idf')
trainX_vect = np.concatenate( (trainX_vect, train_tf_idf_mat), axis=1)
testX_vect = np.concatenate( (testX_vect, test_tf_idf_mat), axis=1)


sentenceToMeanVect using Glove
glove vect concat tf_idf


In [26]:
type(trainX_vect), trainX_vect.shape

(numpy.ndarray, (4434, 300))

In [27]:
trainX_vect[0]

array([-0.0232491 ,  0.04178675,  0.10682759, -0.0266368 ,  0.05431599,
       -0.02640858, -0.11489148,  0.04868763, -0.01568645,  0.00445718,
       -0.02645362, -0.03948718,  0.12083414,  0.00492541, -0.02978713,
       -0.05722605,  0.03868211, -0.05752463, -0.19206976, -0.04230576,
        0.11625884,  0.02834731,  0.06748906,  0.036395  , -0.00531573,
       -0.04176581, -0.05797076, -0.16345916,  0.09490456, -0.01657301,
       -0.0087927 ,  0.18741142, -0.00189893,  0.06651688, -0.02551905,
        0.10524439,  0.05502425,  0.04827616, -0.10352762,  0.00886553,
       -0.30248996, -0.17512941,  0.12447057, -0.00243119,  0.01236538,
       -0.12427934,  0.0639932 , -0.11708108, -0.05097423, -0.30999676,
        0.09288945, -0.03844827,  0.03593322,  0.42547182, -0.05252739,
       -0.95824803, -0.04663363, -0.08924524,  0.68118263,  0.22637737,
       -0.1981861 ,  0.21309017, -0.05186182, -0.13296328,  0.19974258,
       -0.0051713 ,  0.0854251 ,  0.18398628,  0.04925374, -0.04