In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,  GlobalMaxPooling1D, LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Activation, Flatten
from keras.models import Model
from keras.initializers import Constant
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

word_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Загрузка данных

In [14]:
real_news_df = pd.read_csv('drive/MyDrive/dataset/True.csv')
fake_news_df = pd.read_csv('drive/MyDrive/dataset/Fake.csv')

In [15]:
real_news_df = real_news_df[real_news_df['text'].str.len() >= 3]
fake_news_df = fake_news_df[fake_news_df['text'].str.len() >=3]
real_news_df['real_fact'] = 1
fake_news_df['real_fact'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_news_df['real_fact'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_news_df['real_fact'] = 0


Очистка данных.



*   Функция decontracted используется для расширения сокращенных слов в данной текстовой фразе.
*  Функция get_cleaned_data принимает входные данные (текстовые данные) и выполняет над ними несколько шагов очистки данных.




In [16]:
def decontracted(phrase):
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def get_cleaned_data(input_data, mode='df'):
    stop = stopwords.words('english')
    input_df = ''
    if mode != 'df':
        input_df = pd.DataFrame([input_data], columns=['text'])
    else:
        input_df = input_data
    input_df['text'] = input_df['text'].str.lower()
    input_df['text'] = input_df['text'].apply(lambda elem: decontracted(elem))
    input_df['text'] = input_df['text'].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
    input_df['text'] = input_df['text'].apply(lambda elem: re.sub(r"\d+", "", elem))
    input_df['text'] = input_df['text'].apply(lambda x: ' '.join([word.strip() for word in x.split() if word not in (stop)]))
    input_df['text'] = input_df['text'].apply(lambda words: (wordnet_lemmatizer.lemmatize(words)))
    return input_df

fake_news_df = get_cleaned_data(fake_news_df)
real_news_df = get_cleaned_data(real_news_df)
#объединение очищенные данные в один news_data_df.
news_data_df = pd.concat([real_news_df, fake_news_df], ignore_index = True)
print(news_data_df.shape)

(44267, 5)


Разделение данных на обучающий и тестовый наборы с использованием функции train_test_split

In [17]:
MAX_SEQUENCE_LENGTH = 500  # Максимальная длина последовательности
MAX_NUM_WORDS = 10000  # Максимальное количество слов
EMBEDDING_DIM = 300  # Размерность вектора слов
VALIDATION_SPLIT = 0.3  # Доля данных для валидации

x_train,x_test,y_train,y_test = train_test_split(news_data_df.text,news_data_df.real_fact,random_state = 42, test_size=VALIDATION_SPLIT, shuffle=True)

Векторизовать образцы текста в двумерный целочисленный тензор

In [18]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

tokenizer.fit_on_texts(x_train)
tokenized_train = tokenizer.texts_to_sequences(x_train)
X_train = pad_sequences(tokenized_train, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found {} unique tokens. and {} lines '.format(len(word_index), len(X_train)))

Found 169780 unique tokens. and 30986 lines 


In [19]:
tokenized_test = tokenizer.texts_to_sequences(x_test)
X_test = pad_sequences(tokenized_test, maxlen=MAX_SEQUENCE_LENGTH)

Получите предварительно обученный индекс внедрения из GoogleNews-vectors-negative300:

Векторы GoogleNews располагаются в порядке от наиболее частого к наименее частому, поэтому первые N обычно представляют собой подмножество размера N.
Поэтому используйте limit=500000, чтобы получить наиболее часто встречающиеся векторы из 500 000 слов, экономя 5/6 памяти/времени загрузки.

In [20]:
from gensim.models.keyedvectors import KeyedVectors
def get_embeddings(path):
  wv_from_bin = KeyedVectors.load_word2vec_format(path, binary=True, limit=500000)
  embeddings_index = {}
  for word, vector in zip(wv_from_bin.key_to_index, wv_from_bin.vectors):
      coefs = np.asarray(vector, dtype='float32')
      embeddings_index[word] = coefs
  return embeddings_index

embeddings_index = {}
embeddings_index = get_embeddings('drive/MyDrive/dataset/GoogleNews-vectors-negative300.bin')
print('Found %s word vectors.' % len(embeddings_index))



Found 500000 word vectors.


Создания матрицы встраивания для модели нейронной сети.

In [21]:
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    try:
        embedding_vector = embeddings_index[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

In [22]:
del embeddings_index

Подготовьте модель CNN с помощью GlobalMaxPooling для классификации.

In [23]:
def cnn_net1():
    model = Sequential()

    #Non-trainable embeddidng layer
    model.add(Embedding(vocab_size, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))

    model.add(Dropout(0.2))
    model.add(Conv1D(filters=128, kernel_size=4, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.2))
    model.add(Dense(units = 250 , activation = 'relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model


Подготовьте модель LSTM

In [24]:
def lstm_net1():
    model = Sequential()

    #Non-trainable embeddidng layer
    model.add(Embedding(vocab_size, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))

    model.add(LSTM(units=128 , return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(units=64))
    model.add(Dropout(0.1))
    model.add(Dense(units = 32 , activation = 'relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

Устанавливаем batch_size = 256,epochs = 8


In [26]:
model = cnn_net1()

batch_size = 256
epochs = 8

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 300)          50934300  
                                                                 
 dropout (Dropout)           (None, 500, 300)          0         
                                                                 
 conv1d (Conv1D)             (None, 497, 128)          153728    
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 250)               32250     
                                                        

In [29]:
history = model.fit(X_train, y_train, batch_size = batch_size , validation_data = (X_test,y_test) , epochs = epochs)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


Оценка производительности на обучающем и тестовом наборах данных.

In [30]:
accr_train = model.evaluate(X_train,y_train)
print('Accuracy Train: {}'.format(accr_train[1]*100))
accr_test = model.evaluate(X_test,y_test)
print('Accuracy Test: {}'.format(accr_test[1]*100))

Accuracy Train: 99.99354481697083
Accuracy Test: 99.69881772994995


In [25]:
model = lstm_net1()

batch_size = 256
epochs = 8

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 300)          50934000  
                                                                 
 lstm (LSTM)                 (None, 500, 128)          219648    
                                                                 
 dropout (Dropout)           (None, 500, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 1)                 3

In [26]:
history = model.fit(X_train, y_train, batch_size = batch_size , validation_data = (X_test,y_test) , epochs = epochs)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


Оценка производительности на обучающем и тестовом наборах данных.

In [27]:
accr_train = model.evaluate(X_train,y_train)
print('Accuracy Train: {}'.format(accr_train[1]*100))
accr_test = model.evaluate(X_test,y_test)
print('Accuracy Test: {}'.format(accr_test[1]*100))

Accuracy Train: 99.87736344337463
Accuracy Test: 99.68376159667969


Все модели с предварительно обученным Word2Vec от GoogleNewsVectors.

*   с GlobalMaxpool 99%
*   LSTM: 99%


Использование Google News Vectors в качестве входных данных для модели CNN и LSTM позволяет улучшить качество классификации фейковых новостей за счет более точного представления слов и их значений. Кроме того, комбинация CNN и LSTM позволяет модели извлекать как локальные, так и глобальные признаки из текста, что способствует лучшему пониманию контекста и выявлению признаков, характеризующих фейковые новости.

