In [2]:
import keras
from tensorflow.python.client import device_lib

import numpy as np
import pandas as pd
from collections import defaultdict
import re
import sys
import os

import seaborn as sns

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

from keras.models import Sequential
from keras.layers.convolutional import Conv3D
from keras.layers.convolutional_recurrent import ConvLSTM2D
from keras.layers.normalization import BatchNormalization
from matplotlib import pyplot as plt
from keras.layers import LSTM, GRU

from sklearn.model_selection import train_test_split

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 50

def clean_str(string):
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

data_train = pd.read_csv('snopes.csv')

# Input Data preprocessing
list_labels = list(set(data_train.claim_label))
texts = []
labels = []

for i in range(data_train.topic.shape[0]):
    text1 = str(data_train.topic[i])
    text = str(data_train.claim[i])
    texts.append(text+text1)
    labels.append(list_labels.index(data_train.claim_label[i]))
    
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad input sequences
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels),num_classes = len(list_labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Train test validation Split
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train, x_test, y_train, y_test = train_test_split( data, labels, test_size=0.20, random_state=42)
print('Size of train, test:', len(y_train), len(y_test))

print(list_labels, 'in train, test :')
print(y_train.sum(axis=0))
print(y_test.sum(axis=0))

#Using Pre-trained word embeddings
embeddings_index = {}
with open('glove.6B.50d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors in Glove.' % len(embeddings_index))

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH)

embedding_vecor_length = 32
modell = Sequential()
modell.add(embedding_layer)
modell.add(Dropout(0.2))
modell.add(Conv1D(filters=32, kernel_size=5, padding='same', activation='relu'))
modell.add(MaxPooling1D(pool_size=2))
modell.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
modell.add(MaxPooling1D(pool_size=2))
modell.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
modell.add(BatchNormalization())
modell.add(Dense(256, activation='relu'))
modell.add(Dense(128, activation='relu'))
modell.add(Dense(64, activation='relu'))
modell.add(Dense(32, activation='relu'))
modell.add(Dense(len(list_labels), activation='softmax'))
modell.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(modell.summary())
modell.fit(x_train, y_train, epochs=5, batch_size=128)

# Test model
test_preds = modell.predict(x_test)
preds = []
actual = []
correct_predictions = 0
for i in range(len(test_preds)):
    x = np.argmax(test_preds[i])
    y = np.argmax(y_test[i])
    preds.append(x)
    actual.append(y)
    if x==y:
        correct_predictions+=1
print("Correct predictions:", correct_predictions)
print("Total number of test examples:", len(y_test))
print("Accuracy of model: ", correct_predictions/float(len(y_test)))

Found 9766 unique tokens.
Shape of data tensor: (16865, 1000)
Shape of label tensor: (16865, 7)
Size of train, test: 13492 3373
['mixture', 'true', 'mfalse', 'false', 'mtrue', 'undetermined', 'legend'] in train, test :
[2101. 1144. 2667. 7027.  393.  126.   34.]
[ 517.  336.  652. 1738.   94.   25.   11.]
Total 400000 word vectors in Glove.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 50)          488350    
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000, 50)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1000, 32)          8032      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 500, 32)           0         
_______________________________________________________________

In [3]:
print(preds.count(0), preds.count(1), preds.count(2), preds.count(3), preds.count(4), preds.count(5), preds.count(6))

493 432 706 1665 77 0 0


In [4]:
print(actual.count(0), actual.count(1), actual.count(2), actual.count(3), actual.count(4), actual.count(5), actual.count(6))

517 336 652 1738 94 25 11
