<a href="https://colab.research.google.com/github/Ang3lino/mlnn/blob/master/20NewsEmbedded.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
from keras.layers import Dropout, Dense, GRU, Embedding
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split


from pprint import pprint

from google.colab import drive
drive.mount('/content/gdrive')

#download file here:
#https://drive.google.com/file/d/1rPG-OC2BHa-TqiFongB4Ts_UuBcDLf6t/view?usp=sharing
#Then put it in your own gdrive and maybe change the folder name.
#But maybe it works, i think we should copy the drive.mount into a new cell


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500):
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]
    embeddings_index = {}
    f = open("gdrive/My Drive/glove.6B.50d.txt", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index,embeddings_index)

In [0]:
def Build_Model_RNN_Text(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    """
    def buildModel_RNN(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    word_index in word index ,
    embeddings_index is embeddings index, look at data_helper.py
    nClasses is number of classes,
    MAX_SEQUENCE_LENGTH is maximum lenght of text sequences
    """
    model = Sequential()
    hidden_layer = 3
    gru_node = 32
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    print(gru_node)
    for i in range(0,hidden_layer):
        model.add(GRU(gru_node,return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    model.add(Dropout(dropout))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(nclasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return model

In [0]:
import re
import string

def clean_text(text):
  ''' https://www.youtube.com/watch?v=iQ1bfDMCv_c '''
  '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
  text = text.lower()
  text = re.sub('\[.*?\]', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub('\w*\d\w*', '', text)

  '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
  text = re.sub('[‘’“”…]', '', text)
  text = re.sub('\n', '', text)
  return text

In [0]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
# newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

# predicted = Build_Model_RNN_Text.predict_classes(X_test_Glove)
# print(metrics.classification_report(y_test, predicted))

In [25]:
print(len(X_test))
for x, y in zip(X_test[:5], y_test[:5]):
  x = X_test[0]
  print(type(x))
  pprint(x)
  pprint(y)

7532
<class 'str'>
('From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)\n'
 'Subject: Need info on 88-89 Bonneville\n'
 'Organization: University at Buffalo\n'
 'Lines: 10\n'
 'News-Software: VAX/VMS VNEWS 1.41\n'
 'Nntp-Posting-Host: ubvmsd.cc.buffalo.edu\n'
 '\n'
 '\n'
 ' I am a little confused on all of the models of the 88-89 bonnevilles.\n'
 'I have heard of the LE SE LSE SSE SSEI. Could someone tell me the\n'
 'differences are far as features or performance. I am also curious to\n'
 'know what the book value is for prefereably the 89 model. And how much\n'
 'less than book value can you usually get them for. In other words how\n'
 'much are they in demand this time of year. I have heard that the mid-spring\n'
 'early summer is the best time to buy.\n'
 '\n'
 '\t\t\tNeil Gandler\n')
7
<class 'str'>
('From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)\n'
 'Subject: Need info on 88-89 Bonneville\n'
 'Organization: University at Buffalo\n'
 'Lines: 10\n'
 'News-Software: VAX/

In [26]:
# newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
# newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
print(type(X_train))

X_train = list(map(clean_text, X_train))
X_test = list(map(clean_text, X_test))

# print(X_train.shape)
# print(X_test.shape)

# X, y = X_test, y_test
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.33, random_state=42)


<class 'list'>


In [27]:
X_train_Glove, X_test_Glove, word_index, embeddings_index = loadData_Tokenizer(X_train,X_test)
model_RNN = Build_Model_RNN_Text(word_index,embeddings_index, 20)
model_RNN.fit(X_train_Glove, y_train,
                              validation_data=(X_test_Glove, y_test),
                              epochs=20,
                              batch_size=128,
                              verbose=1)

Found 277434 unique tokens.
(18846, 500)
Total 400000 word vectors.
32
Train on 11314 samples, validate on 7532 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ff12b8cc080>

In [0]:
predicted = model_RNN.predict_classes(X_test_Glove)
print(metrics.classification_report(y_test, predicted))