<a href="https://colab.research.google.com/github/manishiitg/ML_Experments/blob/master/nlp/spacy_vectors_with_keras_deep_nets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Trying out spacy word embedding for text classification

Run this with GPU else it takes too long to run

https://stackoverflow.com/a/56949134  make sure to do this to fix spacy error

In [1]:
!python -m spacy download en_core_web_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [0]:
#https://nlpforhackers.io/keras-intro/

%matplotlib inline

import pandas as pd
import numpy as np

import spacy 
nlp = spacy.load("en_core_web_md")

from sklearn.datasets import fetch_20newsgroups


import matplotlib.pyplot as plt;
from itertools import cycle;
import matplotlib.pyplot as plt;
from sklearn.model_selection import train_test_split

from collections import defaultdict



In [0]:
def normalize(comment, lowercase=True, remove_stopwords=True):
    if lowercase:
        comment = comment.lower()
    lines = comment.splitlines()
    lines = [x.strip(' ') for x in lines]
    lines = [x.replace('"', '') for x in lines]
    lines = [x.replace('\\"', '') for x in lines]
    lines = [x.replace(u'\xa0', u'') for x in lines]
    comment = " ".join(lines)
    doc = nlp(comment)

    # for token in doc:
    #   print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
    #     token.shape_, token.is_alpha, token.is_stop)

    words = [token for token in doc if token.is_stop !=
             True and token.is_punct != True]
    # return " ".join(words)
    lemmatized = list()
    for word in words:
        lemma = word.lemma_.strip()
        if lemma:
            lemmatized.append(lemma)
    return lemmatized

In [4]:
import os

def writetofile(dir, filename, data):
    if not os.path.exists(dir):
        os.makedirs(dir)
    f = os.path.join(dir,str(filename))

    with open(f, 'wb') as the_file:
      the_file.write(data)


categories = [ 'comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware','comp.windows.x']
news = fetch_20newsgroups(subset="train", categories=categories)


clean_data  = []

# print(news.keys())

# print(news["filenames"][:10])

# print(len(news["data"][:1000]))

# print(news["target_names"][:50])
# print(news["target"][:10])


max_limit = 4000

targets = news["target"][:max_limit]
filenames = news["filenames"][:max_limit]
news = news["data"][:max_limit]

dir = "news_group_cleaned"

total_words = 0

print("cleaning data")
for i, row in enumerate(news):
  filename = filenames[i]
  filename = filename[(filename.rfind('/'))+1:]
  if os.path.exists(filename):
    with open(os.path.join(dir, filename), 'r') as content_file:
      data = content_file.read()
      cleaned = data.split(" ")
      total_words += len(cleaned)
  else:
    cleaned = normalize(row)
    cleaned = " ".join(cleaned)
    writetofile(dir, filename, cleaned.encode("utf-8"))
    total_words += len(cleaned)
    
  
  clean_data.append(cleaned)

print("data cleaned")

print(clean_data[1000])

print('total words' , total_words)



Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


cleaning data
data cleaned
rcaldrn@med.miami.edu richard calderon subject move icon nntp post host epistat.med.miami.edu organization university miami medical school line 17 kmembry@viamar.uucp kirk membry write > remember read program window icon run away > mouse move near know > program ftp location probably cica remember program look call icofrite cica see ago richard calderon rcaldrn@epi.med.miami.edu university miami school medicine information system compute 1029 nw 15 st miami florida 33136
total words 3804416


In [0]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# tfid_vectorizer = TfidfVectorizer(max_features=20000)

# tfidf = tfid_vectorizer.fit_transform(clean_data)

# feature_names = tfid_vectorizer.get_feature_names()

# # print(tfid_vectorizer.)

# print(len(feature_names))

# print(feature_names[2000:3000])

# # https://stackoverflow.com/a/35615151

# # this is mainly used to extra the max feature nothing else. we can also use count vectorzer for this 
# # we are not going to use the weights 

In [6]:
#prepare text

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2


tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_data)
sequences = tokenizer.texts_to_sequences(clean_data)

# sequences2 = tokenizer.text_to_word_sequence(clean_data) 

word_index = tokenizer.word_index


print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# print(clean_data[0])
# print(data[0])

# print(word_index)

labels = to_categorical(np.asarray(targets))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

Using TensorFlow backend.


Found 64456 unique tokens.
Shape of data tensor: (2936, 1000)
Shape of label tensor: (2936, 5)


In [7]:
EMBEDDINGS_LEN = len(nlp.vocab['apple'].vector)
print(EMBEDDINGS_LEN)


embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDINGS_LEN))
for word, idx in word_index.items():
    try:
        embedding = nlp.vocab[word].vector
        embedding_matrix[idx] = embedding
    except:
      pass

300


In [8]:


from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten

model = Sequential()

model.add(Embedding(len(word_index) + 1,
                            EMBEDDINGS_LEN,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=len(categories), activation='softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.summary()


model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=12, batch_size=128,shuffle=True)

# scores = model.evaluate(X_test, Y_test, verbose=1)

# print("Accuracy: ", scores[1])
# print("Loss: ", scores[0])














Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 300)         19337100  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4992)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)  

<keras.callbacks.History at 0x7faf6b39add8>

In [9]:
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten
from keras.layers import LSTM


model = Sequential()

model.add(Embedding(len(word_index) + 1,
                            EMBEDDINGS_LEN,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=len(categories), activation='softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.summary()


model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=12, batch_size=32,shuffle=True)


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 300)         19337100  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 996, 128)          192128    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 645       
Tota

<keras.callbacks.History at 0x7faf16f02400>