# Preparing the text data

Data is in the `20_newsgroup` floder. You can see the data detail and download it from [here](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html). 

Glove data we choose the `glove.6B.50d`, which means each word have 40 demensions. You can download it from [here](https://nlp.stanford.edu/projects/glove/), click `glove.6B.zip` to start download. 

The directory structure is like this:
```
word_embedding/
    20_newsgroup/
        alt.atheism/
        comp.windows.x/
        .....
    
    glove.6B/
        glove.6B.50d.txt
        
    keras_word_embedding_tutorial.ipynb    
```

In [98]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

In [2]:
BASE_DIR = '.'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
print(TEXT_DATA_DIR)
print(GLOVE_DIR)

./20_newsgroup
./glove.6B


In [3]:
os.listdir(TEXT_DATA_DIR)

['talk.politics.mideast',
 '.DS_Store',
 'rec.autos',
 'comp.sys.mac.hardware',
 'alt.atheism',
 'rec.sport.baseball',
 'comp.os.ms-windows.misc',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.med',
 'talk.politics.misc',
 'rec.motorcycles',
 'comp.windows.x',
 'comp.graphics',
 'comp.sys.ibm.pc.hardware',
 'sci.electronics',
 'talk.politics.guns',
 'sci.space',
 'soc.religion.christian',
 'misc.forsale',
 'talk.religion.misc']

In [6]:
path1 = os.path.join(TEXT_DATA_DIR, 'talk.politics.mideast')
print(path1)
print(os.path.isdir(path1))

path2 = os.path.join(TEXT_DATA_DIR, '.DS_Store')
print(path2)
print(os.path.isdir(path2))

path3 = os.path.join(TEXT_DATA_DIR, 'comp.windows.x')
print(path3)
print(os.path.isdir(path3))

./20_newsgroup/talk.politics.mideast
True
./20_newsgroup/.DS_Store
False
./20_newsgroup/comp.windows.x
True


In [10]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id

In [11]:
print(labels_index)

{'alt.atheism': 0, 'comp.graphics': 1, 'comp.os.ms-windows.misc': 2, 'comp.sys.ibm.pc.hardware': 3, 'comp.sys.mac.hardware': 4, 'comp.windows.x': 5, 'misc.forsale': 6, 'rec.autos': 7, 'rec.motorcycles': 8, 'rec.sport.baseball': 9, 'rec.sport.hockey': 10, 'sci.crypt': 11, 'sci.electronics': 12, 'sci.med': 13, 'sci.space': 14, 'soc.religion.christian': 15, 'talk.politics.guns': 16, 'talk.politics.mideast': 17, 'talk.politics.misc': 18, 'talk.religion.misc': 19}
[]


In [16]:
# texts = []  # list of text samples
# labels_index = {}  # dictionary mapping label name to numeric id
# labels = []  # list of label ids
# for name in sorted(os.listdir(TEXT_DATA_DIR)):
#     path = os.path.join(TEXT_DATA_DIR, name)
#     if os.path.isdir(path):
#         label_id = len(labels_index)
#         labels_index[name] = label_id
#         for fname in sorted(os.listdir(path)):

print(path1)

print(sorted(os.listdir(path1))) # fname = '75364' or '75365'....

./20_newsgroup/talk.politics.mideast
['75364', '75365', '75366', '75367', '75369', '75370', '75371', '75372', '75373', '75374', '75375', '75376', '75377', '75378', '75379', '75381', '75382', '75383', '75384', '75385', '75386', '75387', '75388', '75389', '75390', '75391', '75392', '75393', '75394', '75395', '75396', '75397', '75398', '75399', '75400', '75401', '75402', '75403', '75404', '75405', '75406', '75407', '75408', '75409', '75410', '75411', '75412', '75413', '75414', '75415', '75416', '75417', '75418', '75419', '75420', '75421', '75422', '75423', '75873', '75874', '75875', '75876', '75877', '75878', '75879', '75880', '75881', '75882', '75883', '75884', '75885', '75886', '75887', '75888', '75889', '75890', '75891', '75892', '75893', '75894', '75895', '75896', '75898', '75899', '75900', '75901', '75902', '75903', '75904', '75905', '75906', '75907', '75908', '75909', '75910', '75911', '75912', '75913', '75914', '75915', '75916', '75917', '75918', '75919', '75920', '75921', '75922',

In [19]:
for i, fname in enumerate(sorted(os.listdir(path1))): # fname = '75364' or '75365'....
    print(type(fname))
    print(fname.isdigit())
    if i > 2:
        break  

# if a string only contain number, like '75364', string.isdigit() will return true

<class 'str'>
True
<class 'str'>
True
<class 'str'>
True
<class 'str'>
True


In [28]:
# texts = []  # list of text samples
# labels_index = {}  # dictionary mapping label name to numeric id
# labels = []  # list of label ids
# for name in sorted(os.listdir(TEXT_DATA_DIR)):
#     path = os.path.join(TEXT_DATA_DIR, name)
#     if os.path.isdir(path):
#         label_id = len(labels_index)
#         labels_index[name] = label_id
#         for fname in sorted(os.listdir(path)):
#             if fname.isdigit():
#                 fpath = os.path.join(path, fname) # fpath = './20_newsgroup/talk.politics.mideast/75364'
#                 if sys.version_info < (3,):
#                     f = open(fpath)
#                 else:
#                     f = open(fpath, encoding='latin-1')
#                 t = f.read() # see the output 
#                 i = t.find('\n\n') # skip header
#                 if 0 < i:
#                     t = t[i:] # see the output 

f = open('./20_newsgroup/talk.politics.mideast/75364', encoding='latin-1')
t = f.read()
print(t)
print('raw text: ')
print(repr(t))
i = t.find('\n\n')
if 0 < i:
    t = t[i:]
print('\n skip head: ')

print(t)
f.close()

Newsgroups: talk.politics.mideast
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!wupost!uunet!enterpoop.mit.edu!thunder.mcrcim.mcgill.edu!hasan
From: hasan@McRCIM.McGill.EDU
Subject: Re: ISLAM BORDERS. ( was :Israel: misisipi to ganges)
Message-ID: <1993Apr5.183555.20163@thunder.mcrcim.mcgill.edu>
Originator: hasan@lightning.mcrcim.mcgill.edu
Sender: hasan@McRCIM.McGill.EDU (M. Hasan AlHafez)
Nntp-Posting-Host: lightning.mcrcim.mcgill.edu
Organization: McGill Research Centre for  Intelligent Machines
References: <C4to4G.LnG@news.cso.uiuc.edu> <1993Apr2.155220.16185@thunder.mcrcim.mcgill.edu> <2BBC9B34.13517@news.service.uci.edu> <4805@bimacs.BITNET>
Date: Mon, 5 Apr 93 18:35:55 GMT
Lines: 26


In article <4805@bimacs.BITNET>, ehrlich@bimacs.BITNET (Gideon Ehrlich) writes:
|> 
|> Hassan and some other seemed not to be a ware that Jews celebrating on
|> these days Thje Passover holliday the holidy of going a way from 

In [30]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)

In [34]:
print(len(texts))
print(texts) # too large to print 

19997


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [87]:
texts[0]

'\n\nArchive-name: atheism/resources\nAlt-atheism-archive-name: resources\nLast-modified: 11 December 1992\nVersion: 1.0\n\n                              Atheist Resources\n\n                      Addresses of Atheist Organizations\n\n                                     USA\n\nFREEDOM FROM RELIGION FOUNDATION\n\nDarwin fish bumper stickers and assorted other atheist paraphernalia are\navailable from the Freedom From Religion Foundation in the US.\n\nWrite to:  FFRF, P.O. Box 750, Madison, WI 53701.\nTelephone: (608) 256-8900\n\nEVOLUTION DESIGNS\n\nEvolution Designs sell the "Darwin fish".  It\'s a fish symbol, like the ones\nChristians stick on their cars, but with feet and the word "Darwin" written\ninside.  The deluxe moulded 3D plastic fish is $4.95 postpaid in the US.\n\nWrite to:  Evolution Designs, 7119 Laurel Canyon #4, North Hollywood,\n           CA 91605.\n\nPeople in the San Francisco Bay area can get Darwin Fish from Lynn Gold --\ntry mailing <figmo@netcom.com>.  For net 

In [42]:
print(len(labels))
print(labels[:10])

19997
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [40]:
print(set(labels))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}


In [41]:
labels_index

{'alt.atheism': 0,
 'comp.graphics': 1,
 'comp.os.ms-windows.misc': 2,
 'comp.sys.ibm.pc.hardware': 3,
 'comp.sys.mac.hardware': 4,
 'comp.windows.x': 5,
 'misc.forsale': 6,
 'rec.autos': 7,
 'rec.motorcycles': 8,
 'rec.sport.baseball': 9,
 'rec.sport.hockey': 10,
 'sci.crypt': 11,
 'sci.electronics': 12,
 'sci.med': 13,
 'sci.space': 14,
 'soc.religion.christian': 15,
 'talk.politics.guns': 16,
 'talk.politics.mideast': 17,
 'talk.politics.misc': 18,
 'talk.religion.misc': 19}

In [None]:
# write the load data in one cell
# No need to run this cell

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)

print('Found %s texts.' % len(texts))

**info**
- `texts`: contain 19997 text
- `labels`: the label of each text
- `labels_index`: the category of each label

## preprocess with keras

In [74]:
BASE_DIR = '.'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
MAX_SEQUENCE_LENGTH = 1000 # truncate the sequences to a maximum length of 1000 words
MAX_NUM_WORDS = 20000 # we only consider top 20000 most commonly occuring words in the dataset
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
print(TEXT_DATA_DIR)
print(GLOVE_DIR)

./20_newsgroup
./glove.6B


convert all text samples in the dataset into sequences of word indices. A "word index" would simply be an integer ID for the word. We will only consider the top 20,000 most commonly occuring words in the dataset, and we will truncate the sequences to a maximum length of 1000 words.


In [75]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [85]:
tokenizer = Tokenizer(num_words=20000)  # MAX_NUM_WORDS = 20000 
tokenizer.fit_on_texts(texts)


In [86]:
len(tokenizer.word_index)

174074

In [77]:
sequences = tokenizer.texts_to_sequences(texts)

In [78]:
print(len(sequences))
print(len(sequences[0]))
print(sequences[0])


19997
1528
[1237, 273, 1213, 1439, 1071, 1213, 1237, 273, 1439, 192, 2515, 348, 2964, 779, 332, 28, 45, 1628, 1439, 2516, 3, 1628, 2144, 780, 937, 29, 441, 2770, 8854, 4601, 7969, 11979, 5, 12806, 75, 1628, 19, 229, 29, 1, 937, 29, 441, 2770, 6, 1, 118, 558, 2, 90, 106, 482, 3979, 6602, 5375, 1871, 12260, 1632, 17687, 1828, 5101, 1828, 5101, 788, 1, 8854, 4601, 96, 4, 4601, 5455, 64, 1, 751, 563, 1716, 15, 71, 844, 24, 20, 1971, 5, 1, 389, 8854, 744, 1023, 1, 7762, 1300, 2912, 4601, 8, 73, 1698, 6, 1, 118, 558, 2, 1828, 5101, 16500, 13447, 73, 1261, 10982, 170, 66, 6, 1, 869, 2235, 2544, 534, 34, 79, 8854, 4601, 29, 6603, 3388, 264, 1505, 535, 49, 12, 343, 66, 60, 155, 2, 6603, 1043, 1, 427, 8, 73, 1698, 618, 4601, 417, 1628, 632, 11716, 4602, 814, 1628, 691, 3, 1, 467, 2163, 3, 2266, 7491, 5, 48, 15, 40, 135, 378, 8, 1, 467, 6359, 30, 101, 90, 1781, 5, 115, 101, 417, 1628, 632, 17061, 1448, 4317, 45, 860, 73, 1611, 2455, 3343, 467, 7491, 13132, 5814, 1301, 1781, 1, 467, 9477, 667, 117

In [79]:
word_index = tokenizer.word_index

In [80]:
print(len(word_index))
for i, item in enumerate(word_index.items()):
    print(item)
    if i > 10:
        break

# 0 is a reserved index that won't be assigned to any word.

174074
('the', 1)
('to', 2)
('of', 3)
('a', 4)
('and', 5)
('in', 6)
('i', 7)
('is', 8)
('that', 9)
("'ax", 10)
('it', 11)
('for', 12)


In [53]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) # MAX_SEQUENCE_LENGTH = 1000

In [55]:
print(data.shape)
print(data[0])

(19997, 1000)
[   58   576     3   137    51  5922 13449  5870    85     1   347    19
     3    58  7855 10348  6676     4  5923  5871     4   294     3   544
     5   714     2     4  1585  2001     2  2409     4  6308    29 11478
     1    67     1  5871  4087     2  2310   577    29     1  6676  1144
     8  2786     2 12525     4  8856     5  8587  8205     4     3   571
 17063    12   149  1758     3     4   547   323   441     1  9182  6677
    12     1  3749 10008     3   972   154  1172     8  2602   119    85
  1725    30     4  5456  2738  5338     3  2678    24  1370  3077  2478
    51     8 10983    30    85 15439     5  1628   916     5 12806    75
  3329  3158     1  3077  4271   137   544    30   531     4   886  1623
  6977    21   280  4498    29   209  1786   140  1140   280     8  4741
     5   200    16  9649    30     4   628   313  1106  1725     8  8207
     2   408  2200   715  3406  2121 19948     1  8324     4   889   323
    15     1  4431     9     1   118 

In [56]:
labels = to_categorical(np.asarray(labels))
print(labels.shape)

(19997, 20)


In [59]:
# shuffle and split to train and validation data
indices = np.arange(data.shape[0])
print(indices[:10])
np.random.shuffle(indices)
print(indices[:10])
data = data[indices]
labels = labels[indices]

nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_test = data[-nb_validation_samples:]
y_test = labels[-nb_validation_samples:]

[0 1 2 3 4 5 6 7 8 9]
[ 9616 18394 12658   218  5190 14675 19822 18534  6858  1749]


In [None]:
# write in one cell 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

## Preparing the Embedding layer

Mapping the 20000 words to the known embedding in the glove.

In [67]:
with open(os.path.join(GLOVE_DIR, 'glove.6B.50d.txt')) as f:
    i = 0
    for line in f.readlines():
        print(line)
        print(line.split())
        i += 1
        if i > 1:
            break

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581

['the', '0.418', '0.24968', '-0.41242', '0.1217', '0.34527', '-0.044457', '-0.49688', '-0.17862', '-0.00066023', '-0.6566', '0.27843', '-0.14767', '-0.55677', '0.14658', '-0.0095095', '0.011658', '0.10204', '-0.12792', '-0.8443', '-0.12181', '-0.016801', '-0.33279', '-0.1552', '-0.23131', '-0.19181', '-1.8823', '-0.76746', '0.099051', '-0.42125', '-0.19526', '4.0071', '-0.18594', '-0.52287', '-0.31681', '0.00059213', '0.0074449', '0.17778', '-0.15897', '0.012041', '-0.054223', '-0.29871', '-0.15749', '-0.34758', '-0.045637', '-0.44251', '0.1878

In [69]:
# read glove to embedding
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.50d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [70]:
print(len(embeddings_index)) # num of words 

400000


In [90]:
EMBEDDING_DIM = 50
print(len(word_index))

174074


In [92]:
# use embeddings_index to build our embedding_matrix
embedding_matrix = np.zeros((len(word_index)+1, EMBEDDING_DIM)) # fist row represent 0, unknown
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word) # if not find in the dict, return None
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [93]:
embedding_matrix.shape

(174075, 50)

In [94]:
len(word_index)

174074

In [96]:
from keras.layers import Embedding
embedding_layer = Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


## Training a 1D convnet

In [100]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          epochs=2, batch_size=128)

Train on 15998 samples, validate on 3999 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1837fe6c50>