In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Experiments on Newsgroup20 dataset
---
> Word Embedding
* Glove
* Word2Vec
* training from scratch

> Model
* Uni-LSTM
* Uni-GRU
* Bi-LSTM
* Bi-GRU
* 1D convnet

## 1. Data Exploring & Preparing
* data_dir에는 20개의 뉴스 그룹 하위 폴더가 있음
* 각각의 하위폴더에는 1000개씩의 text파일이 있음

In [2]:
import os

os.chdir("C:/Users/user/study/HandsOnDL/sub_materials/dataset")

In [3]:
data_path = keras.utils.get_file(
    "news20.tar.gz",
    "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz",
    untar=True,
)

import pathlib

data_dir = pathlib.Path(data_path).parent / "20_newsgroup"
dirnames = os.listdir(data_dir)
print("Number of directories:", len(dirnames))
print("Directory names:", dirnames)

fnames = os.listdir(data_dir / "comp.graphics")
print("Number of files in comp.graphics:", len(fnames))
print("Some example filenames:", fnames[:5])

Downloading data from http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz
Number of directories: 20
Directory names: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Number of files in comp.graphics: 1000
Some example filenames: ['37261', '37913', '37914', '37915', '37916']


In [4]:
print(open(data_dir / "comp.graphics" / "38987").read())

Newsgroups: comp.graphics
Path: cantaloupe.srv.cs.cmu.edu!das-news.harvard.edu!noc.near.net!howland.reston.ans.net!agate!dog.ee.lbl.gov!network.ucsd.edu!usc!rpi!nason110.its.rpi.edu!mabusj
From: mabusj@nason110.its.rpi.edu (Jasen M. Mabus)
Subject: Looking for Brain in CAD
Message-ID: <c285m+p@rpi.edu>
Nntp-Posting-Host: nason110.its.rpi.edu
Reply-To: mabusj@rpi.edu
Organization: Rensselaer Polytechnic Institute, Troy, NY.
Date: Thu, 29 Apr 1993 23:27:20 GMT
Lines: 7

Jasen Mabus
RPI student

	I am looking for a hman brain in any CAD (.dxf,.cad,.iges,.cgm,etc.) or picture (.gif,.jpg,.ras,etc.) format for an animation demonstration. If any has or knows of a location please reply by e-mail to mabusj@rpi.edu.

Thank you in advance,
Jasen Mabus  



>article 본문 외에 부가적인 정보들을 포함하는 header 부분이 있음.<br/>전처리를 통해 날려줄 것.

In [5]:
samples = []
labels = []
class_names = []
class_index = 0
for dirname in sorted(os.listdir(data_dir)):   # subject별 디렉토리 순회
    class_names.append(dirname)
    dirpath = data_dir / dirname
    fnames = os.listdir(dirpath)
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:                       # 개별 subject 내부 파일 순회
        fpath = dirpath / fname
        f = open(fpath, encoding="latin-1")
        content = f.read()
        lines = content.split("\n")            # 줄별로 분리
        lines = lines[10:]                     # header 날리기
        content = "\n".join(lines)             # 다시 분리된 line들을 다시 합치기
        samples.append(content)                # 전처리한 text를 sample목록에 추가
        labels.append(class_index)
    class_index += 1

print("Classes:", class_names)
print("Number of samples:", len(samples))

Processing alt.atheism, 1000 files found
Processing comp.graphics, 1000 files found
Processing comp.os.ms-windows.misc, 1000 files found
Processing comp.sys.ibm.pc.hardware, 1000 files found
Processing comp.sys.mac.hardware, 1000 files found
Processing comp.windows.x, 1000 files found
Processing misc.forsale, 1000 files found
Processing rec.autos, 1000 files found
Processing rec.motorcycles, 1000 files found
Processing rec.sport.baseball, 1000 files found
Processing rec.sport.hockey, 1000 files found
Processing sci.crypt, 1000 files found
Processing sci.electronics, 1000 files found
Processing sci.med, 1000 files found
Processing sci.space, 1000 files found
Processing soc.religion.christian, 997 files found
Processing talk.politics.guns, 1000 files found
Processing talk.politics.mideast, 1000 files found
Processing talk.politics.misc, 1000 files found
Processing talk.religion.misc, 1000 files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.ha

In [6]:
# # data shuffle
# seed = 2021
# rng = np.random.RandomState(seed)
# rng.shuffle(samples)
# rng = np.random.RandomState(seed)
# rng.shuffle(labels)

# # train / valid split
# validation_split = 0.2
# num_validation_samples = int(validation_split * len(samples))
# train_samples = samples[:-num_validation_samples]
# val_samples = samples[-num_validation_samples:]
# train_labels = labels[:-num_validation_samples]
# val_labels = labels[-num_validation_samples:]

from sklearn.model_selection import train_test_split

train_samples, train_labels, val_samples, val_labels =\
    train_test_split(samples, labels, test_size = 0.2, random_state = 2021)

In [22]:
print(tf.__version__)

2.1.0


## 2. Create a Vocabulary Index

* TextVectorization : batch의 각 sample string을 token indices의 list로 변환한다. <br/>

구체적인 과정은 다음과 같다.

1. standardize each sample (usually lowercasing + punctuation stripping)
2. split each sample into substrings (usually words)
3. recombine substrings into tokens (usually ngrams)
4. index tokens (associate a unique int value with each token)
5. transform each sample using this index, either into a vector of ints or a dense float vector.

In [7]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# 빈도순으로 20000개의 단어만 고려할 것이며, output되는 시퀀스의 길이는 200으로 할 것임
vectorizer = TextVectorization(max_tokens=20000, 
                               output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

In [8]:
# top 10 words
vectorizer.get_vocabulary()[:10]

[b'the', b'to', b'of', b'a', b'and', b'in', b'is', b'i', b'that', b'it']

In [9]:
# vectorization example
output = vectorizer([["the cat sat on the mat"], 
                     ["i want to study only deep learning all day long."]])
output.numpy()

array([[   2, 3830, 1678,   15,    2, 6367,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [10]:
# get dictionary mapping words to indices
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [21]:
voc

[b'the',
 b'to',
 b'of',
 b'a',
 b'and',
 b'in',
 b'is',
 b'i',
 b'that',
 b'it',
 b'for',
 b'you',
 b'this',
 b'on',
 b'be',
 b'not',
 b'are',
 b'have',
 b'with',
 b'as',
 b'or',
 b'if',
 b'was',
 b'but',
 b'they',
 b'from',
 b'by',
 b'at',
 b'an',
 b'my',
 b'can',
 b'what',
 b'all',
 b'would',
 b'there',
 b'one',
 b'will',
 b'do',
 b'about',
 b'writes',
 b'we',
 b'so',
 b'he',
 b'has',
 b'your',
 b'no',
 b'article',
 b'any',
 b'me',
 b'some',
 b'who',
 b'were',
 b'which',
 b'its',
 b'out',
 b'dont',
 b'when',
 b'people',
 b'like',
 b'just',
 b'more',
 b'their',
 b'know',
 b'1',
 b'other',
 b'them',
 b'up',
 b'only',
 b'get',
 b'had',
 b'how',
 b'than',
 b'been',
 b'think',
 b'his',
 b'lines',
 b'also',
 b'2',
 b'x',
 b'then',
 b'does',
 b'use',
 b'time',
 b'im',
 b'these',
 b'should',
 b'could',
 b'well',
 b'new',
 b'us',
 b'good',
 b'may',
 b'because',
 b'even',
 b'now',
 b'am',
 b'very',
 b'see',
 b'into',
 b'those',
 b'why',
 b'0',
 b'way',
 b'much',
 b'make',
 b'many',
 b'first',

In [16]:
test = ["the", "cat", "sat", "on", "the", "mat"]


KeyError: 'the'

## 3. Load Pre-trained Word Embeddings

In [None]:
os.chdir("C:/Users/user/study/HandsOnDL/sub_materials/glove")

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
path_to_glove_file = os.path.join(
    os.path.expanduser("~"), ".keras/datasets/glove.6B.100d.txt"
)

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))