In [1]:
# write in train.py
import sys, os
sys.path.append(os.pardir)

import tensorflow as tf
import json
import argparse

from data_utils import Data
from models.char_cnn_zhang import CharCNNZhang
from models.char_cnn_kim import CharCNNKim

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='char_cnn_zhang', help='Specifies which model to use: char_cnn_zhang or char_cnn_kim')
FLAGS = parser.parse_args(["--model", "char_cnn_zhang"])

Using TensorFlow backend.


In [3]:
print(FLAGS)
print(FLAGS.model)

Namespace(model='char_cnn_zhang')
char_cnn_zhang


In [4]:
# Load configurations
config = json.load(open('../config.json'))

for key, value in config.items():
    print(key, value)
    print()

notes default

data {'alphabet': 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{}', 'alphabet_size': 69, 'input_size': 1014, 'num_of_classes': 4, 'training_data_source': 'data/ag_news_csv/train.csv', 'validation_data_source': 'data/ag_news_csv/test.csv'}

model char_cnn_zhang

training {'epochs': 5000, 'batch_size': 128, 'evaluate_every': 100, 'checkpoint_every': 100}

char_cnn_zhang {'embedding_size': 128, 'conv_layers': [[256, 7, 3], [256, 7, 3], [256, 3, -1], [256, 3, -1], [256, 3, -1], [256, 3, 3]], 'fully_connected_layers': [1024, 1024], 'threshold': 1e-06, 'dropout_p': 0.5, 'optimizer': 'adam', 'loss': 'categorical_crossentropy'}

char_cnn_kim {'embedding_size': 128, 'conv_layers': [[256, 10], [256, 7], [256, 5], [256, 3]], 'fully_connected_layers': [1024, 1024], 'threshold': 1e-06, 'dropout_p': 0.1, 'optimizer': 'adam', 'loss': 'categorical_crossentropy'}



In [5]:
# See the data 
config['data']

{'alphabet': 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{}',
 'alphabet_size': 69,
 'input_size': 1014,
 'num_of_classes': 4,
 'training_data_source': 'data/ag_news_csv/train.csv',
 'validation_data_source': 'data/ag_news_csv/test.csv'}

In [6]:
# Set the data path in order to run in the notebook 
config['data']["training_data_source"] = '../data/ag_news_csv/train.csv'
config['data']["validation_data_source"] = '../data/ag_news_csv/test.csv'


In [7]:
data_source = '../data/ag_news_csv/train.csv'

import pandas as pd
train_df = pd.read_csv(data_source, header=None)
train_df.head()

Unnamed: 0,0,1,2
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [9]:
train_df[1] = train_df[1] + train_df[2]
train_df = train_df.drop([2], axis=1)
train_df.head()

Unnamed: 0,0,1
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


In [13]:
train_df[1][0]

"Wall St. Bears Claw Back Into the Black (Reuters)Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [18]:
labels_list = train_df[0]
print(len(labels_list))
print(labels_list[:10])

120000
0    3
1    3
2    3
3    3
4    3
5    3
6    3
7    3
8    3
9    3
Name: 0, dtype: int64


### train data to index

In [31]:
texts = train_df[1].values
print(len(texts))
print(texts[:2])

120000
["Wall St. Bears Claw Back Into the Black (Reuters)Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."
 'Carlyle Looks Toward Commercial Aerospace (Reuters)Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.']


In [30]:
# texts contrain all setences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [37]:
texts = [s.lower() for s in texts]
print(texts[:2])

["wall st. bears claw back into the black (reuters)reuters - short-sellers, wall street's dwindling\\band of ultra-cynics, are seeing green again.reuters - short-sellers, wall street's dwindling\\band of ultra-cynics, are seeing green again.", 'carlyle looks toward commercial aerospace (reuters)reuters - private investment firm carlyle group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.reuters - private investment firm carlyle group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.']


In [38]:
# num_words = 70
tk = Tokenizer(num_words = None, filters="", lower=True, char_level=True, oov_token='UNK')

In [39]:
tk.fit_on_texts(texts)

In [40]:
tk.word_index

{' ': 1,
 '!': 54,
 '"': 48,
 '#': 34,
 '$': 50,
 '&': 42,
 "'": 39,
 '(': 36,
 ')': 38,
 '*': 56,
 ',': 25,
 '-': 26,
 '.': 22,
 '/': 43,
 '0': 29,
 '1': 35,
 '2': 37,
 '3': 28,
 '4': 45,
 '5': 44,
 '6': 47,
 '7': 49,
 '8': 51,
 '9': 31,
 ':': 46,
 ';': 27,
 '=': 52,
 '?': 53,
 'UNK': 57,
 '\\': 40,
 '_': 55,
 'a': 3,
 'b': 21,
 'c': 13,
 'd': 11,
 'e': 2,
 'f': 18,
 'g': 17,
 'h': 12,
 'i': 5,
 'j': 32,
 'k': 24,
 'l': 10,
 'm': 16,
 'n': 7,
 'o': 6,
 'p': 15,
 'q': 33,
 'r': 9,
 's': 8,
 't': 4,
 'u': 14,
 'v': 23,
 'w': 20,
 'x': 30,
 'y': 19,
 'z': 41}

In [49]:
alphabet="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1
char_dict

{'!': 41,
 '"': 45,
 '#': 51,
 '$': 52,
 '%': 53,
 '&': 55,
 "'": 44,
 '(': 64,
 ')': 65,
 '*': 56,
 '+': 59,
 ',': 38,
 '-': 60,
 '.': 40,
 '/': 46,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ':': 43,
 ';': 39,
 '<': 62,
 '=': 61,
 '>': 63,
 '?': 42,
 '@': 50,
 '[': 66,
 '\\': 47,
 ']': 67,
 '^': 54,
 '_': 49,
 '`': 58,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '{': 68,
 '|': 48,
 '}': 69,
 '~': 57}

In [50]:
print(max(char_dict.values()))

69


In [51]:
# char_dict['UNK'] = max(char_dict.values()) + 1
tk.word_index = char_dict # use char_dict to replace the tk.word_index
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1 

In [52]:
tk.word_index

{'!': 41,
 '"': 45,
 '#': 51,
 '$': 52,
 '%': 53,
 '&': 55,
 "'": 44,
 '(': 64,
 ')': 65,
 '*': 56,
 '+': 59,
 ',': 38,
 '-': 60,
 '.': 40,
 '/': 46,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ':': 43,
 ';': 39,
 '<': 62,
 '=': 61,
 '>': 63,
 '?': 42,
 '@': 50,
 'UNK': 70,
 '[': 66,
 '\\': 47,
 ']': 67,
 '^': 54,
 '_': 49,
 '`': 58,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '{': 68,
 '|': 48,
 '}': 69,
 '~': 57}

In [54]:
sequence = tk.texts_to_sequences(texts)
print(sequence[:1])

[[23, 1, 12, 12, 70, 19, 20, 40, 70, 2, 5, 1, 18, 19, 70, 3, 12, 1, 23, 70, 2, 1, 3, 11, 70, 9, 14, 20, 15, 70, 20, 8, 5, 70, 2, 12, 1, 3, 11, 70, 64, 18, 5, 21, 20, 5, 18, 19, 65, 18, 5, 21, 20, 5, 18, 19, 70, 60, 70, 19, 8, 15, 18, 20, 60, 19, 5, 12, 12, 5, 18, 19, 38, 70, 23, 1, 12, 12, 70, 19, 20, 18, 5, 5, 20, 44, 19, 70, 4, 23, 9, 14, 4, 12, 9, 14, 7, 47, 2, 1, 14, 4, 70, 15, 6, 70, 21, 12, 20, 18, 1, 60, 3, 25, 14, 9, 3, 19, 38, 70, 1, 18, 5, 70, 19, 5, 5, 9, 14, 7, 70, 7, 18, 5, 5, 14, 70, 1, 7, 1, 9, 14, 40, 18, 5, 21, 20, 5, 18, 19, 70, 60, 70, 19, 8, 15, 18, 20, 60, 19, 5, 12, 12, 5, 18, 19, 38, 70, 23, 1, 12, 12, 70, 19, 20, 18, 5, 5, 20, 44, 19, 70, 4, 23, 9, 14, 4, 12, 9, 14, 7, 47, 2, 1, 14, 4, 70, 15, 6, 70, 21, 12, 20, 18, 1, 60, 3, 25, 14, 9, 3, 19, 38, 70, 1, 18, 5, 70, 19, 5, 5, 9, 14, 7, 70, 7, 18, 5, 5, 14, 70, 1, 7, 1, 9, 14, 40]]


In [55]:
for i, s in enumerate(sequence):
    print(len(s))
    if i > 5:
        break

237
479
415
450
393
433
392


In [61]:
data = pad_sequences(sequence, maxlen=1014, padding='post')
print(data[0])

[23  1 12 ...  0  0  0]


In [62]:
print(data[0][:300])

[23  1 12 12 70 19 20 40 70  2  5  1 18 19 70  3 12  1 23 70  2  1  3 11
 70  9 14 20 15 70 20  8  5 70  2 12  1  3 11 70 64 18  5 21 20  5 18 19
 65 18  5 21 20  5 18 19 70 60 70 19  8 15 18 20 60 19  5 12 12  5 18 19
 38 70 23  1 12 12 70 19 20 18  5  5 20 44 19 70  4 23  9 14  4 12  9 14
  7 47  2  1 14  4 70 15  6 70 21 12 20 18  1 60  3 25 14  9  3 19 38 70
  1 18  5 70 19  5  5  9 14  7 70  7 18  5  5 14 70  1  7  1  9 14 40 18
  5 21 20  5 18 19 70 60 70 19  8 15 18 20 60 19  5 12 12  5 18 19 38 70
 23  1 12 12 70 19 20 18  5  5 20 44 19 70  4 23  9 14  4 12  9 14  7 47
  2  1 14  4 70 15  6 70 21 12 20 18  1 60  3 25 14  9  3 19 38 70  1 18
  5 70 19  5  5  9 14  7 70  7 18  5  5 14 70  1  7  1  9 14 40  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]


In [58]:
for i, s in enumerate(data):
    print(len(s))
    if i > 5:
        break

1014
1014
1014
1014
1014
1014
1014


### get classes 

In [28]:
print(train_df[0].unique())
class_list = train_df[0].values
print(len(class_list))
print(class_list[:10])
# get class
class_list = [x-1 for x in class_list]
print(set(class_list))
print(class_list[:10])

[3 4 2 1]
120000
[3 3 3 3 3 3 3 3 3 3]
{0, 1, 2, 3}
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [29]:
from keras.utils import to_categorical
classes = to_categorical(class_list)
print(classes[:10])

[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]
