# 表情预测

## 文本分类

In [1]:
import pandas as pd
from keras.utils.data_utils import get_file
import nb_utils

emotion_csv = get_file('text_emotion.csv', 
                       'https://www.crowdflower.com/wp-content/uploads/2016/07/text_emotion.csv')
emotion_df = pd.read_csv(emotion_csv)[:10000]
print(len(emotion_df))
emotion_df.head()

Using TensorFlow backend.


10000


Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [2]:
emotion_df['sentiment'].value_counts()

worry         3115
sadness       2216
neutral       1857
surprise       562
hate           535
happiness      469
love           369
relief         227
fun            211
empty          194
enthusiasm     132
boredom         69
anger           44
Name: sentiment, dtype: int64

### 简单分类器

In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

VOCAB_SIZE = 50000

tfidf_vec = TfidfVectorizer(max_features=VOCAB_SIZE)
label_encoder = LabelEncoder()

X = tfidf_vec.fit_transform(emotion_df['content'])
y = label_encoder.fit_transform(emotion_df['sentiment'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
# 贝叶斯方法
bayes = MultinomialNB()
bayes.fit(X_train, y_train)
predictions = bayes.predict(X_test)
precision_score(predictions, y_test, average='micro')

0.31454545454545457

In [5]:
# 多种分类器
classifiers = {'sgd': SGDClassifier(loss='hinge'),
               'svm': SVC(),
               'random_forrest': RandomForestClassifier()}

for lbl, clf in classifiers.items():
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(lbl, precision_score(predictions, y_test, average='micro'))



sgd 0.3
svm 0.314545454545
random_forrest 0.281515151515


In [8]:
from scipy.sparse import eye
d = eye(len(tfidf_vec.vocabulary_))
word_pred = bayes.predict_proba(d)

inverse_vocab = {idx: word for word, idx in tfidf_vec.vocabulary_.items()}
from collections import Counter, defaultdict
by_cls = defaultdict(Counter)
for word_idx, pred in enumerate(word_pred):
    for class_idx, score in enumerate(pred):
        cls = label_encoder.classes_[class_idx]
        by_cls[cls][inverse_vocab[word_idx]] = score

In [9]:
for k in by_cls:
    words = [x[0] for x in by_cls[k].most_common(5)]
    print(k, ':', ' '.join(words))

anger : confuzzled fridaaaayyyyy transtelecom aaaaaaaaaaa motherfuck
boredom : meanmillies ultra documentation deposits priecing
empty : kimbermuffin shakeyourjunk fooled emuhleepee megabyte6
enthusiasm : candy tatt tinabojo sarahbellum que
fun : universal sexxieluv magners parachute knight
happiness : 10th excellent dazzle chillin laughed
hate : hate grrrr zomberellamcfox unfair dropped
love : love sweetie lovely sayang loved
neutral : ogberry rainy nerd plurk natsmith88
relief : imagination samwilson1 clothes_w dhughesy allies
sadness : past sadly sometimes sad rip
surprise : ship wow sunburnt swpave juice
worry : problem hope worried throat find


### 训练深度模型

In [10]:
# 训练模型
from itertools import chain
from keras.preprocessing.sequence import pad_sequences
import numpy as np

chars = list(sorted(set(chain(*emotion_df['content']))))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
max_sequence_len = max(len(x) for x in emotion_df['content'])

char_vectors = []
for txt in emotion_df['content']:
    vec = np.zeros((max_sequence_len, len(char_to_idx)))
    vec[np.arange(len(txt)), [char_to_idx[ch] for ch in txt]] = 1
    char_vectors.append(vec)
print(len(char_vectors))
char_vectors = np.asarray(char_vectors, dtype=np.float16)
char_vectors = pad_sequences(char_vectors)
labels = label_encoder.transform(emotion_df['sentiment'])


def split(lst):
    training_count = int(0.9 * len(char_vectors))
    return lst[:training_count], lst[training_count:]

training_char_vectors, test_char_vectors = split(char_vectors)
training_labels, test_labels = split(labels)

char_vectors.shape

10000


(10000, 161, 95)

In [10]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Merge, LSTM
from keras.models import Model
from keras.layers.merge import Concatenate
from keras import regularizers

def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    char_input = Input(shape=(max_sequence_len, num_chars), name='input')
    
    conv_1x = Conv1D(128, 6, activation='relu', padding='valid')(char_input)
    max_pool_1x = MaxPooling1D(6)(conv_1x)
    conv_2x = Conv1D(256, 6, activation='relu', padding='valid')(max_pool_1x)
    max_pool_2x = MaxPooling1D(6)(conv_2x)

    flatten = Flatten()(max_pool_2x)
    dense = Dense(128, 
                  activation='relu',
                  kernel_regularizer=regularizers.l2(0.01))(flatten)
    preds = Dense(num_labels, activation='softmax')(dense)

    model = Model(char_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

char_cnn_model = create_char_cnn_model(len(char_to_idx), char_vectors.shape[1], len(label_encoder.classes_))
char_cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 161, 95)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 156, 128)          73088     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 26, 128)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 21, 256)           196864    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 3, 256)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               98432     
__________

In [11]:
char_cnn_model.fit(training_char_vectors, training_labels, epochs=20, batch_size=1024)
char_cnn_model.evaluate(test_char_vectors, test_labels)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[2.0439475917816163, 0.28199999999999997]

In [27]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Merge, LSTM
from keras.models import Model
from keras.layers.merge import Concatenate

def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    char_input = Input(shape=(max_sequence_len, num_chars), name='input')
    
    layers = []
    for window in (5, 6, 7):
        conv_1x = Conv1D(128, window, activation='relu', padding='valid')(char_input)
        max_pool_1x = MaxPooling1D(window)(conv_1x)
        dropout_1x = Dropout(0.3)(max_pool_1x)
        conv_2x = Conv1D(128, window, activation='relu', padding='valid')(dropout_1x)
        max_pool_2x = MaxPooling1D(window)(conv_2x)
        dropout_2x = Dropout(0.3)(max_pool_2x)
        layers.append(dropout_2x)

    if len(layers) > 1:
        merged = Concatenate(axis=1)(layers)
    else:
        merged = layers[0]

    dropout = Dropout(0.3)(merged)
    
    flatten = Flatten()(dropout)
    dense = Dense(128, activation='relu')(flatten)
    preds = Dense(num_labels, activation='softmax')(dense)

    model = Model(char_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

char_cnn_model = create_char_cnn_model(len(char_to_idx), char_vectors.shape[1], len(label_encoder.classes_))
char_cnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 161, 95)      0                                            
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 157, 128)     60928       input[0][0]                      
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 156, 128)     73088       input[0][0]                      
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 155, 128)     85248       input[0][0]                      
__________________________________________________________________________________________________
max_poolin

In [28]:
char_cnn_model.fit(training_char_vectors, training_labels, epochs=20, batch_size=1024)
char_cnn_model.evaluate(test_char_vectors, test_labels)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[1.9796250667572022, 0.29899999999999999]

### 特征化与数据预处理

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot

VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(emotion_df['content'])

# This may take a while to load
w2v, idf = nb_utils.load_w2v(tokenizer)

/home/lq/.keras/datasets/https_s3.amazonaws.com_dl4j_distribution_GoogleNews_vectors_negative300.bin.gz
/home/lq/.keras/datasets/https_s3.amazonaws.com_dl4j_distribution_GoogleNews_vectors_negative300.bin


In [11]:
tokens = tokenizer.texts_to_sequences(emotion_df['content'])
tokens = pad_sequences(tokens)

training_count = int(0.9 * len(tokens))
training_tokens, training_labels = tokens[:training_count], labels[:training_count]
test_tokens, test_labels = tokens[training_count:], labels[training_count:]

In [12]:
from keras import layers, models
import keras.backend as K


def make_embedding(name, vocab_size, embedding_size, weights=None, mask_zero=True):
    if weights is not None:
        return layers.Embedding(mask_zero=mask_zero, input_dim=vocab_size, 
                                output_dim=weights.shape[1], 
                                weights=[weights], trainable=False, 
                                name='%s/embedding' % name)
    else:
        return layers.Embedding(mask_zero=mask_zero, input_dim=vocab_size, 
                                output_dim=embedding_size,
                                name='%s/embedding' % name)

def create_unigram_model(vocab_size, embedding_size=None, embedding_weights=None, idf_weights=None):
    assert not (embedding_size is None and embedding_weights is None)
    message = layers.Input(shape=(None,), dtype='int32', name='message')
    
    embedding = make_embedding('message_vec', vocab_size, embedding_size, embedding_weights)
    idf = make_embedding('message_idf', vocab_size, embedding_size, idf_weights)

    mask = layers.Masking(mask_value=0)
    def _combine_and_sum(args):
        embedding, idf = args
        return K.sum(embedding * K.abs(idf), axis=1)

    sum_layer = layers.Lambda(_combine_and_sum, name='combine_and_sum')
    sum_msg = sum_layer([mask(embedding(message)), idf(message)])
    fc1 = layers.Dense(units=128, activation='relu')(sum_msg)
    categories = layers.Dense(units=len(label_encoder.classes_), activation='softmax')(fc1)
    
    model = models.Model(
        inputs=[message],
        outputs=categories,
    )
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.summary()
    return model

unigram_model = create_unigram_model(vocab_size=VOCAB_SIZE,
                                     embedding_weights=w2v,
                                     idf_weights=idf)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
message (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
message_vec/embedding (Embeddin (None, None, 300)    15000000    message[0][0]                    
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 300)    0           message_vec/embedding[0][0]      
__________________________________________________________________________________________________
message_idf/embedding (Embeddin (None, None, 1)      50000       message[0][0]                    
__________________________________________________________________________________________________
combine_an

In [13]:
unigram_model.fit(training_tokens, training_labels, epochs=10)
unigram_model.evaluate(test_tokens, test_labels, verbose=2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[2.8229330902099607, 0.30199999999999999]

### 内嵌学习

In [14]:
learned_embeddings_model = create_unigram_model(vocab_size=VOCAB_SIZE, embedding_size=25)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
message (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
message_vec/embedding (Embeddin (None, None, 25)     1250000     message[0][0]                    
__________________________________________________________________________________________________
masking_2 (Masking)             (None, None, 25)     0           message_vec/embedding[0][0]      
__________________________________________________________________________________________________
message_idf/embedding (Embeddin (None, None, 25)     1250000     message[0][0]                    
__________________________________________________________________________________________________
combine_an

In [15]:
learned_embeddings_model.fit(training_tokens, training_labels, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efcb594ce80>

In [16]:
learned_embeddings_model.evaluate(test_tokens, test_labels, verbose=2)

[1.9363989353179931, 0.32100000000000001]

### 更复杂模型

CNN

In [17]:
def create_cnn_model(vocab_size, embedding_size=None, embedding_weights=None):
    message = layers.Input(shape=(None,), dtype='int32', name='title')
    
    # The convolution layer in keras does not support masking, so we just allow
    # the embedding layer to learn an explicit value.
    embedding = make_embedding('message_vec', vocab_size, embedding_size, embedding_weights,
                              mask_zero=False)

    def _combine_sum(v):
        return K.sum(v, axis=1)

    cnn_1 = layers.Convolution1D(128, 3)
    cnn_2 = layers.Convolution1D(128, 3)
    cnn_3 = layers.Convolution1D(128, 3)
    
    global_pool = layers.GlobalMaxPooling1D()
    local_pool = layers.MaxPooling1D(strides=1, pool_size=3)

    cnn_encoding = global_pool(cnn_3(local_pool(cnn_2(local_pool(cnn_1(embedding(message)))))))
    fc1 = layers.Dense(units=128, activation='elu')(cnn_encoding)
    categories = layers.Dense(units=len(label_encoder.classes_), activation='softmax')(fc1)
    model = models.Model(
        inputs=[message],
        outputs=[categories],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [18]:
cnn_model = create_cnn_model(VOCAB_SIZE, embedding_weights=w2v)
cnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
message_vec/embedding (Embeddin (None, None, 300)    15000000    title[0][0]                      
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, None, 128)    115328      message_vec/embedding[0][0]      
__________________________________________________________________________________________________
max_pooling1d_1 (MaxPooling1D)  (None, None, 128)    0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
__________

In [19]:
cnn_model.fit(training_tokens, training_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efbd44f04a8>

In [20]:
cnn_model.evaluate(test_tokens, test_labels)



[4.8914266662597656, 0.252]

LSTM

In [21]:
def create_lstm_model(vocab_size, embedding_size=None, embedding_weights=None):
    message = layers.Input(shape=(None,), dtype='int32', name='title')
    embedding = make_embedding('message_vec', vocab_size, embedding_size, embedding_weights)(message)

    lstm_1 = layers.LSTM(units=128, return_sequences=False)(embedding)
#     lstm_2 = layers.LSTM(units=128, return_sequences=False)(lstm_1)
    category = layers.Dense(units=len(label_encoder.classes_), activation='softmax')(lstm_1)
    
    model = models.Model(
        inputs=[message],
        outputs=[category],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [22]:
lstm_model = create_lstm_model(VOCAB_SIZE, embedding_weights=w2v)
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
title (InputLayer)           (None, None)              0         
_________________________________________________________________
message_vec/embedding (Embed (None, None, 300)         15000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_7 (Dense)              (None, 13)                1677      
Total params: 15,221,325
Trainable params: 221,325
Non-trainable params: 15,000,000
_________________________________________________________________


In [23]:
lstm_model.fit(training_tokens, training_labels, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efcb6463710>

In [24]:
lstm_model.evaluate(test_tokens, test_labels)



[1.9011550903320313, 0.34499999999999997]

模型比较

In [25]:
predictions = {
    'lstm': lstm_model.predict(test_tokens[:100]),
    'char_cnn': char_cnn_model.predict(test_char_vectors[:100]),
    'cnn': cnn_model.predict(test_tokens[:100]),
    'unigram': unigram_model.predict(test_tokens[:100]),
}

NameError: name 'char_cnn_model' is not defined

In [None]:
# Make a dataframe just for test data

pd.options.display.max_colwidth = 128
test_df = emotion_df[training_count:training_count+100].reset_index()
eval_df = pd.DataFrame({
    'content': test_df['content'],
    'true': test_df['sentiment'],
    'lstm': [label_encoder.classes_[np.argmax(x)] for x in predictions['lstm']],
    'cnn': [label_encoder.classes_[np.argmax(x)] for x in predictions['cnn']],
    'char_cnn': [label_encoder.classes_[np.argmax(x)] for x in predictions['char_cnn']],    
    'unigram': [label_encoder.classes_[np.argmax(x)] for x in predictions['unigram']],
})
eval_df = eval_df[['content', 'true', 'lstm', 'cnn', 'char_cnn', 'unigram']]
eval_df.head(10)

In [None]:
eval_df[eval_df['lstm'] != eval_df['true']].head(10)

### Twitter 分析

In [None]:
import twitter
import emoji

In [None]:
# Fill these in!

CONSUMER_KEY = 'xbMuxcJpRTiVGt2C2EYnA'
CONSUMER_SECRET = '2DbQTsvIptkPTdaUcos8DDvQH9fzO0hNjJpUT2uVzQ'
ACCESS_TOKEN = '7319442-EDm4CPxL7W4KkZcGWRMJNVHp88W5OH9vgblu898fg'
ACCESS_SECRET = '5ZxJSbqXhG7uhgXzTFWf9XhkfsxxinlPRXyDTzbA9w'

In [None]:
api = twitter.Twitter(
    auth=twitter.OAuth(
        consumer_key=CONSUMER_KEY,
        consumer_secret=CONSUMER_SECRET,
        token=ACCESS_TOKEN,
        token_secret=ACCESS_SECRET,
    ))

stream = twitter.TwitterStream(
    auth=twitter.OAuth(
        consumer_key=CONSUMER_KEY,
        consumer_secret=CONSUMER_SECRET,
        token=ACCESS_TOKEN,
        token_secret=ACCESS_SECRET,
    ))

In [None]:
import itertools
def has_emoji(tweet):
    if tweet.get('lang') != 'en':
        return False
    return any(ch for ch in tweet.get('text', '') if ch in emoji.UNICODE_EMOJI)

%time st = list(itertools.islice(filter(has_emoji, stream.statuses.sample()), 0, 10))

In [None]:
len(st), [t.get('text', None) for t in st][:10]

## Twitter 表情分析

In [1]:
import random
import twitter
import emoji
import itertools
import pandas as pd
from itertools import chain
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
import keras.callbacks
import json

import os
import nb_utils
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.layers import Merge, LSTM, Embedding, GlobalMaxPooling1D
from keras.models import Model
from keras.layers.merge import Concatenate, Average

from gensim.models import Word2Vec

Using TensorFlow backend.


In [2]:
# Fill these in!

CONSUMER_KEY = 'xbMuxcJpRTiVGt2C2EYnA'
CONSUMER_SECRET = '2DbQTsvIptkPTdaUcos8DDvQH9fzO0hNjJpUT2uVzQ'
ACCESS_TOKEN = '7319442-EDm4CPxL7W4KkZcGWRMJNVHp88W5OH9vgblu898fg'
ACCESS_SECRET = '5ZxJSbqXhG7uhgXzTFWf9XhkfsxxinlPRXyDTzbA9w'

In [3]:
auth=twitter.OAuth(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
    token=ACCESS_TOKEN,
    token_secret=ACCESS_SECRET,
)

status_stream = twitter.TwitterStream(auth=auth).statuses

[x['text'] for x in itertools.islice(status_stream.sample(), 0, 5) if x.get('text')]

URLError: <urlopen error [Errno 110] Connection timed out>

In [None]:
status_stream = twitter.TwitterStream(auth=auth).statuses

def english_has_emoji(tweet):
    if tweet.get('lang') != 'en':
        return False
    return any(ch for ch in tweet.get('text', '') if ch in emoji.UNICODE_EMOJI)

%time tweets = list(itertools.islice(filter(english_has_emoji, status_stream.sample()), 0, 100))

In [None]:
stripped = []
for tweet in tweets:
    text = tweet['text']
    emojis = {ch for ch in text if ch in emoji.UNICODE_EMOJI}
    if len(emojis) == 1:
        emoiji = emojis.pop()
        text = ''.join(ch for ch in text if ch != emoiji)
        stripped.append((text, emoiji))
len(stripped)

### 使用 CNN

In [None]:
all_tweets = pd.read_csv('data/emojis.csv')
all_tweets['emoji'].value_counts()

In [None]:
tweets = all_tweets.groupby('emoji').filter(lambda c:len(c) > 1000)
tweets['emoji'].value_counts()

In [None]:
max(tweets['text'], key=lambda t:len(t))

In [None]:
chars = list(sorted(set(chain(*tweets['text']))))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
max_sequence_len = max(len(x) for x in tweets['text'])

emojis = list(sorted(set(tweets['emoji'])))
emoji_to_idx = {em: idx for idx, em in enumerate(emojis)}
emojis[:10]

train_tweets, test_tweets = train_test_split(tweets, test_size=0.1)

In [None]:
def data_generator(tweets, batch_size):
    while True:
        if batch_size is None:
            batch = tweets
            batch_size = batch.shape[0]
        else:
            batch = tweets.sample(batch_size)
        X = np.zeros((batch_size, max_sequence_len, len(chars)))
        y = np.zeros((batch_size,))
        for row_idx, (_, row) in enumerate(batch.iterrows()):
            y[row_idx] = emoji_to_idx[row['emoji']]
            for ch_idx, ch in enumerate(row['text']):
                X[row_idx, ch_idx, char_to_idx[ch]] = 1
        yield X, y

next(data_generator(tweets, 10))

In [None]:
def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    char_input = Input(shape=(max_sequence_len, num_chars), name='char_cnn_input')
    
    conv_1x = Conv1D(128, 6, activation='relu', padding='valid')(char_input)
    max_pool_1x = MaxPooling1D(4)(conv_1x)
    conv_2x = Conv1D(256, 6, activation='relu', padding='valid')(max_pool_1x)
    max_pool_2x = MaxPooling1D(4)(conv_2x)

    flatten = Flatten()(max_pool_2x)
    dense = Dense(128, activation='relu')(flatten)
    preds = Dense(num_labels, activation='softmax', name='char_cnn_predictions')(dense)

    model = Model(char_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

char_cnn_model = create_char_cnn_model(len(char_to_idx), max_sequence_len, len(emojis))
char_cnn_model.summary()

In [None]:
early = keras.callbacks.EarlyStopping(monitor='loss',
                              min_delta=0.03,
                              patience=2,
                              verbose=0, mode='auto')

BATCH_SIZE = 512
char_cnn_model.fit_generator(
    data_generator(train_tweets, batch_size=BATCH_SIZE),
    epochs=20,
    steps_per_epoch=len(train_tweets) / BATCH_SIZE,
    verbose=2,
    callbacks=[early]
)

In [None]:
char_cnn_model.evaluate_generator(
    data_generator(test_tweets, batch_size=BATCH_SIZE),
    steps=len(test_tweets) / BATCH_SIZE
)

In [None]:
with open('./zoo/07/emoji_chars.json', 'w') as fout:
    json.dump({
        'emojis': ''.join(emojis),
        'char_to_idx': char_to_idx,
        'max_sequence_len': max_sequence_len,
    }, fout)
char_cnn_model.save('./zoo/07/char_cnn_model.h5')
char_cnn_model.save_weights('./zoo/07/char_cnn_model_weights.h5')

In [None]:
pd.options.display.max_colwidth = 128
inspect_tweets = test_tweets.sample(100)
predicted = char_cnn_model.predict_generator(data_generator(inspect_tweets, batch_size=None), steps=1)
show = pd.DataFrame({
    'text': inspect_tweets['text'],
    'true': inspect_tweets['emoji'],
    'pred': [emojis[np.argmax(x)] for x in predicted],
})
show = show[['text', 'true', 'pred']]
show.head(10)

In [None]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Merge, LSTM
from keras.models import Model
from keras.layers.merge import Concatenate

def create_char_cnn_model2(num_chars, max_sequence_len, num_labels, drop_out=0.25):
    char_input = Input(shape=(max_sequence_len, num_chars), name='char_cnn_input')
    
    layers = []
    for window in (4, 5, 6):
        conv_1x = Conv1D(128, window, activation='relu', padding='valid')(char_input)
        max_pool_1x = MaxPooling1D(4)(conv_1x)
        dropout_1x = Dropout(drop_out)(max_pool_1x)
        conv_2x = Conv1D(256, window, activation='relu', padding='valid')(dropout_1x)
        max_pool_2x = MaxPooling1D(4)(conv_2x)
        dropout_2x = Dropout(drop_out)(max_pool_2x)
        layers.append(dropout_2x)

    merged = Concatenate(axis=1)(layers)

    dropout = Dropout(drop_out)(merged)
    
    flatten = Flatten()(dropout)
    dense = Dense(128, activation='relu')(flatten)
    preds = Dense(num_labels, activation='softmax', name='char_cnn_predictions')(dense)

    model = Model(char_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

char_cnn_model2 = create_char_cnn_model2(len(char_to_idx), max_sequence_len, len(emojis))
char_cnn_model2.summary()

In [None]:
BATCH_SIZE = 2048
char_cnn_model2.fit_generator(
    data_generator(train_tweets, batch_size=BATCH_SIZE),
    epochs=30,
    steps_per_epoch=len(train_tweets) / BATCH_SIZE,
    verbose=2,
    callbacks=[early]
)

In [None]:
char_cnn_model2.evaluate_generator(
    data_generator(test_tweets, batch_size=BATCH_SIZE),
    steps=len(test_tweets) / BATCH_SIZE
)

**数据预处理**

In [None]:
VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(tweets['text'])

training_tokens = tokenizer.texts_to_sequences(train_tweets['text'])
test_tokens = tokenizer.texts_to_sequences(test_tweets['text'])
max_num_tokens = max(len(x) for x in chain(training_tokens, test_tokens))
training_tokens = pad_sequences(training_tokens, maxlen=max_num_tokens)
test_tokens = pad_sequences(test_tokens, maxlen=max_num_tokens)

training_labels = np.asarray([emoji_to_idx[em] for em in train_tweets['emoji']])
test_labels = np.asarray([emoji_to_idx[em] for em in test_tweets['emoji']])

In [None]:
def load_weights(tokenizer):
    model = Word2Vec.load('data/twitter_w2v.model')
    w2v = np.zeros((tokenizer.num_words, w2v_model.syn0.shape[1]))
    for k, v in tokenizer.word_index.items():
        if v >= tokenizer.num_words:
            continue
        if k in w2v_model:
            w2v[v] = w2v_model[k]
    return w2v

# This may take a while to load
#w2v = load_weights(tokenizer)
#model = Word2Vec.load('data/twitter_w2v.model')
w2v = np.zeros((tokenizer.num_words, model.wv.syn0.shape[1]))
found = 0
for k, v in tokenizer.word_index.items():
    if v >= tokenizer.num_words:
        continue
    if k in model:
        w2v[v] = model[k]
        found += 1
found, tokenizer.num_words

### 词级别

In [None]:
def create_cnn_model(vocab_size, embedding_size=None, embedding_weights=None, drop_out=0.2):
    message = Input(shape=(max_num_tokens,), dtype='int32', name='cnn_input')
    
    
    # The convolution layer in keras does not support masking, so we just allow
    # the embedding layer to learn an explicit value.
    embedding = Embedding(mask_zero=False, input_dim=vocab_size, 
                          output_dim=embedding_weights.shape[1], 
                          weights=[embedding_weights],
                          trainable=True,
                          name='cnn_embedding')(message)
    
    global_pools = []
    for window in 2, 3:
        conv_1x = Conv1D(128, window, activation='relu', padding='valid')(embedding)
        max_pool_1x = MaxPooling1D(2)(conv_1x)
        conv_2x = Conv1D(256, window, activation='relu', padding='valid')(max_pool_1x)
        max_pool_2x = MaxPooling1D(2)(conv_2x)
        conv_3x = Conv1D(256, window, activation='relu', padding='valid')(max_pool_2x)

        global_pools.append(GlobalMaxPooling1D()(conv_3x))

    merged = Concatenate(axis=1)(global_pools)
    fc1 = Dense(units=128, activation='elu')(merged)
    preds = Dense(units=len(emojis), activation='softmax', name='cnn_predictions')(fc1)
    model = Model(
        inputs=[message],
        outputs=[preds],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

cnn_model = create_cnn_model(VOCAB_SIZE, embedding_weights=w2v)
cnn_model.summary()

In [None]:
cnn_model.fit(training_tokens, training_labels, epochs=5)

In [None]:
def create_lstm_model(vocab_size, embedding_size=None, embedding_weights=None):
    message = Input(shape=(None,), dtype='int32', name='lstm_input')
    embedding = Embedding(mask_zero=False, input_dim=vocab_size, 
                          output_dim=embedding_weights.shape[1], 
                          weights=[embedding_weights],
                          trainable=True,
                          name='lstm_embedding')(message)

    lstm_1 = LSTM(units=128, return_sequences=False)(embedding)
    preds = Dense(units=len(emojis), activation='softmax', name='lstm_predictions')(lstm_1)
    
    model = Model(
        inputs=[message],
        outputs=[preds],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [None]:
lstm_model = create_lstm_model(VOCAB_SIZE, embedding_weights=w2v)
lstm_model.summary()

In [None]:
lstm_model.fit(training_tokens, training_labels, epochs=12, batch_size=1024, callbacks=[early])

In [None]:
lstm_model.evaluate(test_tokens, test_labels)

**模型比较**

In [None]:
test_char_vectors, _ = next(data_generator(test_tweets, None)) 

In [None]:
predictions = {
    label: [emojis[np.argmax(x)] for x in pred]
    for label, pred in (
        ('lstm', lstm_model.predict(test_tokens[:100])),
        ('char_cnn', char_cnn_model.predict(test_char_vectors[:100])),
        ('cnn', cnn_model.predict(test_tokens[:100])),
    )
}

In [None]:
# Make a dataframe just for test data
pd.options.display.max_colwidth = 128
test_df = test_tweets[:100].reset_index()
eval_df = pd.DataFrame({
    'content': test_df['text'],
    'true': test_df['emoji'],
    **predictions
})
eval_df[['content', 'true', 'char_cnn', 'cnn', 'lstm']].head(25)

**定性评估**

In [None]:
eval_df[eval_df['lstm'] != eval_df['true']].head(10)

In [None]:
def combined_data_generator(tweets, tokens, batch_size):
    tweets = tweets.reset_index()
    while True:
        batch_idx = random.sample(range(len(tweets)), batch_size)
        tweet_batch = tweets.iloc[batch_idx]
        token_batch = tokens[batch_idx]
        char_vec = np.zeros((batch_size, max_sequence_len, len(chars)))
        token_vec = np.zeros((batch_size, max_num_tokens))
        y = np.zeros((batch_size,))
        for row_idx, (token_row, (_, tweet_row)) in enumerate(zip(token_batch, tweet_batch.iterrows())):
            y[row_idx] = emoji_to_idx[tweet_row['emoji']]
            for ch_idx, ch in enumerate(tweet_row['text']):
                char_vec[row_idx, ch_idx, char_to_idx[ch]] = 1
            token_vec[row_idx, :] = token_row
        yield {'char_cnn_input': char_vec, 'cnn_input': token_vec, 'lstm_input': token_vec}, y

d, y = next(combined_data_generator(train_tweets, training_tokens, 5))
d['lstm_input'].shape

In [None]:
def prediction_layer(model):
    layers = [layer for layer in model.layers if layer.name.endswith('_predictions')]
    return layers[0].output

def create_ensemble(*models):
    inputs = [model.input for model in models]
    predictions = [prediction_layer(model) for model in models]
    merged = Average()(predictions)
    model = Model(
        inputs=inputs,
        outputs=[merged],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model


ensemble = create_ensemble(char_cnn_model2, cnn_model, lstm_model)
ensemble.summary()

In [None]:
BATCH_SIZE = 512
ensemble.fit_generator(
    combined_data_generator(train_tweets, training_tokens, BATCH_SIZE),
    epochs=20,
    steps_per_epoch=len(train_tweets) / BATCH_SIZE,
    verbose=2,
    callbacks=[early]
)

In [None]:
ensemble.evaluate_generator(
    combined_data_generator(test_tweets, test_tokens, BATCH_SIZE),
    steps=len(test_tweets) / BATCH_SIZE
)

In [None]:
len(train_tweets)

# 句子转换

## seq2seq

In [1]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

import nltk
from nltk.corpus import wordnet as wn
import inflect

from keras.models import Sequential
from keras import layers
import numpy as np
from collections import Counter, defaultdict

from gensim.utils import tokenize
from itertools import groupby

from keras.models import Input, Model
from keras.layers import Dense, Dropout
from keras.layers import LSTM, RepeatVector
from keras.layers.wrappers import TimeDistributed

Using TensorFlow backend.


In [2]:
p = inflect.engine()

pairs = {}
for synset in wn.all_synsets('n'):
    word = synset.name().split('.', 1)[0]
    if not word in pairs:
        pairs[word] = p.plural(word)
len(pairs)

67176

In [3]:
with open('./data/plurals.txt', 'w') as fout:
    for k in sorted(pairs):
        if '_' in k or '-' in k:
            continue
        if k.isdigit():
            continue
        fout.write('%s\t%s\n' % (k, pairs[k]))

In [4]:
p.plural('no')

'noes'

In [5]:
class CharacterTable(object):
    """Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars):
        """Initialize character table.
        # Arguments
            chars: Characters that can appear in the input.
        """
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

    def encode(self, C, num_rows):
        """One hot encode given string C.
        # Arguments
            num_rows: Number of rows in the returned one hot encoding. This is
                used to keep the # of rows for each data the same.
        """
        x = np.zeros((num_rows, len(self.chars)))
        for i, c in enumerate(C):
            x[i, self.char_indices[c]] = 1
        return x

    def decode(self, x, calc_argmax=True):
        if calc_argmax:
            x = x.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in x)
    
class colors:
    ok = '\033[92m'
    fail = '\033[91m'
    close = '\033[0m'

In [6]:
# Parameters for the model and dataset.
INVERT = True

questions = []
expected = []
seen = set()
#with open('data/en_de.txt') as fin:
with open('data/plurals.txt') as fin:
    for line in fin:
        en, de = line.strip().split('\t')
        questions.append(en)
        expected.append(de)

max_question_len = max(len(q) for q in questions)
max_expected_len = max(len(e) for e in expected)
questions = [' ' * (max_question_len - len(q)) + q for q in questions]
expected = [e + ' ' * (max_expected_len - len(e)) for e in expected]
if INVERT:
    questions = [q[::-1] for q in questions]

print('Total addition questions:', len(questions))

Total addition questions: 39929


In [7]:
chars = set(ch for k, v in zip(questions, expected) for ch in k + v)
ctable = CharacterTable(chars)
len(chars)

40

In [8]:
print('Vectorization...')
x = np.zeros((len(questions), max_question_len, len(chars)), dtype=np.bool)
y = np.zeros((len(questions), max_expected_len, len(chars)), dtype=np.bool)
for i, sentence in enumerate(questions):
    x[i] = ctable.encode(sentence, max_question_len)
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, max_expected_len)
print('done')

Vectorization...
done


In [9]:
# Shuffle (x, y) in unison as the later parts of x will almost all be larger
# digits.
indices = np.arange(len(y))
np.random.shuffle(indices)
x = x[indices]
y = y[indices]

# Explicitly set apart 10% for validation data that we never train over.
split_at = len(x) - len(x) // 10
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]

print('Training Data:')
print(x_train.shape)
print(y_train.shape)

print('Validation Data:')
print(x_val.shape)
print(y_val.shape)

Training Data:
(35937, 31, 40)
(35937, 32, 40)
Validation Data:
(3992, 31, 40)
(3992, 32, 40)


In [10]:
# The below is taken from: https://github.com/keras-team/keras/blob/master/examples/addition_rnn.py
RNN = layers.LSTM
HIDDEN_SIZE = 128
LAYERS = 1

print('Build model...')
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE.
# Note: In a situation where your input sequences have a variable length,
# use input_shape=(None, num_feature).
model.add(RNN(HIDDEN_SIZE, input_shape=(max_question_len, len(chars))))
# As the decoder RNN's input, repeatedly provide with the last hidden state of
# RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum
# length of output, e.g., when DIGITS=3, max output is 999+999=1998.
#model.add(layers.Dropout(DROP_OUT))
model.add(layers.RepeatVector(max_expected_len))
# The decoder RNN could be multiple layers stacked or a single layer.
for _ in range(LAYERS):
    # By setting return_sequences to True, return not only the last output but
    # all the outputs so far in the form of (num_samples, timesteps,
    # output_dim). This is necessary as TimeDistributed in the below expects
    # the first dimension to be the timesteps.
    model.add(RNN(HIDDEN_SIZE, return_sequences=True))
#    model.add(layers.Dropout(DROP_OUT))

# Apply a dense layer to the every temporal slice of an input. For each of step
# of the output sequence, decide which character should be chosen.
model.add(layers.TimeDistributed(layers.Dense(len(chars))))
model.add(layers.Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               86528     
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 32, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 32, 128)           131584    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 32, 40)            5160      
_________________________________________________________________
activation_1 (Activation)    (None, 32, 40)            0         
Total params: 223,272
Trainable params: 223,272
Non-trainable params: 0
_________________________________________________________________


In [11]:
def create_seq2seq(num_nodes, num_layers):
    question = Input(shape=(max_question_len, len(chars)), name='question')
    repeat = RepeatVector(max_expected_len)(question)
    prev = question
    for _ in range(num_layers):
        lstm = LSTM(num_nodes, return_sequences=True, name='lstm_layer_%d' % (i + 1))(prev)
        prev = lstm
    dense = TimeDistributed(Dense(num_chars, name='dense', activation='softmax'))(prev)
    model = Model(inputs=[input], outputs=[dense])
    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

seq2seq = create_seq2seq(128, 1)

ValueError: Input 0 is incompatible with layer repeat_vector_2: expected ndim=2, found ndim=3

In [13]:
BATCH_SIZE = 2048

# Train the model each generation and show predictions against the validation
# dataset.
for iteration in range(1, 20):
    model.fit(x_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=10,
              validation_data=(x_val, y_val))
    print()
    print('-' * 50)
    print('Iteration', iteration)
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    for i in range(10):
        ind = np.random.randint(0, len(x_val))
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        preds = model.predict_classes(rowx, verbose=0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print(q[::-1] if INVERT else q, '(%s)' % correct, '-', guess)

Train on 35937 samples, validate on 3992 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 1
                       monarchy (monarchies                      ) - sareeiisgs                      
                          ajuga (ajugas                          ) - saaeas                          
                      brushwood (brushwoods                      ) - careeaieas                      
                     lutjanidae (lutjanidaes                     ) - careeaiises                     
                       centaury (centauries                      ) - sareeiises                      
                          print (prints                          ) - saaets                          
                     karyolysis (karyolyses                      ) - careeaaiises                    
                     commission (commissions                  

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 4
                         ocelot (ocelots                         ) - sariers                         
                       outreach (outreaches                      ) - sareeiiies                      
                         eyecup (eyecups                         ) - sarteas                         
                       gentiana (gentianas                       ) - sareeioas                       
                           utah (utahs                           ) - saale                           
                         bowleg (bowlegs                         ) - sarines                         
                          kamba (kambas                          ) - sareas                          
                         tanakh (tanakhs                         ) - sarteas                         
                        cetacea (cetaceas                       

Epoch 10/10

--------------------------------------------------
Iteration 7
                            pot (pots                            ) - sats                            
                        malamud (malamuds                        ) - sartiers                        
                        fucales (fucale                          ) - sareiias                        
                      eptesicus (eptesicuses                     ) - careeatises                     
                      alchemist (alchemists                      ) - sareetints                      
                        getaway (getaways                        ) - sareiias                        
                        marstan (marstans                        ) - sartiens                        
                          dijon (dijons                          ) - sarens                          
                   seismography (seismographies                  ) - cereoooaaiises                  
      

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 11
                    hypokalemia (hypokalemias                    ) - cereooaaiias                    
                         girdle (girdles                         ) - sariies                         
                      detriment (detriments                      ) - sareetints                      
                     repetition (repetitions                     ) - cereeatiens                     
                          lagos (lago                            ) - sariis                          
                 psittaciformes (psittaciforme                   ) - cereoooiiatiias                 
                    aminopyrine (aminopyrines                    ) - cereoaatises                    
                    requirement (requirements                    ) - cereooatints                    
                   

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--------------------------------------------------
Iteration 14
                   refractivity (refractivities                  ) - aereeeeaatiies                  
                      greenberg (greenbergs                      ) - aereeeints                      
                          grunt (grunts                          ) - saants                          
                         bessel (bessels                         ) - solions                         
                         crater (craters                         ) - tarters                         
                    obliqueness (obliquenesses                   ) - coooooooiises                   
                      tablature (tablatures                      ) - aereetiies                      
                        guthrie (guthries                        ) - sartines                        
                           toea (toeas                         


--------------------------------------------------
Iteration 17
                       krakatau (krakataus                       ) - sareaiias                       
                          malay (malays                          ) - saraas                          
                          swede (swedes                          ) - baaies                          
                       romanism (romanisms                       ) - corssisms                       
                       yerupaja (yerupajas                       ) - sorooiias                       
                           aids (aid                             ) - souhs                           
                     percolator (percolators                     ) - careoatiors                     
                      crabgrass (crabgrasses                     ) - poroosssses                     
                          pagan (pagans                          ) - sarans                          
                 

In [14]:
shakespeare = strip_headers(load_etext(100))
tokens = [tuple(word) for word in tokenize(plays, to_lower=True)]
token_counts = Counter(tokens)

pairs = [(token[i], token[i + 1], token_id) 
         for token_id, token in enumerate(tokens) 
         for i in range(len(token) - 1)]

pairs[10], tokens[5]

NameError: name 'plays' is not defined

## gutenberg

In [15]:
import requests
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from gutenberg.acquire.text import UnknownDownloadUriException
import re
from gensim.utils import tokenize
import random
import nltk
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
import os
import glob
import json

In [16]:
with open('./data/gutenberg_index.json') as fin:
    authors = json.load(fin)
recent = [x for x in authors if 'birthdate' in x and x['birthdate'] > 1830]
[(x['name'], x['birthdate'], x['english_books']) for x in recent[:5]]

FileNotFoundError: [Errno 2] No such file or directory: './data/gutenberg_index.json'

In [17]:
print(list_supported_metadatas())

NameError: name 'list_supported_metadatas' is not defined

In [18]:
PARAGRAPH_SPLIT_RE = re.compile(r'\n *\n+')

def extract_conversations(text, quote='"'):
    paragraphs = PARAGRAPH_SPLIT_RE.split(text.strip())
    conversations = [['']]
    for paragraph in paragraphs:
        chunks = paragraph.replace('\n', ' ').split(quote)
        for i in range((len(chunks) + 1) // 2):
            if (len(chunks[i * 2]) > 100 or len(chunks) == 1) and conversations[-1] != ['']:
                if conversations[-1][-1] == '':
                    del conversations[-1][-1]
                conversations.append([''])
            if i * 2 + 1 < len(chunks):
                chunk = chunks[i * 2 + 1]
                if chunk:
                    if conversations[-1][-1]:
                        if chunk[0] >= 'A' and chunk[0] <= 'Z':
                            if conversations[-1][-1].endswith(','):
                                conversations[-1][-1] = conversations[-1][-1][:-1]
                            conversations[-1][-1] += '.'
                        conversations[-1][-1] += ' '
                    conversations[-1][-1] += chunk
        if conversations[-1][-1]:
            conversations[-1].append('')

    return [x for x in conversations if len(x) > 1]


conversations = extract_conversations(strip_headers(load_etext(10008).strip()))
sum(len(x) for x in conversations)

InvalidSchema: Missing dependencies for SOCKS support.

In [None]:
LATIN_1_CHARS = (
    (u'\xe2\x80\x99', "'"),
    (u'\xc3\xa9', 'e'),
    (u'\xe2\x80\x90', '-'),
    (u'\xe2\x80\x91', '-'),
    (u'\xe2\x80\x92', '-'),
    (u'\xe2\x80\x93', '-'),
    (u'\xe2\x80\x94', '-'),
    (u'\xe2\x80\x94', '-'),
    (u'\xe2\x80\x98', "'"),
    (u'\xe2\x80\x9b', "'"),
    (u'\xe2\x80\x9c', '"'),
    (u'\xe2\x80\x9c', '"'),
    (u'\xe2\x80\x9d', '"'),
    (u'\xe2\x80\x9e', '"'),
    (u'\xe2\x80\x9f', '"'),
    (u'\xe2\x80\xa6', '...'),
    (u'\xe2\x80\xb2', "'"),
    (u'\xe2\x80\xb3', "'"),
    (u'\xe2\x80\xb4', "'"),
    (u'\xe2\x80\xb5', "'"),
    (u'\xe2\x80\xb6', "'"),
    (u'\xe2\x80\xb7', "'"),
    (u'\xe2\x81\xba', "+"),
    (u'\xe2\x81\xbb', "-"),
    (u'\xe2\x81\xbc', "="),
    (u'\xe2\x81\xbd', "("),
    (u'\xe2\x81\xbe', ")")
)

books = 0
for author in recent[:1000]:
    for book in author['books']:
        books += 1
        try:
            txt = strip_headers(load_etext(int(book[0]))).strip()
        except UnknownDownloadUriException:
            continue
        for ch1, ch2 in LATIN_1_CHARS:
            txt = txt.replace(ch1, ch2)
        conversations += extract_conversations(txt)

print(len(conversations), books)