In [26]:
def remove_numbers(tweet):
    '''Returns tweet that doesn't contain standalone digits'''
    return ' '.join(['<number>' if w.isdigit() else w for w in tweet.split()])

In [28]:
remove_numbers('hello darkness0 1 123 whoa 2020bernie 124124')

'hello darkness0 <number> <number> whoa 2020bernie <number>'

In [49]:
def replace_elong(tweet):
    '''Replaces words with repetitions by <elong>'''
    words = tweet.split()
    corrected = [re.sub(r'(\w)\1{2,}',r'\1', w) for w in words]
    out = []
    for c,w in zip(corrected,words):
        out.append(c)
        if w != c:
            out.append('<elong>')
    return ' '.join(out)

In [51]:
replace_elong('hiiiokii braaaah whatsuuppp hello')

'hiokii <elong> brah <elong> whatsuup <elong> hello'

In [2]:
X,y = load_tweets('../data/clean_train.txt', True)

In [2]:
from collections import Counter

# load data
X, _ = load_tweets('../data/clean_train.txt', True)

# get all unique words with their frequence
counter = Counter()
for tweet in X:
    for word in tweet.split():
        counter[word] += 1
counter = counter.most_common()

In [20]:
vocab = {word: i for i,(word,count) in enumerate(counter) if count >= 5}

In [18]:
from scipy.sparse import *
import numpy as np

In [23]:
# compute coocurence matrix
data, row, col = [], [], []
c = 1
for line in X:
    tokens = [vocab.get(t, -1) for t in line.strip().split()]
    tokens = [t for t in tokens if t >= 0]
    for t in tokens:
        for t2 in tokens:
            data.append(1)
            row.append(t)
            col.append(t2)

    if c % 10000 == 0:
        print('\r%.2f%%' % (c / len(X) * 100), end='')
    c += 1
cooc = coo_matrix((data, (row, col)))
print("\nSumming duplicates")
cooc.sum_duplicates()

99.27%
Summing duplicates


In [22]:
cooc

<15332x15332 sparse matrix of type '<class 'numpy.int32'>'
	with 4567020 stored elements in COOrdinate format>

In [7]:
all_words = set()
for tweet in X:
    for word in tweet.split():
        all_words.add(word)

In [9]:
len(all_words)

92486

In [10]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [14]:
len(y_test)

18133

In [19]:
from utils import *
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.layers import Flatten, Dense

# load data
X, y = load_tweets('../data/clean_train.txt', True)

# split to test and train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# get all unique words
all_words = set()
for tweet in X:
    for word in tweet.split():
        all_words.add(word)
vocab_length = len(all_words)
print('Vocabulary size: %d' % (vocab_length))

# create tokenizer with an extra token for unkown words
tokenizer = Tokenizer(num_words=vocab_length, oov_token=1)
tokenizer.fit_on_texts(X_train)

Using TensorFlow backend.


Vocabulary size: 92486


In [29]:
from keras.models import Sequential

# tokenize data
X_train_tokenized = tokenizer.texts_to_sequences(X_train)

# get longest tweet and pad others with 0s to obtain same length
max_length = max([len(x) for x in X_train])
X_train_padded = pad_sequences(X_train_tokenized, max_length, padding='post')
print('Longest tweet: %d' % (max_length))

# create model
model = Sequential()
embedding_layer = Embedding(vocab_length + 2, 200, input_length=max_length)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Longest tweet: 137





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 137, 200)          18497600  
_________________________________________________________________
flatten_1 (Flatten)          (None, 27400)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 27401     
Total params: 18,525,001
Trainable params: 18,525,001
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
# fit and evaluate the model
model.fit(X_train_padded, y_train, batch_size=32, epochs=3, verbose=1, validation_split=0.2)
loss, accuracy = model.evaluate(X_train_padded, y_train, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Train on 130550 samples, validate on 32638 samples
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 