In [17]:
from scipy.sparse import *
import numpy as np
import pickle
import random
from sklearn.decomposition import PCA
%matplotlib inline
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from operator import itemgetter
from scipy.sparse import csc_matrix as smatrix
import scipy
from scipy.sparse import *
from keras.models import Sequential
from keras.layers import Dense


Using TensorFlow backend.


In [2]:
# load vocabulary: a pickled dict mapping each word to an integer id
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load a vocab_full.pkl that comes from a trusted source.
with open('vocab_full.pkl', 'rb') as f:
    vocab = pickle.load(f)
# show ten (word, id) pairs as a sanity check
list(vocab.items())[:10]

[('cropper', 65506),
 ('liking', 3224),
 ("bride's", 57532),
 ('hw6500', 95667),
 ('mobster', 83208),
 ('x54', 66729),
 ('hopefullly', 46408),
 ('08:30', 101224),
 ('hahaahaha', 48555),
 ('wishful', 20557)]

In [3]:
# invert the vocabulary: word id -> word
reverse_dictionary = {number: word for word, number in vocab.items()}

In [4]:
# file -> [[word_number_1_1, ..., word_number_1_K1], ..., [word_number_L_1, ..., word_number_L_KL]]
def file_to_word2numbers(filename):
    """Convert a text file into per-line lists of vocabulary ids.

    Each line is stripped, decoded as UTF-8 and split on single spaces.
    Tokens present in the module-level ``vocab`` are replaced by their id,
    tokens missing from ``vocab`` are dropped, and lines left without any
    known token are skipped, so the result may be shorter than the file.

    Parameters
    ----------
    filename : path of the file to read (opened in binary mode).

    Returns
    -------
    list of lists of ids, one inner list per kept line.
    """
    word2numbers_all = []
    # `with` guarantees the handle is closed, even if decoding fails midway
    with open(filename, 'rb') as data:
        for line in tqdm(data):
            tokens = line.strip().decode("utf-8").split(' ')
            word2numbers = [vocab[word] for word in tokens if word in vocab]
            if word2numbers:
                word2numbers_all.append(word2numbers)
    return word2numbers_all

In [5]:
# loading data -> numbers of words
# one list of vocabulary ids per kept line; lines without any
# in-vocabulary word are dropped by file_to_word2numbers
pos_numbers = file_to_word2numbers('../data/train_pos_full.txt')
neg_numbers = file_to_word2numbers('../data/train_neg_full.txt')

1250000it [00:12, 99856.78it/s] 
1250000it [00:14, 83873.98it/s]


In [6]:
# global counter: word id -> number of occurrences (filled by word_frequency_update)
word_frequency = {}

In [7]:
def word_frequency_update(numbers):
    """Add the word-id counts found in ``numbers`` to ``word_frequency``.

    ``numbers`` is a sequence of tweets, each a sequence of word ids.
    The module-level dict is updated in place, so repeated calls accumulate.
    """
    for _, ids in tqdm(enumerate(numbers)):
        for word_id in ids:
            word_frequency[word_id] = word_frequency.get(word_id, 0) + 1

In [8]:
# Start from empty counts so that re-running this cell does not double-count
# (word_frequency_update accumulates into the global dict).
word_frequency.clear()
word_frequency_update(pos_numbers)
word_frequency_update(neg_numbers)

1249957it [00:04, 306819.01it/s]
1249964it [00:05, 231541.87it/s]


In [9]:
# number of word occurences as embeddings (basic embeddings)
def numbers_to_dataset(numbers):
    """Build a sparse count matrix with one row per tweet.

    Entry (i, j) holds how many times word id ``j`` occurs in tweet ``i``.
    The result is a ``coo_matrix`` of shape ``(len(numbers), len(vocab))``.
    """
    # (row, word id) -> occurrence count
    counts = {}
    for row, tweet in tqdm(enumerate(numbers)):
        for number in tweet:
            cell = (row, number)
            counts[cell] = counts.get(cell, 0) + 1

    # unpack the dict into the parallel lists coo_matrix expects
    rows, cols, values = [], [], []
    for (row, col), count in counts.items():
        rows.append(row)
        cols.append(col)
        values.append(count)
    return coo_matrix((values, (rows, cols)), shape=(len(numbers), len(vocab)))

In [10]:
# applying it to numbers
# each result is a sparse count matrix with len(vocab) columns
pos_data = numbers_to_dataset(pos_numbers)
neg_data = numbers_to_dataset(neg_numbers)

1249957it [00:13, 89308.62it/s]
1249964it [00:17, 73399.77it/s]


In [11]:
# constructing X, y pair
def two_datasets_to_one(pos_data, neg_data):
    """Stack positive and negative samples into a single ``(X, y)`` pair.

    Rows of ``pos_data`` come first and are labelled 1; rows of ``neg_data``
    follow and are labelled 0. Both inputs must have the same column count.
    """
    n_pos = pos_data.shape[0]
    n_features = pos_data.shape[1]
    assert n_features == neg_data.shape[1]
    n_neg = neg_data.shape[0]

    X = scipy.sparse.vstack((pos_data, neg_data))
    y = np.array([1] * n_pos + [0] * n_neg)

    # sanity checks on the stacked result
    assert len(y) == X.shape[0]
    assert X.shape[0] == n_pos + n_neg
    assert X.shape[1] == n_features
    return X, y

In [20]:
# applying to datasets (pos & neg)
# X stacks positive rows first, then negative rows; Y holds the 1/0 labels
X, Y = two_datasets_to_one(pos_data, neg_data)

In [21]:
from sklearn.model_selection import train_test_split
# hold out 1% of the samples for validation; fixed random_state for a reproducible split
x, x_val, y, y_val = train_test_split(X, Y, test_size=0.01, random_state=42)

In [28]:
# quick look at the training labels produced by the split
y

array([0, 0, 1, ..., 0, 0, 0])

In [39]:
def batch_generator(X, y, batch_size):
    """Endlessly yield shuffled, densified ``(X_batch, y_batch)`` mini-batches.

    Parameters
    ----------
    X : row-indexable scipy sparse matrix (e.g. CSR), one sample per row.
    y : array-like of labels with one entry per row of ``X``.
    batch_size : positive int, maximum number of samples per batch.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        Dense rows of ``X`` and the matching labels. One epoch consists of
        ``ceil(len(y) / batch_size)`` batches covering every sample exactly
        once (the last batch may be smaller); the sample order is reshuffled
        with ``np.random.shuffle`` after every epoch.

    Raises
    ------
    ValueError
        On the first ``next()`` call, if ``batch_size`` is not positive, the
        data is empty, or ``X`` and ``y`` disagree on the number of samples.
    """
    y = np.asarray(y)
    n_samples = y.shape[0]
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got %r" % (batch_size,))
    if n_samples == 0:
        raise ValueError("cannot generate batches from empty data")
    if X.shape[0] != n_samples:
        raise ValueError("X has %d rows but y has %d labels" % (X.shape[0], n_samples))

    # Derived locally instead of read from a notebook-level global, so the
    # generator does not depend on another cell having been run first.
    number_of_batches = -(-n_samples // batch_size)  # ceiling division

    # Only the index array is shuffled; the sparse matrix itself is never
    # copied, rows are picked through the permutation when a batch is built.
    shuffle_index = np.arange(n_samples)
    np.random.shuffle(shuffle_index)

    counter = 0
    while True:
        index_batch = shuffle_index[batch_size * counter:batch_size * (counter + 1)]
        # densify only the current batch
        X_batch = X[index_batch, :].toarray()
        y_batch = y[index_batch]
        counter += 1
        yield X_batch, y_batch
        if counter >= number_of_batches:
            # epoch finished: draw a new sample order and start over
            np.random.shuffle(shuffle_index)
            counter = 0


In [40]:
# logistic regression (single sigmoid unit) with L2 regularization,
# trained on dense mini-batches drawn from the sparse training matrix
from keras.regularizers import l2

reg = l2(0.01)

model = Sequential()
model.add(Dense(1, activation='sigmoid', kernel_regularizer=reg, input_dim=x.shape[1]))
model.compile(optimizer='rmsprop', loss='binary_crossentropy')

batch_size = 1000
nb_epoch = 10
# Keras expects an integer step count; ceiling division so the final,
# smaller batch of each epoch is counted as a step too.
steps_per_epoch = -(-x.shape[0] // batch_size)
generator = batch_generator(x, y, batch_size)
model.fit_generator(generator=generator, epochs=nb_epoch,
                    steps_per_epoch=steps_per_epoch)

Epoch 1/10
   5/2474 [..............................] - ETA: 1:15:48 - loss: 0.6934

KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out split. The previous version paired the full
# matrix X with the training labels y, whose lengths differ after
# train_test_split.
# NOTE(review): `clf` is not defined in any visible cell; this assumes a
# fitted scikit-learn style classifier (score / predict_proba) -- confirm.
print(clf.score(x_val, y_val))

# column 1 of predict_proba is taken as the score of the positive class
val_scores = clf.predict_proba(x_val)[:, 1]
fpr, tpr, _ = roc_curve(y_val, val_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# read the test file and build word-count features for prediction
def test_to_dataset(filename):
    """Read an ``id,text`` file into a list of ids and a dense count matrix.

    Each line is stripped, decoded as UTF-8 and split once on the first
    comma into an id and the text. The text is split on single spaces and
    every token found in the module-level ``vocab`` adds 1 to the column
    given by its vocabulary id; unknown tokens are ignored.

    Parameters
    ----------
    filename : path of the file to read (opened in binary mode).

    Returns
    -------
    idxes : list of str
        Ids in file order.
    tweets_embeddings : numpy.ndarray, float32, shape (n_lines, len(vocab))
        Row ``i`` holds the word counts of line ``i``.

    Raises
    ------
    ValueError
        If a line contains no comma.
    """
    idxes = []
    tweets_numbers = []
    # `with` guarantees the handle is closed, even if a line is malformed
    with open(filename, 'rb') as data:
        for line in tqdm(data):
            idx, text = line.strip().decode("utf-8").split(',', 1)
            idxes.append(idx)
            tweets_numbers.append([vocab[word] for word in text.split(' ') if word in vocab])

    # Allocate the dense matrix once instead of stacking per-line vectors,
    # which held the data twice; this also gives an empty file a well-formed
    # (0, len(vocab)) result.
    tweets_embeddings = np.zeros((len(idxes), len(vocab)), dtype=np.float32)
    for row, numbers in enumerate(tweets_numbers):
        for number in numbers:
            tweets_embeddings[row, number] += 1

    return idxes, tweets_embeddings

In [None]:
# ids (as strings) and dense word-count features of the test file
idx_test, X_test = test_to_dataset('../data/test_data.txt')

In [None]:
# rescale predictions p -> 2 * (p - 0.5) and cast to int64
# (assumes clf.predict returns 0/1 labels, giving -1/+1 -- TODO confirm)
y_predicted = (2 * (clf.predict(X_test) - 0.5)).astype(np.int64)

In [None]:
# order the (id, prediction) pairs by numeric id; ids are stored as strings
answers = sorted(zip(idx_test, y_predicted), key=lambda pair: int(pair[0]))

In [None]:
# write the submission CSV; `with` closes the file even if a write fails
with open('submission_count_full.txt', 'w') as f:
    f.write("Id,Prediction\n")
    for idx, ans in answers:
        f.write("%s,%s\n" % (idx, ans))