In [None]:
import re, string, json, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm # print progress bar

import nltk
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors, FastText
from PretrainedModel import FastSpelling, get_embedding

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE, ADASYN

In [None]:
from keras.models import Model, load_model
from keras.layers import Input, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, \
                        Concatenate, Dense, SpatialDropout1D, Bidirectional, GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import activations
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Root directory holding the downloaded pretrained embedding files.
# Set the PRETRAINED_MODELS_DIR environment variable to run on another machine;
# when it is unset the original location is used.
folder_path = os.environ.get('PRETRAINED_MODELS_DIR',
                             '/home/babeau/Documents/Altergrad/Pretrained_models')


def _pretrained_spec(name, init_dimension, relative_file):
    """Describe one pretrained embedding source.

    Returns a dict with the source `name`, the size of its vectors
    (`init_dimension`) and the full path of its `file` under `folder_path`.
    """
    return {'name': name,
            'init_dimension': init_dimension,
            'file': folder_path + relative_file}


# .bin file
google_300 = _pretrained_spec('google_300', 300, '/GoogleNews-vectors-negative300.bin.gz')

# .txt file
twitter_200 = _pretrained_spec('twitter_200', 200, '/glove.twitter.27B/glove.twitter.27B.200d.txt')
twitter_100 = _pretrained_spec('twitter_100', 100, '/glove.twitter.27B/glove.twitter.27B.100d.txt')
twitter_50 = _pretrained_spec('twitter_50', 50, '/glove.twitter.27B/glove.twitter.27B.50d.txt')
twitter_25 = _pretrained_spec('twitter_25', 25, '/glove.twitter.27B/glove.twitter.27B.25d.txt')
glove_840B_300 = _pretrained_spec('glove_840B_300', 300, '/glove.840B.300d.txt')
glove_6B_300 = _pretrained_spec('glove_6B_300', 300, '/glove.6B/glove.6B.300d.txt')

# .vec file
wiki_300 = _pretrained_spec('wiki_300', 300, '/wiki-news-300d-1M.vec')
crawl_300 = _pretrained_spec('crawl_300', 300, '/crawl-300d-2M.vec')

In [None]:
# Fit a Keras tokenizer on the raw comments and turn them into integer sequences.
# NOTE: this cell reads `data`, which is only loaded in the PREPROCESSING section
# further down; on a fresh kernel run that cell first.
# num_words is a word count, so pass an int rather than the float 1e5
# (Keras uses it as a size, e.g. as a matrix dimension in texts_to_matrix).
tokenizer = Tokenizer(num_words=100000)
train_text = data['comment_text']

tokenizer.fit_on_texts(train_text.values)
train_sequences = tokenizer.texts_to_sequences(train_text.values)

In [None]:
# The original line was an unfinished attribute access (`tokenizer.`), which is a
# SyntaxError. NOTE(review): showing the vocabulary size is a guess at the intent.
len(tokenizer.word_index)

# PREPROCESSING

In [None]:
# Load the training set from 'train.csv' in the working directory and preview it.
data = pd.read_csv('train.csv')
data.head()

In [None]:
# Raw comment strings, in file order.
x = list(data['comment_text'])
# Every remaining column (everything except the id and the text) is kept as a target.
y = data[[name for name in data.columns if name not in ('id', 'comment_text')]]

In [None]:
# Load the pre-computed inputs from JSON: the vectorized documents and the
# vocabulary -> index mapping.
with open('train.json', 'r') as tokens_fh:
    vectorized_tokens = json.load(tokens_fh)

with open('vocab2index.json', 'r') as vocab_fh:
    vocab2index = json.load(vocab_fh)

In [None]:
# Embedding sources to export; each one is written to its own JSON file.
pretrained_models = [wiki_300, google_300, twitter_200, glove_840B_300, crawl_300]

for spec in tqdm(pretrained_models):
    # A fresh single-entry dict per source, keyed by the source name.
    # The matrix is converted with .tolist() so that json can serialize it.
    embeddings_dict = {spec['name']: get_embedding(spec, vocab2index).tolist()}
    with open('embeddings' + spec['name'] + '.json', 'w') as out_fh:
        json.dump(embeddings_dict, out_fh)

In [None]:
# Read an exported embeddings file back into memory.
# NOTE(review): the export cell above writes 'embeddings<name>.json' files, not
# 'embeddings.json' — confirm this file exists or point to the intended one.
with open('embeddings.json', 'r') as embeddings_fh:
    dic = json.load(embeddings_fh)

# MODEL

In [None]:
# --- convolutional branches ---
filter_sizes = [3, 4]  # kernel size of each branch
nb_branches = 2        # number of branches built; indexes into filter_sizes
nb_filters = 150       # also reused as the GRU width in the architecture cell

# --- regularization / optimization ---
drop_rate = 0.2        # amount of dropout regularization
my_optimizer = 'adam'
batch_size = 512
nb_epochs = 6
my_patience = 2        # for early stopping strategy

# --- inputs ---
# Length of the second vectorized document.
# NOTE(review): presumably every document is padded to this length — confirm.
max_size = len(vectorized_tokens[1])

pretrained_model = wiki_300
embeddings = get_embedding(pretrained_model, vocab2index)
print('Done embedding')

In [None]:
import umap

# Project the embedding matrix down to 64 dimensions with UMAP (fixed random_state).
# NOTE: this overwrites `embeddings`, so re-running the cell reduces an already
# reduced matrix; rebuild `embeddings` first before re-running.
reducer = umap.UMAP(n_components=64, n_neighbors=30, min_dist=0.0, random_state=42)
embeddings = reducer.fit_transform(embeddings)

In [None]:
# = = = = = defining architecture = = = = =
def cnn_branch(n_filters, k_size, d_rate, my_input):
    """Build one convolutional branch on top of `my_input`.

    The branch is Conv1D (ReLU) -> global max pooling -> dropout.

    n_filters: number of convolution filters.
    k_size:    convolution kernel size.
    d_rate:    dropout rate applied to the pooled features.
    my_input:  Keras tensor the branch is attached to.

    Returns the output tensor of the dropout layer.
    """
    convolved = Conv1D(filters=n_filters,
                       kernel_size=k_size,
                       activation='relu')(my_input)
    pooled = GlobalMaxPooling1D()(convolved)
    return Dropout(d_rate)(pooled)

# Integer-encoded documents go in; the sequence length is left unspecified.
doc_ints = Input(shape=(None,))

# Frozen embedding layer initialized from the pretrained matrix.
doc_wv = Embedding(input_dim=embeddings.shape[0],   # vocab size
                   output_dim=embeddings.shape[1],  # dimension of embedding
                   weights=[embeddings],
                   input_length=max_size,
                   trainable=False,
                   )(doc_ints)
doc_wv_dr = SpatialDropout1D(drop_rate)(doc_wv)

# Bidirectional GRU that keeps the whole sequence so the CNN branches can slide over it.
doc_wv_dr = Bidirectional(GRU(nb_filters, return_sequences=True))(doc_wv_dr)

# One convolutional branch per kernel size, concatenated into a single feature vector.
branch_outputs = [cnn_branch(nb_filters, filter_sizes[idx], drop_rate, doc_wv_dr)
                  for idx in range(nb_branches)]
concat = Concatenate()(branch_outputs)

# Six independent sigmoid outputs.
# NOTE(review): presumably one per label column of `y` — confirm.
preds = Dense(units=6, activation='sigmoid')(concat)

model = Model(doc_ints, preds)
# The metric is requested as 'acc' so that its validation key is 'val_acc' on every
# Keras version: since Keras 2.3 a metric is reported under the exact name passed
# here, so 'accuracy' would yield 'val_accuracy' and the early-stopping monitor
# ('val_acc') in the training cell would never find it.
model.compile(loss='binary_crossentropy',
              optimizer=my_optimizer,
              metrics=['acc'])
model.summary()

In [None]:
# = = = = = training = = = = =
# Directory where model checkpoints are written. Set the TOXIC_DATA_DIR
# environment variable to use another location; the default is the original path.
path_to_data = os.environ.get('TOXIC_DATA_DIR', '/home/babeau/Downloads/all/')

# Stop once validation accuracy has not improved for `my_patience` epochs.
# NOTE(review): Keras >= 2.3 reports this metric as 'val_accuracy' when the model is
# compiled with metrics=['accuracy'] — confirm the monitored key exists.
early_stopping = EarlyStopping(monitor='val_acc',
                               patience=my_patience,
                               mode='max')

# os.path.join keeps the path valid whether or not the directory ends with a slash.
checkpoint_path = os.path.join(path_to_data, 'model_' + pretrained_model['name'])
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1,
                               save_best_only=True)

# 75/25 split with a fixed seed.
# NOTE(review): the held-out split is used both as validation data here and for the
# final score in the next cell, so that score is optimistic.
x_train, x_test, y_train, y_test = train_test_split(np.array(vectorized_tokens),
                                                    np.array(y),
                                                    test_size=0.25,
                                                    random_state=100)
model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=nb_epochs,
          validation_data=(x_test, y_test),
          callbacks=[early_stopping, checkpointer])

In [None]:
# Predict on the held-out split and average the ROC AUC over the output columns.
preds = model.predict(x_test)
n_labels = preds.shape[1]
per_label_auc = [roc_auc_score(y_test[:, col], preds[:, col]) for col in range(n_labels)]
mean_score = np.mean(per_label_auc)
mean_score

google 300: 0.9449
glove 300: 0.9542

- clear text
- tokenize
- check whether a word is too long (otherwise the spelling-correction function will run forever): look at words containing more than xxx characters and decide where the cutoff is.
- apply spelling correction over the vocabulary
- vectorize 

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from ripser import Rips
from persim import PersImage

In [2]:
# Dataset sizes: N point clouds in total, split evenly between the two classes,
# each cloud made of N_in_class points.
N = 200
N_per_class = N // 2
N_in_class = 400

def noise(N, scale):
    """Return an (N, 2) array of uniform samples from [0, 1) multiplied by `scale`."""
    unit_points = np.random.random(size=(N, 2))
    return scale * unit_points

def circle(N, scale, offset):
    """Return the points of `datasets.make_circles`, scaled by `scale` and shifted by `offset`.

    Only the coordinates are kept; the labels returned by make_circles are dropped.
    """
    points, _ = datasets.make_circles(n_samples=N, factor=0.4, noise=0.05)
    return offset + scale * points
    
# Class 0: clouds made only of uniform noise.
just_noise = [noise(N_in_class, 150) for _ in range(N_per_class)]

# Class 1: half of each cloud comes from circle(), the other half is uniform noise.
half = N_in_class // 2
with_circle = [
    np.concatenate((circle(half, 50, 70), noise(half, 150)))
    for _ in range(N_per_class)
]

# All clouds, class 0 first.
datas = just_noise + with_circle

# Define labels: 0 for the first N_per_class clouds, 1 for the rest.
labels = np.concatenate((np.zeros(N_per_class), np.ones(N - N_per_class)))

In [None]:
# Random 100 x 300 matrix of uniform samples in [0, 1).
dat = np.random.random(size=(100, 300))

In [None]:
# Stack the list of point clouds into a single array.
# NOTE(review): expected shape is (N, N_in_class, 2) given how the clouds are built — confirm.
datas = np.array(datas)

In [None]:
# Persistence diagrams up to dimension 1 for every point cloud.
rips = Rips(maxdim=1, coeff=2)
diagrams = [rips.fit_transform(cloud) for cloud in datas]
# Take the dimension-1 diagrams from the results above rather than running the
# expensive persistence computation a second time for every cloud.
diagrams_h1 = [dgms[1] for dgms in diagrams]

In [7]:
import gudhi as gd

In [36]:
def pd(data, max_dim):
    """Compute the Rips persistence intervals of `data` in dimension `max_dim` with GUDHI.

    data:    point cloud handed to `gd.RipsComplex`.
    max_dim: homology dimension whose intervals are returned.

    Returns a NumPy array built from the intervals reported by GUDHI.

    NOTE(review): the name `pd` shadows the pandas alias imported at the top of
    the file when both live in the same session.
    """
    # Pass the points by keyword: recent GUDHI releases make the constructor
    # arguments keyword-only, and the keyword form also works on older releases.
    rips_complex = gd.RipsComplex(points=data)
    # Simplices one dimension above max_dim are needed to compute homology in max_dim.
    simplex_tree = rips_complex.create_simplex_tree(max_dimension=max_dim + 1)
    # persistence() has to run before the intervals can be queried.
    simplex_tree.persistence()
    intervals = simplex_tree.persistence_intervals_in_dimension(max_dim)
    return np.array(intervals)

In [4]:
# ripser estimator for diagrams up to dimension 1 with coeff=2, verbose output disabled.
rips = Rips(maxdim=1, coeff=2, verbose=False)

#%timeit rips_complex = rips.fit_transform(datas[0])[1]

In [37]:
# Time the GUDHI-based computation (`pd`) on the first point cloud.
%timeit diag = pd(datas[0], 1)

7.69 s ± 490 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Persistence-image transformer configured with a 20 x 20 pixel grid and spread 1.
pim = PersImage(pixels=[20,20], spread=1)

PersImage(pixels=[20, 20], spread=1, specs=None, kernel_type="gaussian", weighting_type="linear")


In [5]:
# Element 1 of the ripser output for the first point cloud.
# NOTE(review): despite its name, `rips_complex` holds the (birth, death) pairs
# shown in the output, not a complex.
rips_complex = rips.fit_transform(datas[0])[1]
rips_complex

array([[15.65368748, 15.91078663],
       [15.65347672, 16.11473083],
       [15.58119869, 21.68674088],
       [15.02328205, 15.3736515 ],
       [14.58504009, 17.22622108],
       [14.17356873, 14.49266815],
       [13.79939175, 13.86249638],
       [13.65787315, 20.46810722],
       [13.48520184, 17.78808022],
       [13.43519592, 17.19465637],
       [13.17404366, 14.29042912],
       [13.08502197, 14.26892853],
       [13.03124142, 17.8809433 ],
       [12.99820042, 16.16630745],
       [12.95693588, 13.14085865],
       [12.89973259, 12.93458271],
       [12.74729729, 13.86171818],
       [12.52601433, 14.26108265],
       [12.43240738, 14.25903225],
       [11.92043018, 13.24260712],
       [11.9102602 , 13.34253597],
       [11.82876015, 13.01293373],
       [11.82090473, 15.63753986],
       [11.71821022, 17.03198433],
       [11.70853615, 15.48952007],
       [11.69644451, 14.64228058],
       [11.59950733, 15.00278854],
       [11.52294922, 12.39854622],
       [11.46739388,

In [17]:
# Two random point clouds in 30 dimensions (1000 and 100 points), used by the
# timing cells that follow.
x = np.random.random(size=(1000, 30))
y = np.random.random(size=(100, 30))

In [18]:
# Time ripser on the 1000-point cloud.
%timeit dat = rips.fit_transform(x)

3.34 s ± 425 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# Time ripser on the 100-point cloud.
%timeit dat = rips.fit_transform(y)

21.1 ms ± 1.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [33]:
# Dimension-1 persistence intervals of the first point cloud, computed with GUDHI.
diag = pd(datas[0], 1)

In [34]:
# Display the intervals computed in the previous cell.
diag

array([[ 5.6655569 ,  5.74572271],
       [ 6.01805092,  6.15403189],
       [ 6.60014513,  7.23648071],
       [ 6.67489344,  7.32029215],
       [ 7.24392265,  7.54282417],
       [ 6.7607228 ,  7.64173608],
       [ 7.80523457,  7.90956272],
       [ 6.47668362,  7.95961408],
       [ 7.82619119,  8.24530201],
       [ 7.7756888 ,  8.5282489 ],
       [ 6.34498052,  8.99610867],
       [ 7.86844807,  9.02337022],
       [ 8.07536171,  9.10903666],
       [ 8.3220505 ,  9.2057855 ],
       [ 8.8956877 ,  9.23766073],
       [ 9.07963183,  9.40679611],
       [ 7.9629731 ,  9.41500466],
       [ 8.60510196,  9.53078197],
       [ 7.59272934,  9.63921669],
       [ 8.49583276,  9.64301434],
       [ 9.46118947,  9.78634409],
       [ 8.66939703,  9.94430868],
       [ 9.5716821 , 10.0291838 ],
       [ 9.91722801, 10.30908878],
       [ 8.88929172, 10.37045094],
       [ 7.77722712, 10.43154981],
       [ 9.84423702, 10.47531291],
       [ 9.72935217, 10.50455347],
       [ 9.5518981 ,