In [None]:
!pip install googledrivedownloader;
from google_drive_downloader import GoogleDriveDownloader as gdd

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt



In [None]:
def tokenize(sentences):
    r"""Split each sentence into word tokens and mask purely numeric tokens.

    A token is a maximal run of word characters (regex ``\w+``), so
    punctuation and whitespace are dropped. This produces the same tokens as
    ``nltk.RegexpTokenizer(r"\w+")`` while relying only on the standard
    library.

    Args:
        sentences: Iterable of strings, one sentence (title) per item.

    Returns:
        A list holding one list of tokens per input sentence, in input order.
        Every token for which ``str.isnumeric()`` is true is replaced by the
        placeholder ``"<num>"``.

    Raises:
        TypeError: If an item is not a string (for example a NaN coming from
            a pandas column with missing values).
    """
    import re

    # Compile the pattern once instead of rebuilding a tokenizer per sentence.
    word_pattern = re.compile(r"\w+")

    tokenized_sentences = []
    for sent in sentences:
        tokenized_sentences.append(
            ["<num>" if token.isnumeric() else token
             for token in word_pattern.findall(sent)]
        )

    return tokenized_sentences


def token_filter(tokenized_sentences, thresh=5):
    """Replace tokens the vocabulary treats as unknown with ``'<unk>'``.

    A ``Vocabulary`` is built from every token of every sentence with
    ``unk_cutoff=thresh``, and its size is printed. Each token whose
    ``vocab.lookup`` result is the label ``'<UNK>'`` is then replaced by the
    lowercase marker ``'<unk>'``; all other tokens are kept unchanged.

    Args:
        tokenized_sentences: Sequence of token lists. It is traversed twice,
            once to build the vocabulary and once to filter.
        thresh: Value forwarded to ``Vocabulary`` as ``unk_cutoff``.

    Returns:
        A new list of token lists with the same shape as the input.
    """
    # Flatten all sentences into one token stream for counting.
    all_tokens = [token for sentence in tokenized_sentences for token in sentence]

    vocab = Vocabulary(all_tokens, unk_cutoff=thresh)
    print("Length of vocab: {}".format(len(vocab)))

    return [
        ['<unk>' if vocab.lookup(token) == '<UNK>' else token for token in sentence]
        for sentence in tokenized_sentences
    ]

In [None]:
!pip install fasttext
import fasttext
import pandas as pd

def Skipgram(filtered_sentences, ws=3, dim=50):
    """Train an unsupervised fastText skip-gram model and return it.

    Args:
        filtered_sentences: Passed unchanged as the first argument of
            ``fasttext.train_unsupervised``. Despite the name, the caller in
            this notebook passes the path of a text file containing one
            whitespace-separated sentence per line, not a list of sentences.
        ws: Context window size, forwarded as ``ws``.
        dim: Embedding dimensionality, forwarded as ``dim``.

    Returns:
        The trained model object returned by ``fasttext.train_unsupervised``.
    """
    # neg=5 is the number of negative samples drawn per training example.
    return fasttext.train_unsupervised(
        filtered_sentences, model='skipgram', ws=ws, dim=dim, neg=5
    )




Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 13.6MB/s eta 0:00:01[K     |█████████▌                      | 20kB 17.2MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 10.2MB/s eta 0:00:01[K     |███████████████████             | 40kB 9.1MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 5.4MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 6.0MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.4MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3092705 sha256=1f7ccfcd116b8f17520c7516a2f7c9ab55f9d1a820779ac360062451e5057d74
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c1

In [None]:
## Download pre-processed data
# Fetch the zipped dataset from Google Drive into ../preprocessed_data/ and
# extract it (unzip=True); the CSV files read below come from that archive.
gdd.download_file_from_google_drive(file_id='1PUiB33hgTsefasb3D2t920Gu2mINLd4C', dest_path='../preprocessed_data/preprocessed_data.zip', unzip=True)
# The archive itself is not needed once it has been extracted.
!rm ../preprocessed_data/preprocessed_data.zip



# Load the train/test title files; `names` labels the two columns 'title' and 'label'.
# NOTE(review): with `names` given and no `header=`, the first CSV row is read
# as data — confirm the files do not start with a header row.
trn = pd.read_csv('../preprocessed_data/trn_title.csv', delimiter = ',', names=['title','label'])
tst = pd.read_csv('../preprocessed_data/tst_title.csv', delimiter = ',', names=['title','label'])

Downloading 1PUiB33hgTsefasb3D2t920Gu2mINLd4C into ../preprocessed_data/preprocessed_data.zip... Done.
Unzipping...Done.


In [None]:
# Quick load check: show (rows, columns) for the train frame, then the test frame.
for split_frame in (trn, tst):
    print(split_frame.shape)

(30986, 2)
(4428, 2)


In [None]:
!pip install nltk==3.6

import math
import pickle
import urllib.request
from tqdm import tqdm
from os.path import isfile

import nltk
# Fetch the NLTK "punkt" resource into the local nltk_data directory.
# NOTE(review): the cells shown here only use RegexpTokenizer and Vocabulary;
# confirm that punkt is actually required before keeping this download.
nltk.download("punkt")
from nltk import RegexpTokenizer
from nltk.lm import Vocabulary

# Tokenize the raw titles of both splits, then replace rare tokens with '<unk>'
# using a cutoff of 5.
# NOTE(review): each split is filtered against a vocabulary built from that
# split alone, so the test titles are not filtered with the training vocabulary.
trn_sentences = tokenize(list(trn.title))
tst_sentences = tokenize(list(tst.title))

trn_filtered_sentences = token_filter(trn_sentences, thresh=5)
tst_filtered_sentences = token_filter(tst_sentences, thresh=5)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Length of vocab: 9445
Length of vocab: 2106


In [None]:
# Peek at the tokens of the first training title (before rare-token filtering).
first_title_tokens = trn_sentences[0]
print(first_title_tokens)

['WATCH', 'Louis', 'C', 'K', 'NAILS', 'Trump', 'And', 'His', 'unk', 'Supporters', 'During', 'Appearance', 'On', 'Colbert']


In [None]:
def write_token_lines(path, sentences):
    """Write one sentence per line to `path`, tokens separated by single spaces.

    Args:
        path: Destination text file; an existing file is overwritten.
        sentences: Iterable of token lists, one list per output line. An
            empty token list produces an empty line.
    """
    # fastText expects UTF-8 text, so do not rely on the platform's default
    # encoding when titles contain non-ASCII characters.
    with open(path, 'w', encoding='utf-8') as out_file:
        for tokens in sentences:
            out_file.write(" ".join(tokens) + "\n")


write_token_lines('../preprocessed_data/skipgram_train_lines.txt', trn_filtered_sentences)
write_token_lines('../preprocessed_data/skipgram_test_lines.txt', tst_filtered_sentences)

# Train the skip-gram model on the training titles only.
skipgram_model = Skipgram('../preprocessed_data/skipgram_train_lines.txt')

In [None]:
import os


def embed_lines(model, path):
    """Return one sentence vector per line of the text file at `path`.

    Each line is stripped of surrounding whitespace (including the trailing
    newline) before it is passed to ``model.get_sentence_vector``.

    Args:
        model: Object exposing ``get_sentence_vector(text)``.
        path: Text file with one sentence per line.

    Returns:
        A list of the values returned by ``get_sentence_vector``, in file order.
    """
    # The context manager guarantees the file is closed even if embedding fails.
    with open(path, "r", encoding="utf-8") as line_file:
        return [model.get_sentence_vector(line.strip()) for line in line_file]


trn_embeddings = embed_lines(skipgram_model, "../preprocessed_data/skipgram_train_lines.txt")
tst_embeddings = embed_lines(skipgram_model, "../preprocessed_data/skipgram_test_lines.txt")

# Create the output directory if it is missing so the writes below do not fail
# on a fresh checkout.
os.makedirs("../preprocessed_embeddings", exist_ok=True)

with open("../preprocessed_embeddings/skipgram_train_embeddings.pkl", "wb") as pickle_file:
    pickle.dump(trn_embeddings, pickle_file)

with open("../preprocessed_embeddings/skipgram_test_embeddings.pkl", "wb") as pickle_file:
    pickle.dump(tst_embeddings, pickle_file)