In [134]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from nltk.tokenize import TweetTokenizer, word_tokenize
import re
from collections import defaultdict

# Find hyperlinks in a string

In [121]:
def findUrl(string):
    """Look for the first URL-like substring inside ``string``.

    The pattern is case-insensitive and accepts text that starts with
    ``http://`` / ``https://``, with ``www`` (optionally followed by up to
    three digits) and a dot, or with a ``domain.tld/`` prefix.

    Parameters
    ----------
    string : str
        Text to scan.

    Returns
    -------
    re.Match or None
        The match object returned by ``re.search`` for the first hit, or
        ``None`` when nothing matches. Callers can therefore use the
        result directly in a truth test.
    """
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    # Only the first occurrence is located (re.search, not re.findall).
    return re.search(url_pattern, string)

# Data Loader / Tokenizer

In [147]:
# Load the training tweets and set up the tokenization options.
train_data = pd.read_csv('../train.csv')
tokenizer = TweetTokenizer()
hashtag = True  # True: keep each hashtag as its own token; False: emit '<HASH>' instead
wordcount = defaultdict(int)  # token -> number of occurrences across all tweets
vocab_size = 500

lines = []   # one token list per tweet, wrapped in '<START>' ... '<END>'
maxlen = 0   # length of the longest token list, markers included
for data in train_data['Tweet']:
    line = ['<START>']

    for token in tokenizer.tokenize(data.lower()):
        # Decide which token(s) this raw tweet token contributes.
        if findUrl(token):
            pieces = ['<URL>']
        elif token[0] == '#':
            pieces = [token] if hashtag else ['<HASH>']
        else:
            # Everything else gets a second pass through word_tokenize,
            # which may split it into several tokens.
            pieces = word_tokenize(token)

        # Append and count in emission order; the later frequency sort is
        # stable, so first-seen order decides ties.
        for piece in pieces:
            line.append(piece)
            wordcount[piece] += 1

    line.append('<END>')
    lines.append(line)
    maxlen = max(maxlen, len(line))

# The markers are not counted inside the loop; each occurs once per tweet.
num_tweets = len(train_data['Tweet'])
wordcount['<START>'] = num_tweets
wordcount['<END>'] = num_tweets

# Rank every token by corpus frequency, most frequent first. sorted() is
# stable, so equally frequent tokens keep the order in which they were first
# inserted into wordcount.
sorted_wordcounts = sorted(wordcount.items(), key=lambda item: item[1], reverse=True)

word2ind = {}   # token -> integer id
ind2word = {}   # integer id -> token (inverse of word2ind for decoding)

# Index layout:
#   0                    -> never assigned here (the encoding step pads with 0)
#   1 .. vocab_size - 1  -> the vocab_size - 1 most frequent tokens
#   vocab_size           -> single shared id for every remaining (rare) token
ind = 1
for k, v in sorted_wordcounts[:vocab_size - 1]:
    word2ind[k] = ind
    ind2word[ind] = k
    ind += 1

rare_wordcounts = sorted_wordcounts[vocab_size - 1:]
for k, v in rare_wordcounts:
    word2ind[k] = vocab_size

# The reverse entry for the shared rare-token id must sit at vocab_size, the
# same id word2ind hands out. Storing it at vocab_size - 1 would overwrite the
# last in-vocabulary word and leave id vocab_size impossible to decode.
# As before, the entry is only created when at least one rare token exists.
if rare_wordcounts:
    ind2word[vocab_size] = '<UKN>'

# Encode every tokenized tweet as a fixed-length list of integer ids.
X = []

for line in lines:
    # Look up the id of each token in the vocabulary mapping.
    ind_line = [word2ind[word] for word in line]

    # Right-pad with 0 up to the longest sequence; for a sequence that is
    # already maxlen long the pad list is empty and nothing is added.
    ind_line.extend([0] * (maxlen - len(ind_line)))

    X.append(ind_line)


In [148]:
# Sanity check: show the first tweet as tokens, then as padded integer ids.
print(lines[0], X[0], sep='\n')


['<START>', 'it', "'s", 'the', 'everything', 'else', 'that', "'s", 'complicated', '.', '#pesummit', '#pxpic', '<URL>', '<END>']
[1, 26, 21, 5, 354, 500, 29, 21, 500, 4, 500, 500, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


list