In [51]:
!pip install emoji



In [52]:
import re
from nltk import word_tokenize
import emoji
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import one_hot
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
import numpy as np

In [53]:
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [54]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [55]:
paths = ["./pickup_lines_all.txt",]
for path in paths:
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        raw_txt = f.read()

    # with open("./text.txt", 'a', encoding='utf-8', errors='ignore') as t:
    #     for txt in raw_txt.splitlines():
    #         t.write(txt)
    #         t.write('\n')

In [56]:
def load_interjection(path):
  interjection = []
  with open(path, 'r', encoding='utf-8', errors = 'ignore') as f:
    txt = f.read().splitlines()
    for word in txt:
      interjection.append(word)
      return interjection

In [57]:
def clean_text(text):
    # 1. Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # 2. Remove sequences of multiple dots like ... or .... etc
    text = re.sub(r'\.', '', text)

    # 3. Remove all punctuation except single full stops
    # Keep letters, numbers, spaces, and single periods
    text = re.sub(r'[^\w\s.]', '', text)

    # 4. Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # 5. Tokenize
    tokens = word_tokenize(text)

    return tokens

In [58]:
txt = open("./pickup_lines_all.txt", 'r', encoding='utf-8', errors='ignore').read().split('\n')

In [59]:
tokenized_txt = []
for line in txt:
    tokenized_txt.append(clean_text(line))

In [60]:
all_words = []
for sentence in tokenized_txt:
  for word in sentence:
    all_words.append(word.lower())
all_words= set(all_words)

In [61]:
word_dict = {}
for i, word in enumerate(sorted(all_words)):
  word_dict[word] = i+1
i_to_w = {v: k for k, v in word_dict.items()}

In [62]:
len(word_dict)

2064

In [63]:
word_dict

{'1': 1,
 '10': 2,
 '100': 3,
 '101': 4,
 '105': 5,
 '106': 6,
 '118': 7,
 '12': 8,
 '120': 9,
 '122': 10,
 '17': 11,
 '19': 12,
 '20': 13,
 '2025': 14,
 '2025read': 15,
 '25': 16,
 '27': 17,
 '3': 18,
 '30': 19,
 '31': 20,
 '33': 21,
 '45degree': 22,
 '4th': 23,
 '5': 24,
 '50': 25,
 '6': 26,
 '60': 27,
 '65': 28,
 '7': 29,
 '70': 30,
 '73': 31,
 '75': 32,
 '83': 33,
 '90': 34,
 '90s': 35,
 '96': 36,
 'a': 37,
 'abducted': 38,
 'ablaze': 39,
 'able': 40,
 'about': 41,
 'abraham': 42,
 'absolutely': 43,
 'ace': 44,
 'aces': 45,
 'aching': 46,
 'across': 47,
 'activity': 48,
 'acts': 49,
 'actually': 50,
 'acute': 51,
 'acutie': 52,
 'add': 53,
 'additionally': 54,
 'admire': 55,
 'admire270': 56,
 'admirelove': 57,
 'admirely': 58,
 'admirer': 59,
 'adopt': 60,
 'adore': 61,
 'adore270': 62,
 'adorelove': 63,
 'adorely': 64,
 'adventure': 65,
 'adventurous': 66,
 'af': 67,
 'affection': 68,
 'affectionsand': 69,
 'africa': 70,
 'after': 71,
 'afterward': 72,
 'again': 73,
 'against': 7

In [64]:
num_representation  = []
for sentence in tokenized_txt:
  num_sent = []
  for word in sentence:
    num_sent.append(word_dict[word.lower()])
  num_representation.append(num_sent)

In [65]:
num_representation[:2]

[[528,
  2053,
  820,
  37,
  1306,
  180,
  1533,
  2053,
  817,
  758,
  1117,
  37,
  2000,
  1207,
  1015,
  1238,
  1029],
 [1537,
  1117,
  37,
  1396,
  538,
  1231,
  2058,
  1666,
  103,
  906,
  19,
  1529,
  897,
  528,
  1794,
  1502,
  99,
  1969,
  297,
  387,
  1257,
  1106]]

In [66]:
input_sequence = []
for sentence in num_representation:
  for i in range(1, len(sentence)):
    input_sequence.append(sentence[0:i+1])
input_sequence[:6]

[[528, 2053],
 [528, 2053, 820],
 [528, 2053, 820, 37],
 [528, 2053, 820, 37, 1306],
 [528, 2053, 820, 37, 1306, 180],
 [528, 2053, 820, 37, 1306, 180, 1533]]

In [67]:
max_len = max([len(x) for x in input_sequence ])

In [68]:
max_len

40

In [69]:
input_sequence_tensor= [torch.tensor(seq) for seq in input_sequence]
padded_input_sequence = pad_sequence(input_sequence_tensor, padding_side="left", padding_value=0, batch_first=True)

In [70]:
padded_input_sequence.shape

torch.Size([124905, 40])

In [71]:
padded_input_sequence[0]

tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,  528, 2053])

In [72]:
padded_input_sequence.shape

torch.Size([124905, 40])

In [73]:
X = padded_input_sequence[:, :-1]
y = padded_input_sequence[:, -1:]
X.shape

torch.Size([124905, 39])

In [74]:
y = torch.squeeze(one_hot(y, num_classes=len(word_dict)+1))
y.shape

torch.Size([124905, 2065])

In [75]:
y[0]

tensor([0, 0, 0,  ..., 0, 0, 0])

In [76]:
class mydata(Dataset):
  def __init__(self, X, y):
    super().__init__()
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

In [77]:
data = mydata(X, y)
batched_data = DataLoader(data, batch_size=128, shuffle=True)

In [78]:
class next_word_predictor(nn.Module):
  def __init__(self):
    super().__init__()
    self.embd = nn.Embedding(len(word_dict)+1, 100)
    self.do = nn.Dropout(0.2)
    self.lstm = nn.LSTM(input_size=100, hidden_size=150, num_layers=2, batch_first=True)
    self.fc1 = nn.Linear(150, len(word_dict)+1)
    self.fc2 = nn.Linear(len(word_dict)+1, len(word_dict)+1)

    # self.sf = nn.Softmax(dim = 1)


  def forward(self, x):
    x = self.embd(x)
    x = self.do(x)
    h, c = self.lstm(x)
    y = self.fc1(h[:, -1, :])
    y = self.fc2(y)
    # y = self.sf(y)
    return(y)

In [79]:
model = next_word_predictor()
model.to(device)
model.eval()

next_word_predictor(
  (embd): Embedding(2065, 100)
  (do): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(100, 150, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=150, out_features=2065, bias=True)
  (fc2): Linear(in_features=2065, out_features=2065, bias=True)
)

In [80]:
def train(epochs = 50):
  model.train()
  loss_fn = nn.CrossEntropyLoss()
  optim = torch.optim.Adam(model.parameters(), lr=0.001)

  for epoch in range(epochs):
    for batch in batched_data:
      losses = []
      X, y = batch
      X, y = X.to(device), y.to(device)
      y_pred = model(X)

      loss = loss_fn(y_pred, torch.argmax(y, dim=1))

      loss.backward()
      losses.append(loss.item())
      optim.step()
      optim.zero_grad()

    print(f"Epoch: {epoch}, Loss: {np.mean(losses)}")

In [81]:
train()

Epoch: 0, Loss: 0.792807936668396
Epoch: 1, Loss: 0.7821967601776123
Epoch: 2, Loss: 0.8900249600410461
Epoch: 3, Loss: 0.7868149280548096
Epoch: 4, Loss: 0.7283309698104858
Epoch: 5, Loss: 0.5019057393074036
Epoch: 6, Loss: 0.7960450649261475
Epoch: 7, Loss: 0.39992251992225647
Epoch: 8, Loss: 0.6067790985107422
Epoch: 9, Loss: 0.5627151131629944
Epoch: 10, Loss: 0.5698567628860474
Epoch: 11, Loss: 0.5483987331390381
Epoch: 12, Loss: 0.3848839998245239
Epoch: 13, Loss: 0.6082321405410767
Epoch: 14, Loss: 0.5442816019058228
Epoch: 15, Loss: 0.5714972019195557
Epoch: 16, Loss: 0.6952379941940308
Epoch: 17, Loss: 0.5653643608093262
Epoch: 18, Loss: 0.36873525381088257
Epoch: 19, Loss: 0.3148738741874695
Epoch: 20, Loss: 0.4792240560054779
Epoch: 21, Loss: 0.612885057926178
Epoch: 22, Loss: 0.6372615098953247
Epoch: 23, Loss: 0.5257134437561035
Epoch: 24, Loss: 0.500193178653717
Epoch: 25, Loss: 0.3640625774860382
Epoch: 26, Loss: 0.5971618890762329
Epoch: 27, Loss: 0.4665842652320862
Epo

In [90]:
word = 'girl'
num_word = word_dict[word]
num_word = torch.tensor(num_word).to(device)
num_word = num_word.unsqueeze(0)

In [91]:
import torch.nn.functional as F

In [92]:
padded_word = (F.pad(num_word, (max_len-(len(num_word)+1), 0), value=0)).reshape(1, 39)
padded_word.shape

torch.Size([1, 39])

In [93]:
pred = model(padded_word)

In [94]:
pred.shape

torch.Size([1, 2065])

In [95]:
torch.argmax(pred)


tensor(122, device='cuda:0')

In [96]:
print(i_to_w[torch.argmax(pred).item()])

are


In [98]:
import time
sentence = 'why you'
for i in range(3):
  sent_word = []
  for words in sentence.split(' '):
    num_word = word_dict[words]
    sent_word.append(num_word)

    padded_word = (F.pad(torch.tensor(sent_word).to(device), (max_len-(len(sent_word)+1), 0), value=0)).reshape(1, 39)
    pred = model(padded_word)
    time.sleep(1)
    sentence = sentence + " " + (i_to_w[torch.argmax(pred).item()])
    print(sentence)

why you pay
why you pay know
why you pay know pay
why you pay know pay seen
why you pay know pay seen for
why you pay know pay seen for are
why you pay know pay seen for are pay
why you pay know pay seen for are pay have
why you pay know pay seen for are pay have to
why you pay know pay seen for are pay have to a
why you pay know pay seen for are pay have to a from
why you pay know pay seen for are pay have to a from you
why you pay know pay seen for are pay have to a from you me
why you pay know pay seen for are pay have to a from you me we
