In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torch.utils.data import Dataset, DataLoader
import transformers as T
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import math

In [2]:
dataset = pd.read_csv('/Users/desidero/Desktop/Kodlar/NLP/Psychology/train.csv')
print(dataset.iloc[1,:].values)
print(dataset.iloc[2,:].values)
print(len(dataset))
dataset.head()

["I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?"
 'Hello, and thank you for your question and seeking advice on this. Feelings of worthlessness is unfortunately common. In fact, most people, if not all, have felt this to some degree at some point in their life. You are not alone.\xa0Changing our feelings is like changing our thoughts - it\'s hard to do. Our minds are so amazing that the minute you change your thought another one can be right there to take it\'s place. Without your permission, another thought can just pop in there. The new thought may feel worse than the last one! My guess is that you have tried several things to improve this on your own even before reaching out on here. People often try th

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [3]:
class Preprocess():

    def __init__(self, df_path):
        self.data = self.drop(pd.read_csv(df_path))
        self.context = [self.clean_text(i) for i in self.data.iloc[:, 0].values]
        self.response = [self.clean_text(i) for i in self.data.iloc[:, 1].values]

    def __len__(self):
        return len(self.data)

    def drop(self, df):
        return df.dropna(axis=0)

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r"i'm", "i am", text) # replace "i'm" with "i am"
        text = re.sub(r"im", "i am", text)
        text = re.sub(r"ive", "i have", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"that's", "that is", text)
        text = re.sub(r"what's", "what is", text)
        text = re.sub(r"where's", "where is", text)
        text = re.sub(r"how's", "how is", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"n't", "not", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"wont", "will not", text)
        text = re.sub(r"won t", "will not", text)
        text = re.sub(r"didn't", "did not", text)
        text = re.sub(r"didnt", "did not", text)
        text = re.sub(r"didn t", "did not", text)
        text = re.sub(r"can't", "cannot", text)
        text = re.sub(r"cant", "cannot", text)
        text = re.sub(r"can t", "cannot", text)
        #text = re.sub(r"[-()\"#/@:<>{}+=~|.?,!]", "", text)
        text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,!,...]", "", text)
        return text 
    
path = '/Users/desidero/Desktop/Kodlar/NLP/Psychology/train.csv'
pre = Preprocess(path)
print(pre.context[1])
print(pre.response[1])

i am going through some things with my feelings and myself i barely sleep and i do nothing but think about how i am worthless and how i shouldnot be here
   i have never tried or contemplated suicide i have always wanted to fix my issues but i never get around to it
   how can i change my feeling of being worthless to everyone
hello and thank you for your question and seeking advice on this feelings of worthlessness is unfortunately common in fact most people if not all have felt this to some degree at some point in their life you are not alone changing our feelings is like changing our thoughts  it's hard to do our minds are so amazing that the minute you change your thought another one can be right there to take it's place without your permission another thought can just pop in there the new thought may feel worse than the last one my guess is that you have tried several things to i amprove this on your own even before reaching out on here people often try thinking positi have though

In [4]:
class PsychologyDataset():
    
    def __init__(self, path, tokenizer, max_length):
        self.pre = Preprocess(path)
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.pre.context)
    
    def __getitem__(self, idx):
        text = self.pre.context[idx]
        answer = self.pre.response[idx]
        #encoding = self.tokenizer(text, answer, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')   

        input_encoding = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = input_encoding.input_ids.flatten()
        attention_mask = input_encoding.attention_mask.flatten()
        
        # Tokenize output answer
        output_encoding = self.tokenizer(answer, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        output = output_encoding.input_ids.flatten()
        output_attention_mask = output_encoding.attention_mask.flatten()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'outputs': output,
            'output_attention_mask': output_attention_mask 
        }

In [5]:
batch_size = 32
tokenizer = T.AutoTokenizer.from_pretrained('t5-small')
data = PsychologyDataset(path, tokenizer, max_length=512)
dataloader = DataLoader(data, batch_size=batch_size)

In [6]:
print(tokenizer.vocab_size)
print(int(tokenizer.vocab_size ** 0.5))
print(tokenizer.model_max_length)

32100
179
512


In [7]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=0.1)
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [8]:
class TransformerModel(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(TransformerModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim, max_seq_length=512)
        self.transformer = nn.Transformer(d_model=embedding_dim, nhead=8, num_encoder_layers=6,
                                           num_decoder_layers=6, dim_feedforward=512)
        
        self.fc_out = nn.Linear(embedding_dim, vocab_size)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc_out.bias.data.zero_()
        self.fc_out.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, tgt):
        src_embedded = self.embedding(src) * math.sqrt(self.embedding_dim)
        src_embedded = self.positional_encoding(src_embedded)
        
        tgt_embedded = self.embedding(tgt) * math.sqrt(self.embedding_dim)
        tgt_embedded = self.positional_encoding(tgt_embedded)
        
        output = self.transformer(src_embedded, tgt_embedded)
        output = self.fc_out(output)
        return output

In [9]:
device = torch.device('mps')
#vocab_size = tokenizer.vocab_size
vocab_size = 512
embedding_dim = 80
model = TransformerModel(vocab_size, embedding_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss().to(device)



In [10]:
def train(dataloader, model, optimizer, criterion):
    model.train()
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        response = batch['outputs'].to(device)
        #response_attention_mask = torch.tensor(batch['output_attention_mask']).to(device)
        #print('input_ids: ', input_ids.shape)
        #print('attention_mask: ', attention_mask.shape)
        #print('response: ', response.shape)
        output = model(input_ids, attention_mask)
        loss = criterion(output, response).to(device)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [11]:
for i in range(10):
    train(dataloader, model, optimizer, criterion)

In [12]:
def get_attention_mask(input_ids):
    attention_mask = torch.ones_like(input_ids)
    attention_mask[input_ids == 0] = 0
    return attention_mask

def decode_output(output_tensor, tokenizer, top_k=1):

    output_probs = torch.softmax(output_tensor, dim=-1)
    topk_probs, topk_ids = torch.topk(output_probs, top_k, dim=-1)
    topk_probs = topk_probs.tolist()
    topk_ids = topk_ids.tolist()

    decoded_outputs = []

    # Decode each top-k token ID sequence
    for i in range(len(topk_ids)):
        decoded_tokens = []
        for j in range(len(topk_ids[i])):
            # Decode token ID to text
            decoded_token = tokenizer.decode(topk_ids[i][j])
            decoded_tokens.append((decoded_token, topk_probs[i][j]))
        decoded_outputs.append(decoded_tokens)

    return decoded_outputs


while True:
    
    model.eval()
    user_input = input("You: ")
    if user_input == 'q':
        break
    
    input_ids = tokenizer.encode(user_input, return_tensors='pt')
    input_ids = input_ids.to(device)
    attention_mask = get_attention_mask(input_ids)
    attention_mask = attention_mask.to(device)
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    #print(output)
    output_arr = output.cpu().numpy()
    # Decode and print response
    #print(output_arr)
    #print(output_arr[0])
    response = decode_output(output, tokenizer, top_k=5)
    print("Bot:", response)

Bot: [[('? to indX', [0.01424487866461277, 0.012225417420268059, 0.011082618497312069, 0.010660824365913868, 0.009663427248597145]), ('? in to (d', [0.014583855867385864, 0.013112141750752926, 0.01108396053314209, 0.009704717434942722, 0.009258395060896873]), ('?d inX to', [0.016232678666710854, 0.01329257432371378, 0.01320330798625946, 0.010378205217421055, 0.009773727506399155]), ('? ind toX', [0.016989346593618393, 0.01225390937179327, 0.011101714335381985, 0.010956604033708572, 0.009675339795649052]), ('? ind to’', [0.017202647402882576, 0.01213556807488203, 0.01167673896998167, 0.010288580320775509, 0.009299002587795258])]]
Bot: [[('? ind’l', [0.014129247516393661, 0.011437207460403442, 0.009581104852259159, 0.009494545869529247, 0.008833914063870907]), ('? inld for', [0.014958547428250313, 0.01170134823769331, 0.010144821368157864, 0.009437772445380688, 0.00917124468833208]), ('‘K todayele free', [0.007457095663994551, 0.00733140716329217, 0.006259845104068518, 0.0058254171162843

In [13]:
print(output_arr[0][0])

[-0.01280238  0.30230838  0.05524913  0.44777182  1.1704538   0.7730081
  0.5674996   0.7819571   0.02924786  1.1252514   0.55719393 -0.29350805
  1.0554311   0.1838808   0.60242146  0.580642    1.3858367   0.76135635
  0.99606776  0.80064857  0.40890086  1.0350013   1.0540152   0.73256963
  0.8335948   0.39472473  1.4801785   0.6284341   0.7718112   0.41725338
  0.08415256  0.49672136  0.9954662   0.6130145   0.12179126  0.75483775
  0.19233628  0.81176895  0.32335195  0.3762112   1.0719892   1.0783821
  0.15511681  0.06561427  0.6832792   0.42233047 -0.1137569   0.80704194
 -0.19500998 -0.03225546  0.83643574  0.1820846   1.1188061   0.15590277
 -0.10242783  0.09076809 -0.5321743   0.00586918  1.5630016   0.24081416
  0.10386963  0.18305212  0.88215286 -0.1388364   0.3993758  -0.03833205
  0.64150435  0.418624    0.14951214  0.53025967  0.3857232   0.22231461
  0.4586855   0.57853955 -0.06118084  0.29849523 -0.32207653 -0.13321657
 -0.36535296 -0.06709275 -0.09200664 -0.15544777  0.0

In [14]:
a = tokenizer.encode("men")
print(a)
b = tokenizer.decode(1076)
print(b)

[1076, 1]
men
