In [None]:
# TensorFlow / Keras modules
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Regular expressions
import re
import string

# Spacy modules
import spacy

import os
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report

from joblib import dump, load

from tqdm import tqdm

In [None]:
# Mount Google Drive (Colab only) so the /content/drive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Seed reused by the DataFrame sampling, LinearSVC and TrainingArguments calls below.
SEED = 2024

In [None]:
# Number of rows kept in the token vocabulary; the last kept row is later overwritten with '<UNK>'.
vocab_size = 3000

In [None]:
STEAM_REVIEWS_PATH = '/content/drive/MyDrive/Eletivas/PLN/steam_reviews/dataset.csv'

In [None]:
# Google Drive locations where each trained intent classifier is persisted.
SVM_MODEL_PATH    = '/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/modelos/svm.joblib'
CNN_MODEL_PATH    = '/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/modelos/cnn.keras'
LSTM_MODEL_PATH   = '/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/modelos/lstm.keras'
DSBERT_MODEL_PATH = '/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/modelos/transformer/'

In [None]:
# Stopwords dropped during tokenization. 'not' and 'no' are absent from the
# list, so those words are not filtered out as stopwords.
# NOTE: tokenize() strips every non-word character before matching, so the
# entries containing an apostrophe (e.g. "you're") can never match a token.
custom_stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
'with', 'against','into', 'through', 'above', 'below', 'to', 'up', 'down','out', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
'here', 'there', 'all', 'any', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
't', 'just', "should've", 'd', 'll', 'm', 'o', 're', 've',
 'ain', 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'ma', 'mightn',
 "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't"]

In [None]:
!pip install spacy



In [None]:
# spaCy pipeline passed to tokenize(), which reads like_num, is_punct, text and lemma_ from its tokens.
nlp = spacy.load('en_core_web_sm')

### Loading dataset

In [None]:
def remove_leading_spaces_hyphens(text):
    """Return `text` without its leading run of whitespace and hyphen characters."""
    return re.sub(r'^[\s-]+', '', text)

def remove_parentheses(text):
    """Delete every parenthesised group from `text`.

    A single or double quote directly before the opening parenthesis or
    directly after the closing one is removed together with the group.
    """
    return re.sub(r"[\"']?\([^)]*\)[\"']?", '', text)

In [None]:
def tokenize(text, nlp_model, stopwords):
    """Lower-case, clean and lemmatize `text` into a list of tokens.

    Args:
        text: Raw input string.
        nlp_model: Callable that turns a string into an iterable of spaCy-like
            tokens exposing `like_num`, `is_punct`, `text` and `lemma_`.
        stopwords: Iterable of lower-case words to discard (may be None/empty).

    Returns:
        list[str]: Lemmas of the kept tokens, with every number-like token
        replaced by the literal '<NUM>'.
    """
    text = text.lower().strip()
    text = re.sub(r'[^\w\s]', '', text)  # remove special characters

    # Honour the `stopwords` argument (the previous version silently read the
    # global `custom_stopwords`); a set gives O(1) membership tests.
    stopword_set = set(stopwords or ())

    doc = nlp_model(text)

    preprocessed_tokens = []
    for token in doc:
        if token.like_num:
            # Collapse every number-like token into a single placeholder.
            preprocessed_tokens.append('<NUM>')
        elif token.is_punct:
            # Ignore punctuation tokens.
            continue
        elif token.text.lower() in stopword_set:
            # Ignore stopwords.
            continue
        elif len(token.text.strip()) == 0:
            # Ignore whitespace-only tokens.
            continue
        else:
            preprocessed_tokens.append(token.lemma_)

    return preprocessed_tokens

### Concatenate with extractor dataset

In [None]:
# Sample 6% of the extractor training phrases and label them all as ReqShowGame.
extrator_req_game_info = pd.read_csv(r'/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/Extrator/datasets/train.csv')
extrator_req_game_info = (
    extrator_req_game_info
    .sample(int(len(extrator_req_game_info) * 0.06), random_state=SEED)
    .assign(utterance=lambda frame: frame['phrase'], intent='ReqShowGame')
    .reset_index(drop=True)
    [['utterance', 'intent']]
)
extrator_req_game_info

Unnamed: 0,utterance,intent
0,Show me Psychedelic games that are Management.\n,ReqShowGame
1,Show me Baseball games with Pixel Graphics.\n,ReqShowGame
2,Show me Nature games with Cycling and Strategy.\n,ReqShowGame
3,Can you recommend some Replay Value Mining gam...,ReqShowGame
4,Any Web Publishing Open World games with PvE g...,ReqShowGame
...,...,...
12235,"I'm into Hack and Slash games, especially Sub-...",ReqShowGame
12236,I'm a fan of Makivision Games. Can you recomme...,ReqShowGame
12237,What are the top 4X games on steamOS that have...,ReqShowGame
12238,Show me Benchmark Space games with a Horror se...,ReqShowGame


In [None]:
# Sample 2% of the extractor test phrases and label them all as ReqShowGame.
test_extrator_req_game_info = pd.read_csv(r'/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/Extrator/datasets/test.csv')
test_extrator_req_game_info = (
    test_extrator_req_game_info
    .sample(int(len(test_extrator_req_game_info) * 0.02), random_state=SEED)
    .assign(utterance=lambda frame: frame['phrase'], intent='ReqShowGame')
    .reset_index(drop=True)
    [['utterance', 'intent']]
)
test_extrator_req_game_info

Unnamed: 0,utterance,intent
0,"Any Nature games with a Platformer style, suit...",ReqShowGame
1,What's new in linux Bowling games with 1990's ...,ReqShowGame
2,Show me Zombies Documentary games with Conspir...,ReqShowGame
3,What are the newest Choose Your Own Adventure ...,ReqShowGame
4,Show me linux Mod games with Sandbox support r...,ReqShowGame
...,...,...
1015,What Movie games have the best Mars art?\n,ReqShowGame
1016,List games with the categories RTS and Sports.\n,ReqShowGame
1017,Any windows Sports games with Feature Film lau...,ReqShowGame
1018,I need Party Game Cartoony games with Turn-Bas...,ReqShowGame


In [None]:
# Intent-classifier training set, reduced to the two columns used downstream.
classificator_train_data = pd.read_csv(
    r'/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/dataset_v4.csv'
)[['utterance', 'intent']]
classificator_train_data

Unnamed: 0,utterance,intent
0,I want to buy the game Skyrim.,BuyGame
1,I'm interested in buying Grand Theft Auto V.,BuyGame
2,I'm looking to purchase Red Dead Redemption 2.,BuyGame
3,I want to buy a game. The title is The Witcher...,BuyGame
4,I want to buy Assassin's Creed Odyssey.,BuyGame
...,...,...
16944,"Interested in Babycar Driver, do you have it a...",BuyGame
16945,"Interested in Zup! S, do you have it available?",BuyGame
16946,"Interested in Gurgamoth, do you have it availa...",BuyGame
16947,"Interested in BOXVR, do you have it available?",BuyGame


In [None]:
# Intent-classifier test set, reduced to the two columns used downstream.
classificator_test_data = pd.read_csv(
    r'/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/test_dataset_v4.csv'
)[['utterance', 'intent']]
classificator_test_data

Unnamed: 0,utterance,intent
0,"Yeah, let's get this game!",affirm
1,"Of course, I'm in!",affirm
2,"Definitely, can't wait to play it!",affirm
3,"Yep, I'm down!",affirm
4,"Absolutely, I've been wanting this game for a ...",affirm
...,...,...
1138,"Hello, who's the developer of Among Us? Thanks...",ReqGameInfo
1139,"Hey, what's the genre of Among Us? Thanks a lo...",ReqGameInfo
1140,When did Minecraft was released?,ReqGameInfo
1141,Do you know who published Hi-Fi Rush?,ReqGameInfo


In [None]:
# Stack the classifier utterances and the sampled extractor phrases under a fresh 0..n-1 index.
train_data = pd.concat(
    [classificator_train_data, extrator_req_game_info],
    ignore_index=True,
)
train_data

Unnamed: 0,utterance,intent
0,I want to buy the game Skyrim.,BuyGame
1,I'm interested in buying Grand Theft Auto V.,BuyGame
2,I'm looking to purchase Red Dead Redemption 2.,BuyGame
3,I want to buy a game. The title is The Witcher...,BuyGame
4,I want to buy Assassin's Creed Odyssey.,BuyGame
...,...,...
29184,"I'm into Hack and Slash games, especially Sub-...",ReqShowGame
29185,I'm a fan of Makivision Games. Can you recomme...,ReqShowGame
29186,What are the top 4X games on steamOS that have...,ReqShowGame
29187,Show me Benchmark Space games with a Horror se...,ReqShowGame


In [None]:
# Drop rows that repeat an earlier (utterance, intent) pair. The index is not
# reset here; that happens after the train/test overlap filter further down.
train_data = train_data.drop_duplicates()

In [None]:
train_data['intent'].value_counts()

ReqShowGame    12436
ReqGameInfo     8794
BuyGame         7406
affirm           251
deny             241
Name: intent, dtype: int64

In [None]:
# Stack the classifier test utterances and the sampled extractor test phrases under a fresh 0..n-1 index.
test_data = pd.concat(
    [classificator_test_data, test_extrator_req_game_info],
    ignore_index=True,
)
test_data

Unnamed: 0,utterance,intent
0,"Yeah, let's get this game!",affirm
1,"Of course, I'm in!",affirm
2,"Definitely, can't wait to play it!",affirm
3,"Yep, I'm down!",affirm
4,"Absolutely, I've been wanting this game for a ...",affirm
...,...,...
2158,What Movie games have the best Mars art?\n,ReqShowGame
2159,List games with the categories RTS and Sports.\n,ReqShowGame
2160,Any windows Sports games with Feature Film lau...,ReqShowGame
2161,I need Party Game Cartoony games with Turn-Bas...,ReqShowGame


In [None]:
test_data['intent'].value_counts()

ReqShowGame    1020
ReqGameInfo     527
BuyGame         405
affirm          114
deny             97
Name: intent, dtype: int64

In [None]:
train_data[train_data['utterance'].isin(test_data['utterance'])]

Unnamed: 0,utterance,intent
17237,Show me the latest Arena Shooter games for lin...,ReqShowGame
19505,Show me the latest Experience games for window...,ReqShowGame
20090,I'm looking for a Moddable game that lets you ...,ReqShowGame
20258,"Hey, I'm in the mood for some Naval games. Any...",ReqShowGame
20327,I love Rome. Recommend me some games in that t...,ReqShowGame
23293,I'm looking for something in the Mystery Dunge...,ReqShowGame
27125,"Yo, got any cool Audio Production games that c...",ReqShowGame
28287,Show me the latest Moddable games for macOS.\n,ReqShowGame


In [None]:
# Remove training utterances that also occur in the test set, then renumber the rows.
train_data = train_data[~train_data['utterance'].isin(test_data['utterance'])].reset_index(drop=True)
train_data

Unnamed: 0,utterance,intent
0,I want to buy the game Skyrim.,BuyGame
1,I'm interested in buying Grand Theft Auto V.,BuyGame
2,I'm looking to purchase Red Dead Redemption 2.,BuyGame
3,I want to buy a game. The title is The Witcher...,BuyGame
4,I want to buy Assassin's Creed Odyssey.,BuyGame
...,...,...
29115,"I'm into Hack and Slash games, especially Sub-...",ReqShowGame
29116,I'm a fan of Makivision Games. Can you recomme...,ReqShowGame
29117,What are the top 4X games on steamOS that have...,ReqShowGame
29118,Show me Benchmark Space games with a Horror se...,ReqShowGame


In [None]:
train_data[train_data['intent']=='ReqShowGame']['utterance'].to_csv('train.txt')

### Create vocabulary

In [None]:
# Load the Steam reviews corpus used to build the vocabulary; malformed CSV lines are skipped.
steam_reviews_df = pd.read_csv(STEAM_REVIEWS_PATH,on_bad_lines='skip')
steam_reviews_df

In [None]:
# Drop rows with any missing value, then shuffle all rows with the fixed seed.
# NOTE: this overwrites its own input, so re-running the cell shuffles the
# already-shuffled frame again.
steam_reviews_df = steam_reviews_df.dropna().sample(frac = 1, random_state=SEED)

In [None]:
# Only the first 1% of the shuffled reviews is tokenized to build the vocabulary.
used_df_len = int(len(steam_reviews_df) * 0.01)
used_df_len

In [None]:
steam_reviews_df.iloc[:used_df_len,:]

In [None]:
used_data = []
curr_tokens = []
token_counts = {}

# Slice the review column once instead of materialising a whole row per iteration.
review_texts = steam_reviews_df['review_text'].iloc[:used_df_len]

for data in tqdm(review_texts, total=used_df_len):
    curr_tokens = tokenize(data, nlp, custom_stopwords)
    for t in curr_tokens:
        # Accumulate the corpus-wide frequency of every token.
        token_counts[t] = token_counts.get(t, 0) + 1

In [None]:
# Turn the frequency dict into a frame sorted from most to least frequent.
vocab = pd.DataFrame(token_counts.items(), columns=["token",'counts'])
vocab = vocab.sort_values(by=['counts'], ascending=False)
# reset_index() keeps the previous index as an extra 'index' column, so the
# frame has three columns (index, token, counts) from here on.
vocab = vocab.reset_index()
vocab

In [None]:
# Keep only the `vocab_size` most frequent tokens.
vocab = vocab[:vocab_size]
vocab

In [None]:
# Reserve the last vocabulary row for the out-of-vocabulary marker.
# The three values fill the (index, token, counts) columns of the frame.
# Work on an explicit copy: `vocab` was produced by slicing, and writing into
# a slice triggers pandas' SettingWithCopyWarning.
vocab = vocab.copy()
vocab.iloc[-1] = [-1, '<UNK>', 0]
vocab

In [None]:
vocab.to_csv('/content/drive/MyDrive/Eletivas/PLN/vocab_steam.csv', columns=['token', 'counts'], index=False)

### Using game vocabulary

In [None]:
# Reload the persisted vocabulary (columns: token, counts).
# NOTE(review): read_csv's default NA parsing would turn a token spelled like
# 'null' or 'nan' into NaN — confirm no such token is in the file, or pass
# keep_default_na=False.
vocab = pd.read_csv('/content/drive/MyDrive/Eletivas/PLN/vocab_steam.csv')
vocab

Unnamed: 0,token,counts
0,game,90782
1,<NUM>,64136
2,not,47068
3,in,37833
4,play,26441
...,...,...
2995,naturally,53
2996,detract,53
2997,mb,53
2998,emperor,53


In [None]:
def remove_rare_tokens(tokens: list, vocab_tokens):
    """Replace every token that is not in the vocabulary with '<UNK>'.

    Args:
        tokens: Tokens to filter.
        vocab_tokens: Iterable of known tokens (list, array, Series, set, ...).
            Membership is always tested against its values.

    Returns:
        list: `tokens` with out-of-vocabulary entries replaced by '<UNK>'.
    """
    # Build a hash set once so each lookup is O(1) instead of a full scan of
    # the vocabulary; reuse the argument when it already is a hash container.
    if isinstance(vocab_tokens, (set, frozenset, dict)):
        known = vocab_tokens
    else:
        known = set(vocab_tokens)
    return [t if t in known else '<UNK>' for t in tokens]

In [None]:
def build_token2index(vocab):
    """Map each entry of vocab['token'] to its 0-based row position.

    Positions come from the row order, not from the DataFrame index. If a
    token occurs more than once, its last position wins.
    """
    return {token: position for position, token in enumerate(vocab['token'].values)}

In [None]:
def build_index2token(vocab):
    """Map each 0-based row position of `vocab` to the token stored in that row.

    Positions come from the row order, not from the DataFrame index.
    """
    return dict(enumerate(vocab['token'].values))

In [None]:
def build_bow_vector(sequence, idx2token):
    """Count how often each vocabulary index occurs in `sequence`.

    Args:
        sequence: Iterable of integer token indexes.
        idx2token: Mapping whose keys are the valid indexes; its size sets the
            length of the returned vector.

    Returns:
        list[int]: Occurrence count per index.

    Raises:
        ValueError: If `sequence` contains an index that is not a key of `idx2token`.
    """
    counts = [0] * len(idx2token)
    for idx in sequence:
        if idx in idx2token:
            counts[idx] += 1
            continue
        raise ValueError('Wrong sequence index found!')
    return counts

In [None]:
index2token = build_index2token(vocab)
token2index = build_token2index(vocab)

In [None]:
def generate_bow_vectors(utterances):
    """Convert raw utterances into bag-of-words count vectors.

    Relies on the notebook-level `nlp`, `custom_stopwords`, `vocab`,
    `token2index` and `index2token` objects.

    Args:
        utterances: Sized iterable of strings (list, array or pandas Series).
            It is iterated by value, so a Series does not need a 0..n-1 index.

    Returns:
        list[list[int]]: One count vector per utterance, in input order.
    """
    # The vocabulary does not change between utterances: build the lookup once.
    vocab_tokens = set(vocab['token'].values)

    bow_vectors = []
    for utterance in tqdm(utterances, total=len(utterances)):
        # Remove one trailing line break; endswith() is safe on empty strings,
        # whereas indexing [-1] would raise IndexError.
        if utterance.endswith('\n'):
            utterance = utterance[:-1]

        tokens = tokenize(utterance, nlp, custom_stopwords)
        # Out-of-vocabulary tokens become '<UNK>', which has its own index.
        tokens = remove_rare_tokens(tokens, vocab_tokens)
        token_indexes = [token2index[token] for token in tokens]

        bow_vectors.append(build_bow_vector(token_indexes, index2token))

    return bow_vectors


In [None]:
train_bows = generate_bow_vectors(train_data['utterance'])

100%|██████████| 29120/29120 [07:37<00:00, 63.59it/s] 


In [None]:
test_bows = generate_bow_vectors(test_data['utterance'])

100%|██████████| 2163/2163 [00:32<00:00, 65.79it/s]


In [None]:
# One column per vocabulary token, one row per training utterance.
train_bows_df = pd.DataFrame(train_bows, columns=vocab['token'])
# Column assignment aligns on the index: this relies on train_data having the
# same 0..n-1 index as the freshly built frame.
train_bows_df['intent'] = train_data['intent']
train_bows_df

token,game,<NUM>,not,in,play,on,can,get,good,like,...,wet,cheese,establish,skilled,naturally,detract,mb,emperor,<UNK>,intent
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,BuyGame
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,BuyGame
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,BuyGame
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,BuyGame
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,BuyGame
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29115,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,ReqShowGame
29116,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,ReqShowGame
29117,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,ReqShowGame
29118,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,ReqShowGame


In [None]:
# One column per vocabulary token, one row per test utterance.
test_bows_df = pd.DataFrame(test_bows, columns=vocab['token'])
# Column assignment aligns on the index: this relies on test_data having the
# same 0..n-1 index as the freshly built frame.
test_bows_df['intent'] = test_data['intent']
test_bows_df

token,game,<NUM>,not,in,play,on,can,get,good,like,...,wet,cheese,establish,skilled,naturally,detract,mb,emperor,<UNK>,intent
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,affirm
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,affirm
2,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,affirm
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,affirm
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,affirm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2158,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,ReqShowGame
2159,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ReqShowGame
2160,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ReqShowGame
2161,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ReqShowGame


In [None]:
# Export train bows df
train_bows_df.to_csv(r'/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/train_bow_dataset_v3.csv')

In [None]:
# Export test bows df
test_bows_df.to_csv(r'/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/test_bow_dataset_v3.csv')

In [None]:
train_bows_df = pd.read_csv(r'/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/train_bow_dataset_v3.csv')
test_bows_df = pd.read_csv(r'/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/test_bow_dataset_v3.csv')

In [None]:
# The CSV export wrote the index as an unnamed first column; discard it after reloading.
train_bows_df = train_bows_df.drop(columns='Unnamed: 0')
test_bows_df = test_bows_df.drop(columns='Unnamed: 0')

### Training SVM

In [None]:
X_train = np.array(train_bows_df.drop(columns=['intent']))
y_train = train_bows_df['intent']

X_train[1], y_train[1]

(array([0, 0, 0, ..., 0, 0, 0]), 'BuyGame')

In [None]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_train

array([0, 0, 0, ..., 2, 2, 2])

In [None]:
train_bows_df['intent'].value_counts()

ReqShowGame    12428
ReqGameInfo     8794
BuyGame         7406
affirm           251
deny             241
Name: intent, dtype: int64

In [None]:
x_test = np.array(test_bows_df.drop(columns='intent'))
y_test = test_bows_df['intent']
y_test = label_encoder.transform(y_test)

x_test[1],y_test[1]

(array([0, 0, 0, ..., 0, 0, 0]), 3)

In [None]:
model = svm.LinearSVC(random_state=SEED)
model

In [None]:
model.fit(X_train, y_train)

In [None]:
y_preds = model.predict(x_test)
y_preds

array([3, 3, 4, ..., 2, 2, 2])

In [None]:
# Report per-intent metrics using the intent names instead of the encoded ids.
# `labels` is passed explicitly so the names stay aligned even if a class is
# missing from the predictions.
print(classification_report(
    y_test, y_preds,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       405
           1       1.00      0.98      0.99       527
           2       0.99      1.00      0.99      1020
           3       0.97      0.93      0.95       114
           4       0.99      0.91      0.95        97

    accuracy                           0.98      2163
   macro avg       0.98      0.96      0.97      2163
weighted avg       0.98      0.98      0.98      2163



In [None]:
dump(model, SVM_MODEL_PATH)

['/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/modelos/svm.joblib']

In [None]:
model = load(SVM_MODEL_PATH)

In [None]:
# Hand-written utterances used as a quick smoke test of the reloaded SVM.
sample_utterances = [
    'I want to buy Castlevania: Symphony of the night',
    'I want to play a new RPG game, any recomendation?',
    'I don\'t want this game',
    'I loved it!',
    'What is the genre of Hollow Knigth?',
]
sample_bow = np.array(generate_bow_vectors(sample_utterances))
sample_bow.shape

100%|██████████| 5/5 [00:00<00:00, 43.58it/s]


(5, 3000)

In [None]:
label_encoder.inverse_transform(model.predict(sample_bow))

array(['BuyGame', 'ReqShowGame', 'deny', 'affirm', 'ReqGameInfo'],
      dtype=object)

### CNN

In [None]:
vocab = pd.read_csv('/content/drive/MyDrive/Eletivas/PLN/vocab_steam.csv')

In [None]:
# Shift the DataFrame index to start at 1.
# NOTE: this cell is not idempotent (each run adds 1 again). It also does not
# change what build_token2index/build_index2token return, because they number
# tokens by row position rather than by index.
vocab.index = vocab.index + 1
vocab

Unnamed: 0,token,counts
1,game,90782
2,<NUM>,64136
3,not,47068
4,in,37833
5,play,26441
...,...,...
2996,naturally,53
2997,detract,53
2998,mb,53
2999,emperor,53


In [None]:
vocab.iloc[-2]

token     emperor
counts         53
Name: 2999, dtype: object

In [None]:
# NOTE(review): the name is misspelled ('BACTH') and the value is not passed
# to model.fit in the cells shown here — confirm whether it is still needed.
BACTH_SIZE = 1024

In [None]:
def pad_tokenized(tokenized_text, word2idx, max_len_sentence):
    """Convert tokens to ids, right-padding with '<PAD>' up to `max_len_sentence`.

    Tokens missing from `word2idx` are mapped to the '<PAD>' id. Inputs longer
    than `max_len_sentence` are returned at full length (no truncation).

    Returns:
        np.ndarray: The id sequence.
    """
    n_missing = max_len_sentence - len(tokenized_text)
    ids = []
    # A negative repeat count yields an empty list, so long inputs get no padding.
    for token in tokenized_text + n_missing * ['<PAD>']:
        ids.append(word2idx.get(token, word2idx['<PAD>']))
    return np.array(ids)

In [None]:
index2token = build_index2token(vocab)
token2index = build_token2index(vocab)

In [None]:
def custom_standardization(input_data):
  """Standardize raw text for the TextVectorization layer.

  Steps: lower-case the input, replace every character that is neither a word
  character nor whitespace with a space, strip the remaining ASCII punctuation
  (only '_' survives the previous step), and finally replace standalone digit
  runs with '<NUM>'.

  The number replacement must run last: '<' and '>' belong to
  string.punctuation, so stripping punctuation afterwards would turn '<NUM>'
  into 'NUM', which does not match the '<NUM>' vocabulary entry.

  Args:
    input_data: String tensor (or Python string) to standardize.

  Returns:
    The standardized string tensor.
  """
  lowercase = tf.strings.lower(input_data)
  remove_special = tf.strings.regex_replace(lowercase, r'[^\w\s]', ' ')
  remove_punct = tf.strings.regex_replace(remove_special,
                                          '[%s]' % re.escape(string.punctuation), '')
  return tf.strings.regex_replace(remove_punct, r'\b\d+\b', '<NUM>')

In [None]:
tmp = train_data['utterance'][0]
print(train_data['utterance'][0])
custom_standardization(tmp)

I want to buy the game Skyrim.


<tf.Tensor: shape=(), dtype=string, numpy=b'i want to buy the game skyrim '>

In [None]:
# Fixed output length passed to the layer as output_sequence_length.
sequence_length = 100

# Maps raw strings to integer ids using the pre-built Steam vocabulary.
# NOTE(review): in 'int' mode Keras reserves extra ids for padding and
# out-of-vocabulary tokens on top of the supplied list — confirm that the
# Embedding input_dim below covers the layer's full id range.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    vocabulary=vocab['token'].values.tolist(),
    output_mode='int',
    output_sequence_length=sequence_length)

In [None]:
X_train = np.array(train_data['utterance'])
X_train

array(['I want to buy the game Skyrim.',
       "I'm interested in buying Grand Theft Auto V.",
       "I'm looking to purchase Red Dead Redemption 2.", ...,
       'What are the top 4X games on steamOS that have a Arena Shooter?\n',
       'Show me Benchmark Space games with a Horror setting.\n',
       'Show me Minimalist games that are set in the LEGO.\n'],
      dtype=object)

In [None]:
X_test = np.array(test_data['utterance'])
X_test

array(["Yeah, let's get this game!", "Of course, I'm in!",
       "Definitely, can't wait to play it!", ...,
       'Any windows Sports games with Feature Film launched in 1997?\n',
       'I need Party Game Cartoony games with Turn-Based Combat.\n',
       'What are the latest Warhammer 40K games with Strategy for linux?\n'],
      dtype=object)

In [None]:
y_train = train_data['intent']
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
one_hot_encoder = OneHotEncoder(sparse_output=False)
y_train = one_hot_encoder.fit_transform(y_train.reshape(-1, 1))
y_train

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [None]:
# Encode the test labels with the encoders fitted on the training labels.
# Re-fitting on the test set only produces the same mapping when both splits
# contain exactly the same classes; transform() fails loudly on an unseen label.
y_test = label_encoder.transform(test_data['intent'])
y_test = one_hot_encoder.transform(y_test.reshape(-1, 1))
y_test

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [None]:
X_test.shape, y_test.shape

((2163,), (2163, 5))

In [None]:
len(vocab)

3000

In [None]:
embedding_dim=16
input_shape = (None,sequence_length)
num_classes = 5

# The vectorizer adds its own padding and OOV entries to the supplied tokens,
# so the embedding table is sized from the layer's actual vocabulary.
# len(vocab)+1 is one row short of the largest id the layer can emit.
embedding_input_dim = len(vectorize_layer.get_vocabulary())

# Two Conv1D + max-pooling stages over the token embeddings, then a dense
# classifier with a softmax over the intent classes.
model = tf.keras.Sequential([
  vectorize_layer,
  tf.keras.layers.Embedding(embedding_input_dim, embedding_dim, input_length=sequence_length),
  tf.keras.layers.Conv1D(64, 3, activation='relu'),
  tf.keras.layers.MaxPooling1D(2),
  tf.keras.layers.Conv1D(128, 3, activation='relu'),
  tf.keras.layers.MaxPooling1D(2),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [None]:
# Write training curves to ./logs for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

# Labels are one-hot encoded, hence CategoricalCrossentropy.
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are computed on the same data as the final report.
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7fc8e722dc90>

In [None]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_pred



array([3, 3, 3, ..., 2, 2, 2])

In [None]:
# Collapse the one-hot test labels back to class ids and report per-intent
# metrics using the intent names instead of the encoded ids.
y_test_cr = np.argmax(y_test, axis=1)
classification_report_result = classification_report(
    y_test_cr, y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_)

print(classification_report_result)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       405
           1       0.95      0.93      0.94       527
           2       1.00      1.00      1.00      1020
           3       0.97      0.96      0.97       114
           4       0.86      0.61      0.71        97

    accuracy                           0.96      2163
   macro avg       0.94      0.90      0.91      2163
weighted avg       0.96      0.96      0.96      2163



In [None]:
model.save(CNN_MODEL_PATH)

### LSTM

In [None]:
vocab = pd.read_csv('/content/drive/MyDrive/Eletivas/PLN/vocab_steam.csv')
vocab.index = vocab.index + 1
vocab

Unnamed: 0,token,counts
1,game,90782
2,<NUM>,64136
3,not,47068
4,in,37833
5,play,26441
...,...,...
2996,naturally,53
2997,detract,53
2998,mb,53
2999,emperor,53


In [None]:
# NOTE(review): duplicate of the earlier definition; the name is misspelled
# ('BACTH') and the value is not passed to model.fit in the cells shown here.
BACTH_SIZE = 1024

In [None]:
# NOTE: sequence_length is defined but not passed to this layer (there is no
# output_sequence_length argument), so it does not fix the output length here.
sequence_length = 100

# Maps raw strings to integer ids using the pre-built Steam vocabulary;
# max_tokens leaves room for two ids beyond the supplied tokens.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    vocabulary=vocab['token'].values.tolist(),
    output_mode='int',
    max_tokens=len(vocab['token'].values)+2)

In [None]:
# X_train = tokenize_embbedings(train_data['utterance'],50, token2index)
X_train = np.array(train_data['utterance'])
X_train

array(['I want to buy the game Skyrim.',
       "I'm interested in buying Grand Theft Auto V.",
       "I'm looking to purchase Red Dead Redemption 2.", ...,
       'What are the top 4X games on steamOS that have a Arena Shooter?\n',
       'Show me Benchmark Space games with a Horror setting.\n',
       'Show me Minimalist games that are set in the LEGO.\n'],
      dtype=object)

In [None]:
# X_test = tokenize_embbedings(test_data['utterance'],50, token2index)
X_test = np.array(test_data['utterance'])
X_test

array(["Yeah, let's get this game!", "Of course, I'm in!",
       "Definitely, can't wait to play it!", ...,
       'Any windows Sports games with Feature Film launched in 1997?\n',
       'I need Party Game Cartoony games with Turn-Based Combat.\n',
       'What are the latest Warhammer 40K games with Strategy for linux?\n'],
      dtype=object)

In [None]:
y_train = train_data['intent']
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
one_hot_encoder = OneHotEncoder(sparse_output=False)
y_train = one_hot_encoder.fit_transform(y_train.reshape(-1, 1))
y_train

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [None]:
# Encode the test labels with the encoders fitted on the training labels.
# Re-fitting on the test set only produces the same mapping when both splits
# contain exactly the same classes; transform() fails loudly on an unseen label.
y_test = label_encoder.transform(test_data['intent'])
y_test = one_hot_encoder.transform(y_test.reshape(-1, 1))
y_test

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [None]:
X_test.shape, y_test.shape

((2163,), (2163, 5))

In [None]:
embedding_dim=16
input_shape = (None,sequence_length)
num_classes = 5

# The vectorizer adds its own padding and OOV entries to the supplied tokens
# (it is built with max_tokens=len(vocab)+2), so the embedding table is sized
# from the layer's actual vocabulary. len(vocab) would leave the highest ids
# without an embedding row.
embedding_input_dim = len(vectorize_layer.get_vocabulary())

# Bidirectional LSTM over masked token embeddings (id 0 is treated as padding),
# followed by a dense classifier with a softmax over the intent classes.
model = tf.keras.Sequential([
  vectorize_layer,
  tf.keras.layers.Embedding(embedding_input_dim, embedding_dim, mask_zero=True),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [None]:
# Write training curves to ./logs for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

# Labels are one-hot encoded, hence CategoricalCrossentropy.
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are computed on the same data as the final report.
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7fc8e4dd1f00>

In [None]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_pred



array([3, 0, 3, ..., 2, 2, 2])

In [None]:
# Collapse the one-hot test labels back to class ids and report per-intent
# metrics using the intent names instead of the encoded ids.
y_test_cr = np.argmax(y_test, axis=1)
classification_report_result = classification_report(
    y_test_cr, y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_)

print(classification_report_result)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95       405
           1       0.98      0.95      0.97       527
           2       0.98      1.00      0.99      1020
           3       0.93      0.89      0.91       114
           4       1.00      0.75      0.86        97

    accuracy                           0.97      2163
   macro avg       0.96      0.91      0.93      2163
weighted avg       0.97      0.97      0.97      2163



In [None]:
model.save(LSTM_MODEL_PATH)

### Transformer

In [None]:
# Install the HuggingFace stack used by the Transformer section.
# NOTE(review): versions are unpinned and two packages are upgraded with -U,
# so re-running later may pull releases with different APIs — consider pinning.
!pip install datasets py7zr textstat
! pip install -U accelerate
! pip install -U transformers

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting py7zr
  Downloading py7zr-0.21.0-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset, load_metric, get_dataset_split_names, Dataset, DatasetDict, concatenate_datasets
import torch
import os

In [None]:
# Select the first GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

#### Carregando o DistilBERT pré-treinado

In [None]:
# Pre-trained DistilBERT with a classification head sized to the number of distinct intents.
model_name = "distilbert-base-uncased"
n_labels = train_data['intent'].nunique()
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

#### Pré-processando os dados

In [None]:
import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Training frame with the column names used downstream: 'text' (utterance)
# and 'label' (intent).
train_data_dbert = pd.DataFrame({
    'text': train_data['utterance'],
    'label': train_data['intent'],
})

In [None]:
# Test frame with the column names used downstream: 'text' (utterance)
# and 'label' (intent).
test_data_dbert = pd.DataFrame({
    'text': test_data['utterance'],
    'label': test_data['intent'],
})

In [None]:
# Build the training dataset and encode its 'label' column as class ids
hf_dataset_tr = Dataset.from_pandas(train_data_dbert)
hf_dataset_tr = hf_dataset_tr.class_encode_column("label")

# Build the test dataset and encode its 'label' column as class ids
hf_dataset_ts = Dataset.from_pandas(test_data_dbert)
hf_dataset_ts = hf_dataset_ts.class_encode_column("label")

# Each split is encoded on its own, so the integer ids only mean the same
# thing in both splits when the two label vocabularies are identical.
# Fail loudly here instead of silently evaluating against shifted ids.
train_label_names = hf_dataset_tr.features["label"].names
test_label_names = hf_dataset_ts.features["label"].names
if train_label_names != test_label_names:
    raise ValueError(
        "Train and test label encodings differ: "
        f"{train_label_names} vs {test_label_names}"
    )

Casting to class labels:   0%|          | 0/29120 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2163 [00:00<?, ? examples/s]

In [None]:
# Bundle both splits under the keys "train" and "test"
full_dataset = DatasetDict(train=hf_dataset_tr, test=hf_dataset_ts)
full_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 29120
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2163
    })
})

In [None]:
def tokenization(example,feat_name):
  """Tokenize one feature of an example (or batch) with the global tokenizer.

  Args:
    example: mapping that contains the feature to tokenize.
    feat_name: key of the feature inside ``example``.

  Returns:
    Whatever the tokenizer returns for ``example[feat_name]`` with
    truncation enabled.
  """
  return tokenizer(example[feat_name], truncation=True)

In [None]:
def _tokenize_text_column(batch):
    """Tokenize the 'text' feature of a batch via ``tokenization``."""
    return tokenization(batch, 'text')

# Apply the tokenizer to every split, batch by batch
tokenized_full_dataset = full_dataset.map(_tokenize_text_column, batched=True)

Map:   0%|          | 0/29120 [00:00<?, ? examples/s]

Map:   0%|          | 0/2163 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized splits to check which features each one now holds
tokenized_full_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 29120
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2163
    })
})

#### Preparando o trainer

In [None]:
# One training epoch, with an evaluation pass at the end of each epoch.
# Outputs go to the "distilbert" directory; the notebook-wide SEED is reused.
training_args = TrainingArguments(
    output_dir="distilbert",
    seed=SEED,
    evaluation_strategy="epoch",
    num_train_epochs=1,
)

def compute_metrics(eval_preds):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    The metrics are computed directly with NumPy, so nothing is downloaded or
    re-loaded each time the Trainer evaluates.

    Args:
        eval_preds: pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` over the last axis.

    Returns:
        dict with two floats: ``"accuracy"`` (fraction of rows predicted
        correctly) and ``"f1"`` (unweighted mean of the per-class F1 scores
        over every class that occurs in the labels or in the predictions).
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    accuracy = float(np.mean(predictions == labels))

    per_class_f1 = []
    for cls in np.union1d(labels, predictions):
        predicted_cls = predictions == cls
        actual_cls = labels == cls
        true_positives = np.sum(predicted_cls & actual_cls)
        # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (#predicted + #actual).
        # The denominator is never zero: cls occurs in at least one array.
        denominator = np.sum(predicted_cls) + np.sum(actual_cls)
        per_class_f1.append(2.0 * true_positives / denominator)
    f1 = float(np.mean(per_class_f1))

    return {"accuracy": accuracy, "f1": f1}

# Move the model to the selected device before handing it to the Trainer
model = model.to(device)

# Train on the tokenized train split; evaluate on the tokenized test split
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_full_dataset['train'],
    eval_dataset=tokenized_full_dataset['test'],
)

#### Treinando o modelo

In [None]:
# Run the fine-tuning loop configured on `trainer`
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0007,0.016424,0.996301,0.987705


  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

TrainOutput(global_step=3640, training_loss=0.01822173725608941, metrics={'train_runtime': 231.9137, 'train_samples_per_second': 125.564, 'train_steps_per_second': 15.695, 'total_flos': 168652439855520.0, 'train_loss': 0.01822173725608941, 'epoch': 1.0})

In [None]:
# Save the model to Google Drive
model.save_pretrained(
    save_directory='/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/distilbert_pretrained'
)

#### Testando o modelo no dataset de teste

In [None]:
# Run the trainer's model over the tokenized test split
preds = trainer.predict(tokenized_full_dataset["test"])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
# Report the metrics gathered during prediction on the test split
print(f"Test loss: {preds.metrics['test_loss']}")
print(f"Test accuracy: {preds.metrics['test_accuracy']}")
print(f"Test f1: {preds.metrics['test_f1']}")

Test loss: 0.01642412506043911
Test accuracy: 0.996301433194637
Test f1: 0.98770545030426


In [None]:
# Rebuild the classifier from the directory written by save_pretrained.
# The tokenizer is loaded from the base checkpoint name, not from that directory.
model_name = "distilbert-base-uncased"
n_labels = train_data['intent'].nunique()
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/Eletivas/PLN/Steam Chatbot/collabs/distilbert_pretrained',
    num_labels=n_labels,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# `trainer` keeps a reference to the model object it was constructed with, so
# rebinding the name `model` (e.g. by loading a saved checkpoint) does not
# change what `trainer.predict` evaluates. Build a trainer around the model
# currently bound to `model` so these predictions really come from it.
eval_trainer = Trainer(model=model, tokenizer=tokenizer,
                       args=training_args,
                       compute_metrics=compute_metrics)
preds = eval_trainer.predict(tokenized_full_dataset["test"])

  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [None]:
# Raw per-class scores (first field of the prediction output), one row per test example
preds.predictions

array([[-2.7448337, -2.45288  , -2.6494396,  5.9816003, -1.6259763],
       [-2.6547499, -2.490821 , -2.633326 ,  5.9385705, -1.6327167],
       [-2.4952765, -2.676762 , -2.8580205,  5.9412837, -1.3994263],
       ...,
       [-3.4437072, -3.407265 ,  9.031994 , -3.6027853, -3.0325456],
       [-3.4458663, -3.4086895,  9.015412 , -3.6457841, -2.961211 ],
       [-3.4387121, -3.377926 ,  9.017804 , -3.616201 , -3.0404682]],
      dtype=float32)

In [None]:
# Predicted class id = column index of the highest score in each row
y_pred = preds[0].argmax(axis=1)
y_pred

array([3, 3, 3, ..., 2, 2, 2])

In [None]:
# Reference class ids taken from the 'label' feature of the tokenized test split
y_true = np.array(tokenized_full_dataset["test"]['label'])
y_true

array([3, 3, 3, ..., 2, 2, 2])

In [None]:
# Per-class precision / recall / F1 of the predictions against the references
report = classification_report(y_true=y_true, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       405
           1       1.00      1.00      1.00       527
           2       1.00      1.00      1.00      1020
           3       0.97      0.97      0.97       114
           4       0.98      0.96      0.97        97

    accuracy                           1.00      2163
   macro avg       0.99      0.99      0.99      2163
weighted avg       1.00      1.00      1.00      2163

