<a href="https://colab.research.google.com/github/ElektrosStulpas/DeepLearningVU22/blob/main/GMMThird.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies into the Colab runtime:
# scraping (beautifulsoup4, requests), models/data (transformers, datasets,
# sentence-transformers) and FAISS for vector search.
# NOTE(review): both faiss-cpu and faiss-gpu are installed here — confirm that
# having the two builds side by side is intended.
!pip install beautifulsoup4
!pip install requests
!pip install transformers
!pip install datasets
!pip install faiss-cpu
!pip install faiss-gpu
!pip install -U sentence-transformers

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# Load the review table from Drive.
# NOTE(review): this is the same path the data-collection cell further down
# writes to, so on a fresh runtime that cell must have produced the file first.
df = pd.read_csv("/content/drive/MyDrive/GMM3/data.csv")
# Preview the first rows.
df.head()

Unnamed: 0,language,review,votes_up,text
0,english,"If you are reading this, STOP READING REVIEWS....",1816,"english\nIf you are reading this, STOP READING..."
1,english,I'm saddened that I will probably never experi...,387,english\nI'm saddened that I will probably nev...
2,english,For $24.99 you can play a piece of art that wi...,440,english\nFor $24.99 you can play a piece of ar...
3,english,The worst thing about this game is that I can ...,1131,english\nThe worst thing about this game is th...
4,english,the only bad thing about this game is you only...,380,english\nthe only bad thing about this game is...


In [None]:
# Number of rows loaded from the CSV.
len(df)

11000

In [None]:
from datasets import Dataset

# Wrap the DataFrame in a `datasets.Dataset`; the cells below use its
# map() and FAISS-index methods.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['language', 'review', 'votes_up', 'text'],
    num_rows: 11000
})

In [None]:
from transformers import ElectraTokenizer, ElectraModel, GPT2Tokenizer, GPT2Model
import torch

def get_Electra():
  """Load the pretrained Electra small discriminator.

  Returns:
    A ``(tokenizer, model)`` tuple, both loaded from the
    ``google/electra-small-discriminator`` checkpoint.
  """
  checkpoint = "google/electra-small-discriminator"
  return (
      ElectraTokenizer.from_pretrained(checkpoint),
      ElectraModel.from_pretrained(checkpoint),
  )

def get_GPT():
  """Load the pretrained GPT-2 tokenizer and base model.

  Returns:
    A ``(tokenizer, model)`` tuple, both loaded from the ``gpt2`` checkpoint.
  """
  checkpoint = 'gpt2'
  return (
      GPT2Tokenizer.from_pretrained(checkpoint),
      GPT2Model.from_pretrained(checkpoint),
  )

# Encoder used by the rest of the notebook.
tokenizer, model = get_Electra()

# Use the GPU when one is attached, otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
#SANITY CHECK
# Only borrow EOS as the padding token when the tokenizer has none of its own
# (the GPT-2 case). Assigning it unconditionally replaces an existing pad token
# with an unset EOS token when the Electra tokenizer is active.
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token

# Tokenize a single review (no padding needed for one sequence) and move the
# resulting tensors to the same device as the model.
encoded_input = tokenizer(ds["text"][0], truncation=True, return_tensors="pt")
encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
encoded_input

Using eos_token, but it is not set yet.


{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1]], device='cuda:0'),
 'input_ids': tensor([[  101,  2394,  2065,  2017,  2024,  3752,  2023,  1010,  2644,  3752,
           4391,  1012,  2123,  1005,  1056,  3422,  1996,  2292,  1005,  1055,
           3248,  1010,  2123,  1005,  1056,  3191,  1996,  2208,  4790,  1010,
           2123,  1005,  1056,  2079,  2505,  1012,  1045,  2253,  2046,  2023,
           2208,  4209,  6719,  2498,  2055,  2009,  1012,  1045,  2001,  2074,
           2387,  2009,  2006,  1996,  6749,  3931,  1999,  1996,  3573,  1010,
           1998,  2001,  2066,  10

In [None]:
#SANITY_CHECK
# Forward pass on the single tokenized review prepared in the previous cell.
outputs = model(**encoded_input)
# outputs[1]
# Average the last hidden state over dimension 1 (presumably the token axis)
# to obtain one vector per input, then inspect its shape.
last_hidden_states = torch.mean(outputs.last_hidden_state, 1)
last_hidden_states.shape

torch.Size([1, 256])

In [None]:
#different pooling approaches
def cls_pooling(model_output):
    """Pool by taking the vector at position 0 of dimension 1.

    Args:
        model_output: object exposing a ``last_hidden_state`` tensor.

    Returns:
        ``last_hidden_state`` with dimension 1 reduced to its first entry
        (for BERT-style tokenizers this is presumably the [CLS] position).
    """
    hidden = model_output.last_hidden_state
    return hidden.select(1, 0)

def mean_pooling(model_output):
  """Pool by averaging ``last_hidden_state`` over dimension 1.

  Args:
    model_output: object exposing a ``last_hidden_state`` tensor.

  Returns:
    The mean over dimension 1; every position contributes equally (no
    attention mask is applied).
  """
  hidden = model_output.last_hidden_state
  return hidden.mean(dim=1)

In [None]:
def get_embeddings(text_list, pooling_func):
    """Embed text with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text_list: a string or list of strings to encode. Padding is not
            enabled, so this is intended for a single text at a time
            (presumably inputs of differing token length cannot be batched
            into one tensor — verify before passing longer lists).
        pooling_func: callable that reduces the model output to one vector
            per input, e.g. ``mean_pooling`` or ``cls_pooling``.

    Returns:
        The tensor produced by ``pooling_func``, on ``device`` and detached
        from autograd.
    """
    encoded_input = tokenizer(text_list, truncation=True, return_tensors="pt")
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: skip autograd bookkeeping so no graph (and its memory)
    # is kept alive for each of the thousands of reviews being embedded.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return pooling_func(model_output)

In [None]:
#SANITY CHECK
# Embed the first review with mean pooling and inspect the resulting shape.
embedding = get_embeddings(ds["text"][0], mean_pooling)
embedding.shape

torch.Size([1, 256])

In [None]:
# Add an "embeddings" column holding the mean-pooled vector of each row's text.
# get_embeddings returns a (1, dim) tensor for a single text, so [0] drops the
# leading dimension after conversion to NumPy.
embeddings_dataset = ds.map(
    lambda x: {"embeddings": get_embeddings(x["text"], mean_pooling).detach().cpu().numpy()[0]}
)

  0%|          | 0/11000 [00:00<?, ?ex/s]

In [None]:
# Attach a FAISS index to the "embeddings" column; search() queries it via
# get_nearest_examples.
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/11 [00:00<?, ?it/s]

Dataset({
    features: ['language', 'review', 'votes_up', 'text', 'embeddings'],
    num_rows: 11000
})

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

def alt_search(query, num_results, embeddings_dataset, most_similar=True):
  """Rank dataset rows by cosine similarity to ``query``.

  Args:
    query: text to embed (mean pooling) and compare against the dataset.
    num_results: how many rows to return; capped at the number of rows.
    embeddings_dataset: dataset with an "embeddings" column.
    most_similar: if True return the highest-scoring rows (best first),
      otherwise the lowest-scoring rows (lowest first).

  Returns:
    A DataFrame of the selected rows with an added "scores" column holding
    the cosine similarity of each row to the query.
  """
  query_embedding = get_embeddings([query], mean_pooling).cpu().detach().numpy()
  cos_scores = util.cos_sim(query_embedding, embeddings_dataset["embeddings"])[0]

  # torch.topk raises when k exceeds the number of candidates, so cap it.
  k = min(num_results, cos_scores.shape[0])
  # `largest` selects either end of the ranking, replacing the previous
  # negate / top-k / negate round trip for the least-similar case.
  top_scores, top_indices = torch.topk(cos_scores, k=k, largest=most_similar)

  # Index the dataset with plain Python ints rather than a tensor.
  samples_df = pd.DataFrame.from_dict(embeddings_dataset[top_indices.tolist()])
  samples_df["scores"] = top_scores.numpy()
  return samples_df

def search(query, num_best_results, embeddings_dataset):
  """Look up the nearest rows to ``query`` through the dataset's FAISS index.

  Args:
    query: text to embed with mean pooling.
    num_best_results: number of neighbours to request from the index.
    embeddings_dataset: dataset with an index registered under "embeddings".

  Returns:
    A DataFrame of the retrieved rows plus a "scores" column with the values
    reported by the index, sorted in ascending score order (presumably a
    distance, i.e. smaller means closer — confirm against the index metric).
  """
  query_vector = get_embeddings([query], mean_pooling).cpu().detach().numpy()
  hit_scores, hit_rows = embeddings_dataset.get_nearest_examples(
      "embeddings", query_vector, k=num_best_results
  )
  return (
      pd.DataFrame.from_dict(hit_rows)
      .assign(scores=hit_scores)
      .sort_values("scores", ascending=True)
  )

In [None]:
def print_search_results(df_to_print):
  """Print each result row as a small text report.

  Args:
    df_to_print: DataFrame with ``language``, ``review``, ``votes_up`` and
      ``scores`` columns; one report is printed per row, followed by a
      50-character separator line and a blank line.
  """
  separator = "=" * 50
  for _, hit in df_to_print.iterrows():
    report = [
        f"language: {hit.language}",
        f"review: {hit.review}",
        f"upvotes: {hit.votes_up}",
        f"score: {hit.scores}",
        separator,
        "",  # yields the trailing blank line once print() adds its newline
    ]
    print("\n".join(report))

In [None]:
# Example query through the FAISS-backed search(); prints the 5 returned reviews.
samples_df = search("Little Timmy ate an apple", 5, embeddings_dataset)
print_search_results(samples_df)

In [None]:
# Example query through the cosine-similarity alt_search(); prints the 5
# highest-scoring reviews (most_similar=True).
samples_df = alt_search("I'd rather eat shit than play this game", 5, embeddings_dataset, most_similar=True)
print_search_results(samples_df)

Data collection: downloading Steam reviews

In [None]:
import requests
import time

def get_reviews(appid, params=None):
  """Fetch one page of Steam store reviews for ``appid``.

  Args:
    appid: Steam application id, interpolated into the appreviews URL.
    params: query parameters for the request; defaults to ``{'json': 1}``.

  Returns:
    The decoded JSON body of the response.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    requests.Timeout: if no response arrives within the timeout.
  """
  if params is None:
    # Build the default per call instead of sharing one mutable dict.
    params = {'json': 1}
  url = f"https://store.steampowered.com/appreviews/{appid}"
  # User-Agent spelled like the other requests in this notebook; the timeout
  # keeps a stalled connection from hanging the cell indefinitely.
  response = requests.get(url, params=params, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
  response.raise_for_status()
  return response.json()

def get_n_reviews(appid, n=100):
  """Collect up to ``n`` English reviews for ``appid`` by paging with a cursor.

  Pages of at most 100 reviews are requested through ``get_reviews`` until
  ``n`` reviews have been asked for or the API stops returning new data.

  Args:
    appid: Steam application id.
    n: maximum number of reviews to request.

  Returns:
    A list of review dicts as returned by the API; may hold fewer than ``n``
    entries when the app has fewer reviews.
  """
  reviews = []
  cursor = '*'
  params = {
            'json' : 1,
            'filter' : 'all',
            'language' : 'english',
            'day_range' : 9223372036854775807,
            'review_type' : 'all',
            'purchase_type' : 'all'
            }

  while n > 0:
    params['cursor'] = cursor.encode()
    params['num_per_page'] = min(100, n)
    n -= 100

    response = get_reviews(appid, params)
    page = response.get('reviews', [])
    if not page:
      # Nothing more to fetch; stop instead of re-requesting empty pages.
      break
    reviews += page

    next_cursor = response.get('cursor')
    if not next_cursor or next_cursor == cursor:
      # No way to advance: a missing or repeated cursor would only
      # return the same page again.
      break
    cursor = next_cursor

    if n > 0:
      # Pause between consecutive requests only, not after the last one.
      time.sleep(1)

  return reviews

In [None]:
from bs4 import BeautifulSoup

#get a single app id from search
def get_app_id(game_name):
    """Return the app id of the first Steam store search hit for ``game_name``.

    Args:
        game_name: free-text search term.

    Returns:
        The ``data-ds-appid`` attribute of the first search result row.

    Raises:
        ValueError: if the search page contains no result row.
    """
    # Pass the term through `params` so spaces and characters such as '&'
    # are URL-encoded instead of corrupting the query string.
    response = requests.get(
        url='https://store.steampowered.com/search/',
        params={'term': game_name, 'category1': 998},
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=30,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    first_hit = soup.find(class_='search_result_row')
    if first_hit is None:
        # Fail with a clear message rather than "'NoneType' is not subscriptable".
        raise ValueError(f"No Steam search result found for {game_name!r}")
    return first_hit['data-ds-appid']

#get batch of app_ids
def get_n_appids(n=100, filter_by='topsellers'):
    """Collect up to ``n`` app ids from the Steam store search listing.

    Args:
        n: maximum number of app ids to return.
        filter_by: value of the search page's ``filter`` query parameter.

    Returns:
        A list of at most ``n`` ``data-ds-appid`` values, in listing order.
    """
    appids = []
    page = 0

    # Loop on what has actually been collected rather than assuming a fixed
    # number of rows per page.
    while len(appids) < n:
        page += 1
        response = requests.get(
            url='https://store.steampowered.com/search/',
            params={'category1': 998, 'filter': filter_by, 'page': page},
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=30,
        )
        soup = BeautifulSoup(response.text, 'html.parser')
        rows = soup.find_all(class_='search_result_row')
        if not rows:
            # Listing exhausted: further pages would be empty as well.
            break
        appids.extend(row['data-ds-appid'] for row in rows)

    return appids[:n]

In [None]:
# Download reviews for one game and persist them for the analysis cells.
appid = get_app_id("Outer Wilds")
reviews = get_n_reviews(appid, 11000)
# Keep only the fields used later.
df = pd.DataFrame(reviews)[['language', 'review', 'votes_up']]
# "text" is the string that gets embedded: language, review body and vote
# count joined by newlines.
df["text"] = df["language"] + "\n" + df["review"] + "\n" + df["votes_up"].astype(str)
# NOTE(review): this writes the file that the loading cell near the top reads,
# and it rebinds `df`; on a fresh runtime this cell has to run before that one.
df.to_csv("/content/drive/MyDrive/GMM3/data.csv", index=False)


UTILS

In [None]:
def max_token_len(ds):
  """Return the largest token count among the entries of ``ds["text"]``.

  Each text is encoded with the notebook-level ``tokenizer``; the result is
  0 when there are no texts.

  Args:
    ds: mapping/dataset whose "text" entry is an iterable of strings.

  Returns:
    The maximum encoded length as an int.
  """
  # Stream the lengths through max() instead of collecting them in a list
  # that is never read.
  return max((len(tokenizer.encode(sent)) for sent in ds["text"]), default=0)

# Longest tokenized text in the dataset (the tokenizer warns when a sequence
# exceeds the model's maximum length).
max_token_len(ds)

Token indices sequence length is longer than the specified maximum sequence length for this model (1908 > 512). Running this sequence through the model will result in indexing errors


4018

In [None]:
def sentence_embedding1(sentence, model):
  """Embed text as the masked mean of the last four hidden layers.

  The input is tokenized with the notebook-level ``tokenizer``, padded or
  truncated to 512 tokens, and run through ``model``. The last four hidden
  layers are concatenated along the feature axis and averaged over the real
  (non-padding) tokens.

  Args:
    sentence: text (or list of texts) to embed.
    model: transformer model that can return its hidden states.

  Returns:
    Tensor with one row per input and four times the hidden size as columns.
  """
  encoded_input = tokenizer(sentence, max_length= 512, padding= 'max_length', truncation=True, add_special_tokens = True, return_attention_mask = True, return_tensors="pt")
  # Run on whatever device the model lives on to avoid a CPU/GPU mismatch.
  model_device = next(model.parameters()).device
  encoded_input = {k: v.to(model_device) for k, v in encoded_input.items()}

  # Keyword arguments: the second positional slot is not attention_mask for
  # every architecture. Hidden states are requested explicitly and read by
  # name rather than by position in the output.
  model_output = model(
      input_ids=encoded_input['input_ids'],
      attention_mask=encoded_input['attention_mask'],
      output_hidden_states=True,
  )
  hidden_states = model_output.hidden_states

  # Concatenate the last four layers along the feature dimension.
  concat_embedding = torch.cat(tuple(hidden_states[-4:]), dim=2)
  # (batch, seq) -> (batch, seq, 1) so the mask broadcasts over features.
  mask = encoded_input['attention_mask'].unsqueeze(2)
  masked = torch.mul(concat_embedding, mask)  # zero out padding positions
  # Divide by the number of real tokens, not by the padded length, so the
  # result does not shrink with the amount of padding.
  token_counts = mask.sum(dim=1).clamp(min=1)
  sent_embedding = masked.sum(dim=1) / token_counts

  return sent_embedding