In [1]:
import requests
from bs4 import BeautifulSoup
import re 
import pandas as pd
from tqdm import tqdm
from IPython.display import display
import numpy as np
from difflib import SequenceMatcher
from scipy.spatial.distance import cosine

# Filter out old games

In [2]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [3]:
new_games = pd.read_excel('Game_list.xlsx', engine= 'openpyxl')

In [4]:
df_no_ind = new_games.drop(columns = ["Unnamed: 0"]).sort_values(by = ['Developer'])
df_no_ind = df_no_ind[df_no_ind['Title'].notna()].reset_index(drop=True)

In [5]:
old_games = pd.read_csv('old games.csv')

In [6]:
old_game_list = np.array(old_games.iloc[:, 2].dropna())

In [7]:
old_game_embedding = sbert_model.encode(old_game_list)


In [8]:
def preprocess_string(tstring):
    tstring = tstring.lower()
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
 
    # Removing punctuations in string
    # Using loop + punctuation string
    for ele in tstring:
        if ele in punc:
            tstring = tstring.replace(ele, "")
    
    return tstring

def build_bigram(doc):
    bigrams = []
    for title in doc:
        title = preprocess_string(title).split(" ")

        for i in range(len(title)-1):
            bigram = title[i] + " " + title[i+1]
            bigrams.append(bigram)
    return bigrams
def cosine_similarity(a, b): 
    return cosine(a, b)
def sbert_exist(a, doc):
    similarity_scores = [cosine_similarity(a,x) for x in doc]
    #print(max(similarity_scores))
    return max(similarity_scores) >= 0.8

In [51]:
sbert_exist(sbert_model.encode(["Asagi"])[0], vectorized_old_game)

[0.4654887914657593, 0.5161513090133667, 0.37991076707839966, 0.21226614713668823, 0.28063464164733887, 0.17509132623672485, 0.4365198016166687, 0.2557142376899719, 0.3489704728126526, 0.5908237993717194, 0.16487818956375122, 0.28953051567077637, 0.21751701831817627, 0.2441953420639038, 0.19770312309265137, 0.21794140338897705, 0.5784018635749817, 0.5053529739379883, 0.5561661422252655, 0.5618811845779419, 0.493968665599823, 0.640789806842804, 0.6325919032096863, 0.6392700374126434, 0.6840862929821014, 0.36838042736053467, 0.4180886149406433, 0.4332786202430725, 0.4698740243911743, 0.5112892091274261, 0.41213661432266235, 0.3639005422592163, 0.5832080245018005, 0.8726400285959244, 0.19854885339736938, 0.23618966341018677, 0.23808139562606812, 0.14753925800323486, 0.4342358112335205, 0.42366981506347656, 0.4271507263183594, 0.6487479209899902, 0.4223390817642212, 0.26363319158554077, 0.33380424976348877, 0.45104116201400757, 0.41792070865631104, 0.5084426999092102, 0.3688521981239319, 0

True

SBert Embedding + cosine similarity 

In [12]:
preprocessed_old_game_list = [preprocess_string(title) for title in old_game_list]
vectorized_old_game = [sbert_model.encode([title])[0] for title in preprocessed_old_game_list]

In [72]:
remove_list = []
for index, row in tqdm(df_no_ind.iterrows()):
    #print(index)
    title = row['Title']
    #print(title)
    preprocessed_title = preprocess_string(title)
    vectorized = sbert_model.encode([preprocessed_title])[0] 
    if sbert_exist(vectorized, vectorized_old_game):
        remove_list.append(index)

1011it [00:25, 39.55it/s]


In [73]:
len(remove_list)

679

In [74]:
dropped_similar = df_no_ind.drop(df_no_ind.index[remove_list])

In [76]:
dropped_similar.to_excel('SNewGame_list.xlsx')

In [9]:
old_game_bigram = build_bigram(old_game_list)

In [13]:
def similar(a, b):
    a = preprocess_string(a)
    b = preprocess_string(b)
    score = SequenceMatcher(None, a, b).ratio()
    
    if a in b or b in a:
        score = 1.0
    # a_list = a.split(" ")
    # b_list = b.split(" ")
    # if len(list(set(a_list) & set(b_list))) != 0:
    #     return 1.0
    return score
def in_bigram(a, old_bigram):
    for biword in old_bigram:
        if SequenceMatcher(None, a, biword).ratio() >= 0.6:
            return True
def exist(a, doc, old_game_bigram):
    a_bigram = build_bigram([a])
    for biword in a_bigram:
        if in_bigram(biword, old_game_bigram):
            return True
    for word in doc:
        a = preprocess_string(a)
        b = preprocess_string(word)
        if a in b or b in a:
            return True
        score = SequenceMatcher(None, a, b).ratio()
        if score >= 0.6:
            return True
    preprocessed_title = preprocess_string(a)
    vectorized = sbert_model.encode([preprocessed_title])[0] 
    similarity_scores = [cosine(vectorized,x) for x in vectorized_old_game]
    return max(similarity_scores) >= 0.81

In [14]:
remove_list = []
for index, row in tqdm(df_no_ind.iterrows()):
    title = row['Title']
    if exist(title, old_game_list, old_game_bigram):
        remove_list.append(index)
"length of remove_list:" + str(len(remove_list))

1011it [00:24, 41.27it/s]


'length of remove_list:882'

In [15]:
dropped_similar = df_no_ind.drop(df_no_ind.index[remove_list])
dropped_similar.to_excel('SNewGame_list.xlsx')

In [16]:
dropped_similar

Unnamed: 0,Title,Original Title,Developer,URL
6,"UuultraC, ウルC",ウウウルトラC,ADELTA,https://www.ryuugames.com/eng-uuultrac-uncenso...
8,Onigokko!,鬼ごっこ！,ALcot,https://www.ryuugames.com/eng-onigokko-free-do...
21,Daiteikoku,大帝国,Alice Soft,https://www.ryuugames.com/daiteikoku-free-down...
22,Rance 4.1,ランス４．１　～お薬工場を救え！～,Alice Soft,https://www.ryuugames.com/eng-rance-4-1-save-t...
23,Rance 4.2 ~Angelgumi~,ランス４．２　～エンジェル組～,Alice Soft,https://www.ryuugames.com/eng-rance-4-2-angelg...
...,...,...,...,...
971,Rance Quest Magnum,ランス・クエスト マグナム,,https://www.ryuugames.com/eng-rance-quest-magn...
981,Hatoful Boyfriend [ENG],はーとふる彼氏〜希望の学園と白い翼〜,,https://www.ryuugames.com/eng-hatoful-kareshi-...
984,Death end re;Quest,Death end re;Quest,,https://www.ryuugames.com/eng-death-end-reques...
997,Aoishiro,アオイシロ,,https://www.ryuugames.com/eng-aoishiro-free-do...


In [None]:
similar('Kyonyuu Fantasy Gaiden / Funbag Fantasy : Sideboob Story', 'Funbag Fantasy')

0.4