# Import Libraries

In [7]:
import requests
import json
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics.pairwise import cosine_distances

from sentence_transformers import SentenceTransformer

# Data Loading

## YGOPRODeck API (one-time download; no need to rerun)

In [8]:
# One-time download of the full card list from the YGOPRODeck API, saved to
# yugioh_cards.json. Kept commented out: the notebook reads the saved file
# instead of calling the API again.
# url = "https://db.ygoprodeck.com/api/v7/cardinfo.php"

# Response = requests.get(url)

# if Response.status_code == 200:
#     Cards = Response.json().get("data", [])

#     with open("yugioh_cards.json", "w") as f:
#         json.dump(Cards, f, indent=4)
    
#     print(f"Saved {len(Cards)} cards locally.")
# else:
#     print("Error fetching data:", Response.status_code)

In [9]:
# Load the locally saved card dump into a DataFrame.
cards_json_path = 'yugioh_cards.json'
df = pd.read_json(cards_json_path)

# Data Exploration

In [10]:
# Preview the first five rows to see which columns the dump provides.
df.head(n=5)

Unnamed: 0,id,name,type,humanReadableCardType,frameType,desc,race,archetype,ygoprodeck_url,card_sets,...,atk,def,level,attribute,linkval,linkmarkers,pend_desc,monster_desc,scale,banlist_info
0,34541863,"""A"" Cell Breeding Device",Spell Card,Continuous Spell,spell,"During each of your Standby Phases, put 1 A-Co...",Continuous,Alien,https://ygoprodeck.com/card/a-cell-breeding-de...,"[{'set_name': 'Force of the Breaker', 'set_cod...",...,,,,,,,,,,
1,64163367,"""A"" Cell Incubator",Spell Card,Continuous Spell,spell,Each time an A-Counter(s) is removed from play...,Continuous,Alien,https://ygoprodeck.com/card/a-cell-incubator-8856,"[{'set_name': 'Gladiator's Assault', 'set_code...",...,,,,,,,,,,
2,91231901,"""A"" Cell Recombination Device",Spell Card,Quick-Play Spell,spell,Target 1 face-up monster on the field; send 1 ...,Quick-Play,Alien,https://ygoprodeck.com/card/a-cell-recombinati...,"[{'set_name': 'Invasion: Vengeance', 'set_code...",...,,,,,,,,,,
3,73262676,"""A"" Cell Scatter Burst",Spell Card,Quick-Play Spell,spell,"Select 1 face-up ""Alien"" monster you control. ...",Quick-Play,Alien,https://ygoprodeck.com/card/a-cell-scatter-bur...,"[{'set_name': 'Strike of Neos', 'set_code': 'S...",...,,,,,,,,,,
4,98319530,"""Infernoble Arms - Almace""",Spell Card,Equip Spell,spell,While this card is equipped to a monster: You ...,Equip,Infernoble Arms,https://ygoprodeck.com/card/infernoble-arms-al...,"[{'set_name': 'Duelist Nexus', 'set_code': 'DU...",...,,,,,,,,,,


In [11]:
# Summarize column dtypes and non-null counts to spot sparsely populated fields.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13567 entries, 0 to 13566
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     13567 non-null  int64  
 1   name                   13567 non-null  object 
 2   type                   13567 non-null  object 
 3   humanReadableCardType  13567 non-null  object 
 4   frameType              13567 non-null  object 
 5   desc                   13567 non-null  object 
 6   race                   13567 non-null  object 
 7   archetype              8043 non-null   object 
 8   ygoprodeck_url         13567 non-null  object 
 9   card_sets              13097 non-null  object 
 10  card_images            13567 non-null  object 
 11  card_prices            13567 non-null  object 
 12  typeline               8712 non-null   object 
 13  atk                    8712 non-null   float64
 14  def                    8286 non-null   float64
 15  le

In [12]:
# Narrow the frame to the card id, name, frame type, effect text and race.
selected_columns = ['id', 'name', 'frameType', 'desc', 'race']
df2 = df[selected_columns]

In [13]:
# Check dtypes and non-null counts of the five selected columns.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13567 entries, 0 to 13566
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         13567 non-null  int64 
 1   name       13567 non-null  object
 2   frameType  13567 non-null  object
 3   desc       13567 non-null  object
 4   race       13567 non-null  object
dtypes: int64(1), object(4)
memory usage: 530.1+ KB


In [14]:
# Preview the selected columns.
df2.head()

Unnamed: 0,id,name,frameType,desc,race
0,34541863,"""A"" Cell Breeding Device",spell,"During each of your Standby Phases, put 1 A-Co...",Continuous
1,64163367,"""A"" Cell Incubator",spell,Each time an A-Counter(s) is removed from play...,Continuous
2,91231901,"""A"" Cell Recombination Device",spell,Target 1 face-up monster on the field; send 1 ...,Quick-Play
3,73262676,"""A"" Cell Scatter Burst",spell,"Select 1 face-up ""Alien"" monster you control. ...",Quick-Play
4,98319530,"""Infernoble Arms - Almace""",spell,While this card is equipped to a monster: You ...,Equip


In [15]:
# Reorder so the long `desc` text is the last column. Selecting by name
# (instead of positional indices) keeps this cell idempotent: re-running it
# no longer swaps `race` and `desc` back.
df2 = df2[['id', 'name', 'frameType', 'race', 'desc']]

In [16]:
# Confirm the new column order.
df2.head()

Unnamed: 0,id,name,frameType,race,desc
0,34541863,"""A"" Cell Breeding Device",spell,Continuous,"During each of your Standby Phases, put 1 A-Co..."
1,64163367,"""A"" Cell Incubator",spell,Continuous,Each time an A-Counter(s) is removed from play...
2,91231901,"""A"" Cell Recombination Device",spell,Quick-Play,Target 1 face-up monster on the field; send 1 ...
3,73262676,"""A"" Cell Scatter Burst",spell,Quick-Play,"Select 1 face-up ""Alien"" monster you control. ..."
4,98319530,"""Infernoble Arms - Almace""",spell,Equip,While this card is equipped to a monster: You ...


In [17]:
# Count missing values in each column.
df2.isna().sum(axis=0)

id           0
name         0
frameType    0
race         0
desc         0
dtype: int64

In [18]:
# Drop exact duplicate rows and keep the result: `drop_duplicates()` returns
# a new frame, so calling it without assignment left `df2` unchanged.
# The index is reset so row labels stay contiguous (0..n-1) afterwards.
df2 = df2.drop_duplicates().reset_index(drop=True)
df2

Unnamed: 0,id,name,frameType,race,desc
0,34541863,"""A"" Cell Breeding Device",spell,Continuous,"During each of your Standby Phases, put 1 A-Co..."
1,64163367,"""A"" Cell Incubator",spell,Continuous,Each time an A-Counter(s) is removed from play...
2,91231901,"""A"" Cell Recombination Device",spell,Quick-Play,Target 1 face-up monster on the field; send 1 ...
3,73262676,"""A"" Cell Scatter Burst",spell,Quick-Play,"Select 1 face-up ""Alien"" monster you control. ..."
4,98319530,"""Infernoble Arms - Almace""",spell,Equip,While this card is equipped to a monster: You ...
...,...,...,...,...,...
13562,2648201,ZW - Sleipnir Mail,effect,Beast,"You can target 1 ""Utopia"" monster you control;..."
13563,95886782,ZW - Sylphid Wing,effect,Beast,"You can only control 1 ""ZW - Sylphid Wing"". Yo..."
13564,81471108,ZW - Tornado Bringer,effect,Dragon,"You can target 1 ""Utopia"" monster you control;..."
13565,18865703,ZW - Ultimate Shield,effect,Aqua,When this card is Normal or Special Summoned: ...


# Feature Engineering

In [19]:
# function to clean text
def CleanText(text):
    """Normalize a card description into lowercase, space-separated words.

    Steps:
      1. Lowercase the text.
      2. Replace every run of non-word characters (punctuation, symbols,
         whitespace) with a single space. Underscores are word characters
         and are kept.
      3. Collapse whitespace runs and trim leading/trailing spaces.

    Returns the cleaned string.
    """
    lowered = text.lower()
    separated = re.sub(r'\W+', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', separated)
    return collapsed.strip()

In [20]:
# Add a normalized copy of each description, then compare raw vs. cleaned text.
df2['clean_desc'] = df2['desc'].map(CleanText)
df2.head()[['desc', 'clean_desc']]

Unnamed: 0,desc,clean_desc
0,"During each of your Standby Phases, put 1 A-Co...",during each of your standby phases put 1 a cou...
1,Each time an A-Counter(s) is removed from play...,each time an a counter s is removed from play ...
2,Target 1 face-up monster on the field; send 1 ...,target 1 face up monster on the field send 1 a...
3,"Select 1 face-up ""Alien"" monster you control. ...",select 1 face up alien monster you control des...
4,While this card is equipped to a monster: You ...,while this card is equipped to a monster you c...


## Tokenizing Descriptions by Splitting on Whitespace

In [21]:
# Split each cleaned description on whitespace into a list of word tokens.
df2['tokens'] = df2['clean_desc'].map(lambda cleaned: cleaned.split())

In [22]:
# Show the token lists next to the cleaned text they were built from.
df2.head()[['tokens', 'clean_desc']]

Unnamed: 0,tokens,clean_desc
0,"[during, each, of, your, standby, phases, put,...",during each of your standby phases put 1 a cou...
1,"[each, time, an, a, counter, s, is, removed, f...",each time an a counter s is removed from play ...
2,"[target, 1, face, up, monster, on, the, field,...",target 1 face up monster on the field send 1 a...
3,"[select, 1, face, up, alien, monster, you, con...",select 1 face up alien monster you control des...
4,"[while, this, card, is, equipped, to, a, monst...",while this card is equipped to a monster you c...


## Creating Vectors

In [23]:
# Build a TF-IDF representation of the cleaned descriptions.
# max_features=5000 caps the vocabulary at the 5000 terms with the highest
# term frequency across the corpus.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights and vectorize every description.
tfidf_matrix = tfidf_vectorizer.fit_transform(df2['clean_desc'])

# Densify into a labelled DataFrame (one column per vocabulary term) for inspection.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)


In [24]:
# Preview the TF-IDF weights for the first five descriptions.
tfidf_df.head()

Unnamed: 0,000,003,004,005,006,009,05,06,07,0x00,...,zorc,zoroa,zs,zubaba,zubababancho,zuijin,zushin,zuttomozaurus,zw,zweite
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Creation

## SBERT

In [25]:
# Load the pretrained SBERT model used to embed the card text.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the raw descriptions. `encode` is documented to take a string or a
# list of strings, so pass a plain list rather than a pandas Series; this
# avoids relying on the Series index labels matching positions 0..n-1.
sentence_embeddings = sbert_model.encode(df2['desc'].tolist(), show_progress_bar=True)

# Make sure the embeddings are a NumPy array for the clustering steps.
sentence_embeddings = np.asarray(sentence_embeddings)

# Sanity check: print (number of descriptions, embedding dimension).
print(sentence_embeddings.shape)

Batches:   0%|          | 0/424 [00:00<?, ?it/s]

(13567, 384)


## DBSCAN on Cosine Distance

In [26]:
# Pairwise cosine distances between all card embeddings.
cosine_dist_matrix = cosine_distances(sentence_embeddings)

# Density-based clustering on the precomputed distances.
# eps and min_samples are starting values and may need tuning.
dbscan = DBSCAN(eps=0.3, min_samples=5, metric="precomputed")
dbscan.fit(cosine_dist_matrix)
cluster_labels = dbscan.labels_

# Store each card's cluster id alongside its text.
df2["cluster"] = cluster_labels

# The label -1 marks noise points, so it is not counted as a cluster.
num_clusters = len(set(cluster_labels) - {-1})

print(f"Number of clusters found: {num_clusters}")
print(df2["cluster"].value_counts())

Number of clusters found: 8
cluster
 0    11914
-1     1598
 2       13
 3       10
 4        9
 7        7
 6        6
 1        5
 5        5
Name: count, dtype: int64


## K-Means Clustering

In [27]:
# Number of K-Means clusters; 10 is an initial guess that can be adjusted.
num_clusters = 10

# Fit K-Means directly on the SBERT embeddings (fixed seed for repeatable runs).
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(sentence_embeddings)
cluster_labels = kmeans.labels_

# Store the K-Means assignment in the "cluster" column (overwrites any previous labels).
df2["cluster"] = cluster_labels

# Print how many cards fall into each cluster.
print(df2["cluster"].value_counts())


cluster
2    2551
5    2079
9    1929
6    1780
0    1407
8    1390
7     748
3     633
4     568
1     482
Name: count, dtype: int64
