In [1]:
import numpy as np
import pandas as pd
import pysubgroup as ps
from mlxtend.frequent_patterns import apriori, association_rules

In [135]:
df_original = pd.read_csv('./Data/tournaments.csv')
df = df_original.copy()

#### Tratando dataframe

In [3]:
df.replace(np.nan, 0, inplace=True)
df.tail()

Unnamed: 0,id_card,name_card,amount_card,price_card,energy_type_card,type_card,combo_type_id,combo_type_name,id_player,name_player,...,region_tournament,country_tournament,year_tournament,month_tournament,day_tournament,valid_rotation_at_tournament,rotation_name,year_begin,month_begin,day_begin
114286,GRI130a,Rescue Stretcher,2,0.72,0,Trainer,47.0,Zoroark Greninja,1900,Jin Woo Lee,...,AS-OC,Korea,2018,12,22,0,standard_2019,2018,8,28
114287,GRI119,Aqua Patch,2,0.27,0,Trainer,47.0,Zoroark Greninja,1900,Jin Woo Lee,...,AS-OC,Korea,2018,12,22,0,standard_2019,2018,8,28
114288,FLF092,Pal Pad,1,0.68,0,Trainer,47.0,Zoroark Greninja,1900,Jin Woo Lee,...,AS-OC,Korea,2018,12,22,0,standard_2019,2018,8,28
114289,GRI121,Choice Band,2,0.2,0,Trainer,47.0,Zoroark Greninja,1900,Jin Woo Lee,...,AS-OC,Korea,2018,12,22,0,standard_2019,2018,8,28
114290,GRI120,Brooklet Hill,2,0.32,0,Trainer,47.0,Zoroark Greninja,1900,Jin Woo Lee,...,AS-OC,Korea,2018,12,22,0,standard_2019,2018,8,28


#### Tentaremos encontrar subgrupos frequentes de cartas que aparecem em torneios diferentes de batalha de cartas pokemon no ano de 2019

- Definindo todas as regioes do dataset

In [4]:
df = df[df['region_tournament'] != 0]

# eliminando cartas do tipo treinador
df = df[df['type_card'] != 'Trainer']
regions = df['region_tournament'].unique()

regions

array(['SA', 'EU', 'AS-OC', 'JP'], dtype=object)

- Definindo treinadores com suas cartas para cada regiao e realizando apriori para descobrir padroes de cartas utilizadas por jogadores diferentes

In [5]:
# Agrupando cartas de cada jogador no dataset
# Dessa forma, separamos os dados no formato das transações utilizadas no Apriori
def agrupa_jogador_cartas(df):
    dict_jogador_cartas = {}
    for _, row in df.iterrows():
        if row['name_player'] not in dict_jogador_cartas:
            dict_jogador_cartas[row['name_player']] = []
        dict_jogador_cartas[row['name_player']].append(row['name_card'])
    return len(dict_jogador_cartas), dict_jogador_cartas

# Fazendo filtro por região somente no ano de 2019
info_por_regiao = {}
for region in regions:
    print(f"region: {region}")
    df_region = df[(df['region_tournament'] == region) & (df['year_tournament'] == 2019)]
    jogadores = df_region['name_player'].unique()
    cartas = df_region['name_card'].unique()

    quantidade_jogadores, dict_jogador_cartas = agrupa_jogador_cartas(df_region)

    # Cria um dataframe que as colunas sao todas as cartas existentes no dataset
    df_jogadores_cartas = pd.DataFrame(columns=cartas)

    for jogador in jogadores:
        cartas_jogador = dict_jogador_cartas[jogador]
        linha = []
        # Construo matriz de incidencia -> True se o jogador possui a carta, False caso contrario
        for carta in cartas:
            if carta in cartas_jogador:
                linha.append(True)
            else:
                linha.append(False)
        df_jogadores_cartas.loc[len(df_jogadores_cartas)] = linha

    # Apriori

    # Suporte - O quanto a carta aparece no total de cartas usadas
    frequent_itemsets = apriori(df_jogadores_cartas, min_support=0.22, use_colnames=True) # use_colnames -> nome das cartas ao inves de indices
    frequent_itemsets.sort_values(by='support', ascending=False, inplace=True)

    # Regras: Se compro A (antecedente) -> compro B (consequente) com confianca de X%
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4) # confiança das regras geradas -> 40%
    # Lift -> Se a regra é relevante ou não (se é maior que 1, é relevante)
    # Ignoro algumas colunas para facilitar a visualização
    rules = rules.sort_values(by='lift', ascending=False).drop(['antecedent support', 'consequent support', 'leverage', 'conviction', 'zhangs_metric'], axis=1)
    
    info_por_regiao[region] = {'itemsets': frequent_itemsets[:3], 'rules': rules[:3]}

    print(f"Total de jogadores: {quantidade_jogadores}")
    print("Itemsets frequentes")
    print(frequent_itemsets)
    print("Regras")
    print(rules)
    print("\n\n")

region: SA
Total de jogadores: 73
Itemsets frequentes
    support                                    itemsets
0  0.589041                                   (Jirachi)
1  0.493151                                (Dedenne-GX)
2  0.410959              (Mega Lopunny & Jigglypuff-GX)
6  0.342466                       (Jirachi, Dedenne-GX)
3  0.301370               (Arceus & Dialga & Palkia-GX)
7  0.287671  (Dedenne-GX, Mega Lopunny & Jigglypuff-GX)
4  0.246575                                 (Keldeo-GX)
5  0.246575                                 (Cryogonal)
8  0.246575    (Arceus & Dialga & Palkia-GX, Cryogonal)
Regras
                      antecedents                     consequents   support  \
4   (Arceus & Dialga & Palkia-GX)                     (Cryogonal)  0.246575   
5                     (Cryogonal)   (Arceus & Dialga & Palkia-GX)  0.246575   
3  (Mega Lopunny & Jigglypuff-GX)                    (Dedenne-GX)  0.287671   
2                    (Dedenne-GX)  (Mega Lopunny & Jigglypuff-G

### ANALISAR --> PQ AS REGRAS SÃO REPETIDAS?
EX: ANTECEDENTE A - CONSEQUENTE B
ANTECEDENTE B - CONSEQUENTE A

TEM COMO RETIRAR ISSO?

- Analisando os resultados mais relevantes de cada região:

In [6]:
for region in regions:
    print(f"Região: {region}")
    print("Itemsets frequentes")
    print(info_por_regiao[region]['itemsets'])
    print("Regras")
    print(info_por_regiao[region]['rules'])
    print("\n\n")

Região: SA
Itemsets frequentes
    support                        itemsets
0  0.589041                       (Jirachi)
1  0.493151                    (Dedenne-GX)
2  0.410959  (Mega Lopunny & Jigglypuff-GX)
Regras
                      antecedents                    consequents   support  \
4   (Arceus & Dialga & Palkia-GX)                    (Cryogonal)  0.246575   
5                     (Cryogonal)  (Arceus & Dialga & Palkia-GX)  0.246575   
3  (Mega Lopunny & Jigglypuff-GX)                   (Dedenne-GX)  0.287671   

   confidence      lift  
4    0.818182  3.318182  
5    1.000000  3.318182  
3    0.700000  1.419444  



Região: EU
Itemsets frequentes
    support       itemsets
5  0.650602  (Tapu Koko ♢)
0  0.590361      (Jirachi)
7  0.590361    (Marshadow)
Regras
                     antecedents                             consequents  \
337  (Tapu Lele-GX, Tapu Koko ♢)                 (Marshadow, Zeraora-GX)   
334      (Marshadow, Zeraora-GX)             (Tapu Lele-GX, Tapu Kok

In [9]:
print("Media de preço das cartas em todos os torneios em todos os anos" , df['price_card'].mean())
print("Media de preço das cartas em todos os torneios no ano de 2019" , df[(df['year_tournament'] == 2019)]['price_card'].mean())
print("Preço da carta Jirachi em 2019" , df[(df['year_tournament'] == 2019) & (df['name_card'] == 'Jirachi')]['price_card'].mean())

Media de preço das cartas em todos os torneios em todos os anos 2.674447953922616
Media de preço das cartas em todos os torneios no ano de 2019 4.295818992989164
Preço da carta Jirachi em 2019 6.5


In [10]:
print("Media de quantidade máxima de cartas iguais em todos os torneios em todos os anos" , df['amount_card'].mean())
print("Media de preço das cartas em todos os torneios no ano de 2019" , df[(df['year_tournament'] == 2019)]['amount_card'].mean())
print("Preço da carta Jirachi em 2019" , df[(df['year_tournament'] == 2019) & (df['name_card'] == 'Jirachi')]['amount_card'].mean())

Media de quantidade máxima de cartas iguais em todos os torneios em todos os anos 1.8847600910399926
Media de preço das cartas em todos os torneios no ano de 2019 1.751434034416826
Preço da carta Jirachi em 2019 3.1601941747572817


### SA
Jirachi, Dedenne-GX, Mega Lopunny & Jigglypuff-GX
Regras Arceus & Dialga & Palkia-GX e Cryogonal

### EU
Tapu Koko ♢, Jirachi, Marshdown
Regras envolvendo Tapu Lele-GX, Tapu Koko ♢, Marshdown e Zeraora-GX

### AS-OC
Jirachi, Tapu Lele-GX, Marshdown
Destaque para regras enolvendo Tapu Koko ♢, Jirachi, Zapdos e Absol

### JP
Jirachi, Dedenne-GX, Marshdow (destaque para regras envolvendo essa carta)

Percebe-se que a carta Jirachi foi almplamente utilizadas em torneios de 2019 em diferentes regiões ao redor do mundo. Jirachi é uma carta com um alto custo em comparação a média dos valores nos torneios (média de 2.95 contra custo de 6.5 da carta Jirachi em 2019), porém ao mesmo tempo, pode ser mais utilizada em um mesmo deck (3 vezes em comparação a média de 2.17 naquele ano)


--------------------------------------------------------------

# Analise para combos de cartas

In [11]:
df = df_original.copy()

df = df[(df['year_tournament'] == 2019)]


df.replace(np.nan, 0, inplace=True)
df.tail()

Unnamed: 0,id_card,name_card,amount_card,price_card,energy_type_card,type_card,combo_type_id,combo_type_name,id_player,name_player,...,region_tournament,country_tournament,year_tournament,month_tournament,day_tournament,valid_rotation_at_tournament,rotation_name,year_begin,month_begin,day_begin
113994,UNB170,Energy Spinner,1,0.13,0,Trainer,84.0,Zapdos,1889,Sumin Lim,...,AS-OC,Korea,2019,2,16,SUM-TEU,standard_2019,2018,8,28
113995,UPR122a,Escape Board,2,0.0,0,Trainer,84.0,Zapdos,1889,Sumin Lim,...,AS-OC,Korea,2019,2,16,SUM-TEU,standard_2019,2018,8,28
113996,GRI121a,Choice Band,2,19.74,0,Trainer,84.0,Zapdos,1889,Sumin Lim,...,AS-OC,Korea,2019,2,16,SUM-TEU,standard_2019,2018,8,28
113997,CES143,Shrine of Punishment,2,0.33,0,Trainer,84.0,Zapdos,1889,Sumin Lim,...,AS-OC,Korea,2019,2,16,SUM-TEU,standard_2019,2018,8,28
113998,LOT191,Thunder Mountain ♢,1,1.07,0,Trainer,84.0,Zapdos,1889,Sumin Lim,...,AS-OC,Korea,2019,2,16,SUM-TEU,standard_2019,2018,8,28


In [12]:
# If 'presence' column doesn't exist, create it for the BinaryTarget
df['presence'] = (df['name_card'] == 'Jirachi').astype(int)

In [13]:
len(df)

11446

In [14]:
# Define the target for subgroup discovery
target = ps.BinaryTarget('presence', 1)

# Define the search space
combo_selectors = [ps.EqualitySelector('combo_type_name', value) for value in df['combo_type_name'].unique()]
ranking_selector = [ps.EqualitySelector('ranking_player_tournament', value) for value in df['ranking_player_tournament'].unique()]
category_selectors = [ps.EqualitySelector('category_tournament', value) for value in df['category_tournament'].unique()]
region_selectors = [ps.EqualitySelector('region_tournament', value) for value in df['region_tournament'].unique()]
search_space = combo_selectors +  ranking_selector + category_selectors + region_selectors

In [15]:
search_space

[combo_type_name=='Reshiram & Charizard Fire Box',
 combo_type_name=='Mewtwo & Mew',
 combo_type_name=='ADP Keldeo',
 combo_type_name=='Blacephalon Naganadel',
 combo_type_name=='Florges Dolls',
 combo_type_name=='Garchomp & Giratina',
 combo_type_name=='Green's ADP',
 combo_type_name=='Blacephalon Pidgeotto',
 combo_type_name=='Pidgeotto Control',
 combo_type_name=='Gardevoir & Sylveon',
 combo_type_name=='Naganadel & Guzzlord',
 combo_type_name=='Charizard & Braixen',
 combo_type_name=='Malamar Giratina',
 combo_type_name=='Quagsire Silvally',
 combo_type_name=='Pikachu & Zekrom',
 combo_type_name=='Mewtwo & Mew Fire Box',
 combo_type_name=='Quagsire Naganadel',
 combo_type_name=='Charizard',
 combo_type_name=='Blacephalon Silvally',
 combo_type_name=='Zoroark Dewgong',
 combo_type_name=='Blacephalon',
 combo_type_name=='Pikachu & Zekrom Zapdos',
 combo_type_name=='Ultra Malamar ',
 combo_type_name=='Zapdos',
 combo_type_name=='Lucario & Melmetal Vileplume',
 combo_type_name=='Spirit

In [16]:

task = ps.SubgroupDiscoveryTask(df, target, search_space, result_set_size=15, depth=5, qf=ps.WRAccQF())

# Run the SD-Map algorithm
result = ps.SimpleDFS().execute(task)


In [17]:
# Display the results
for row in result.to_dataframe().itertuples():
    print(row)

Pandas(Index=0, quality=0.001806979198588307, subgroup=(combo_type_name=='Ultra Malamar '), size_sg=857, size_dataset=11446, positives_sg=40, positives_dataset=258, size_complement=10589, relative_size_sg=0.07487331818976062, relative_size_complement=0.9251266818102394, coverage_sg=0.15503875968992248, coverage_complement=0.8449612403100775, target_share_sg=0.046674445740956826, target_share_complement=0.02058740202096515, target_share_dataset=0.022540625546042286, lift=2.0706810308177976)
Pandas(Index=1, quality=0.0017953465801277201, subgroup=(combo_type_name=='Zapdos'), size_sg=996, size_dataset=11446, positives_sg=43, positives_dataset=258, size_complement=10450, relative_size_sg=0.0870172986196051, relative_size_complement=0.912982701380395, coverage_sg=0.16666666666666666, coverage_complement=0.8333333333333334, target_share_sg=0.04317269076305221, target_share_complement=0.02057416267942584, target_share_dataset=0.022540625546042286, lift=1.9153279785809907)
Pandas(Index=2, qual

### A ideia é achar subgrupos ligados a carta Jirachi - Mas pela qualidade não deu muito certo

-----------------------------------------------------------------

- Teste top cartas


In [150]:
df = df_original.copy()

df = df[(df['ranking_player_tournament'] != np.nan)]
#df.replace(np.nan, 0, inplace=True)
df.columns

Index(['id_card', 'name_card', 'amount_card', 'price_card', 'energy_type_card',
       'type_card', 'combo_type_id', 'combo_type_name', 'id_player',
       'name_player', 'country_player', 'all_time_score',
       'ranking_player_tournament', 'id_tournament', 'category_tournament',
       'name_tournament', 'region_tournament', 'country_tournament',
       'year_tournament', 'month_tournament', 'day_tournament',
       'valid_rotation_at_tournament', 'rotation_name', 'year_begin',
       'month_begin', 'day_begin'],
      dtype='object')

In [151]:
df.describe

<bound method NDFrame.describe of         id_card         name_card  amount_card  price_card energy_type_card  \
0        SSP272          Archeops            4        0.70        Colorless   
1        SIT138           Lugia V            3        6.20        Colorless   
2        SIT139       Lugia VSTAR            3        8.71        Colorless   
3        LOR143           Snorlax            3        0.93        Colorless   
4        SSP250        Lumineon V            2        1.90            Water   
...         ...               ...          ...         ...              ...   
114286  GRI130a  Rescue Stretcher            2        0.72              NaN   
114287   GRI119        Aqua Patch            2        0.27              NaN   
114288   FLF092           Pal Pad            1        0.68              NaN   
114289   GRI121       Choice Band            2        0.20              NaN   
114290   GRI120     Brooklet Hill            2        0.32              NaN   

       type_card 

In [152]:
df = df[['name_card', 'amount_card', 'price_card', 'energy_type_card', 'type_card', 'name_player', 'all_time_score', 'category_tournament' ,'ranking_player_tournament', 'combo_type_name', 'region_tournament', 'valid_rotation_at_tournament']]


In [153]:
df = df.dropna(subset=['name_card'
,'energy_type_card'
,'name_card'
,'combo_type_name'
,'category_tournament'
,'region_tournament'
,'all_time_score'
,'valid_rotation_at_tournament'])

df.describe

<bound method NDFrame.describe of            name_card  amount_card  price_card energy_type_card type_card  \
7462      Charmander            4        4.15             Fire   Pokémon   
7463      Charmeleon            1        0.08             Fire   Pokémon   
7464    Charizard ex            3       19.82         Darkness   Pokémon   
7465          Pidgey            2        0.09        Colorless   Pokémon   
7466      Pidgeot ex            2        6.51        Colorless   Pokémon   
...              ...          ...         ...              ...       ...   
113978   Tapu Koko ♢            1        0.91        Lightning   Pokémon   
113979         Eevee            1        8.44        Colorless   Pokémon   
113980    Jolteon-GX            1        3.11               GX   Pokémon   
113981         Absol            1        0.99         Darkness   Pokémon   
113982     Marshadow            1        1.02          Psychic   Pokémon   

            name_player  all_time_score category_tour

In [154]:
df['top_position'] = (df['ranking_player_tournament'] <= 15).astype(int)

# Define the target for subgroup discovery
target = ps.BinaryTarget('top_position', 1)

# Define selectors for relevant features

name_selectors = [ps.EqualitySelector('name_card', value) for value in df['name_card'].unique()]
energy_selectors = [ps.EqualitySelector('energy_type_card', value) for value in df['energy_type_card'].unique()]
card_name_selector = [ps.EqualitySelector('name_card', value) for value in df['name_card'].unique()]
combo_type_selector = [ps.EqualitySelector('combo_type_name', value) for value in df['combo_type_name'].unique()]
category_selector = [ps.EqualitySelector('category_tournament', value) for value in df['category_tournament'].unique()]
region_selector = [ps.EqualitySelector('region_tournament', value) for value in df['region_tournament'].unique()]
all_score_selector = [ps.EqualitySelector('all_time_score', value) for value in df['all_time_score'].unique()]
rotation_selector = [ps.EqualitySelector('valid_rotation_at_tournament', value) for value in df['valid_rotation_at_tournament'].unique()]

# Combine all selectors to form the search space
search_space =  energy_selectors + name_selectors +  card_name_selector + combo_type_selector + category_selector + region_selector + all_score_selector + rotation_selector

#search_space = ps.create_selectors(df, ignore=['top_position'])

# Create the Subgroup Discovery Task
task = ps.SubgroupDiscoveryTask(
    df, 
    target, 
    search_space, 
    result_set_size=10, 
    depth=5, 
    qf=ps.WRAccQF()  # Quality function to evaluate subgroups
)

# Run the SD-Map algorithm
result = ps.SimpleDFS().execute(task)

df_result = result.to_dataframe()

df_result



Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.045982,category_tournament=='others',3937,17340,1765,4262,13403,0.227047,0.772953,0.414125,0.585875,0.448311,0.186302,0.24579,1.823958
1,0.036366,region_tournament=='AS-OC',3407,17340,1468,4262,13933,0.196482,0.803518,0.344439,0.655561,0.430878,0.200531,0.24579,1.753031
2,0.026384,category_tournament=='others' AND region_tourn...,889,17340,676,4262,16451,0.051269,0.948731,0.158611,0.841389,0.760405,0.217981,0.24579,3.093717
3,0.019138,category_tournament=='others' AND region_tourn...,440,17340,440,4262,16900,0.025375,0.974625,0.103238,0.896762,1.0,0.226154,0.24579,4.068512
4,0.012103,region_tournament=='AS-OC' AND valid_rotation_...,1335,17340,538,4262,16005,0.07699,0.92301,0.126232,0.873768,0.402996,0.232677,0.24579,1.639595
5,0.011178,category_tournament=='others' AND valid_rotati...,257,17340,257,4262,17083,0.014821,0.985179,0.0603,0.9397,1.0,0.234444,0.24579,4.068512
6,0.011067,category_tournament=='others' AND valid_rotati...,749,17340,376,4262,16591,0.043195,0.956805,0.088221,0.911779,0.502003,0.234223,0.24579,2.042404
7,0.009619,region_tournament=='SA',4346,17340,1235,4262,12994,0.250634,0.749366,0.28977,0.71023,0.284169,0.232954,0.24579,1.156147
8,0.009587,category_tournament=='others' AND energy_type_...,768,17340,355,4262,16572,0.044291,0.955709,0.083294,0.916706,0.46224,0.235759,0.24579,1.880627
9,0.008743,category_tournament=='others' AND region_tourn...,201,17340,201,4262,17139,0.011592,0.988408,0.047161,0.952839,1.0,0.236945,0.24579,4.068512


- Como possível ver, muita redundancia nos subgrupos do beam search

In [155]:
# Display the results
for row in result.to_dataframe().itertuples():
    print(row.subgroup)

category_tournament=='others'
region_tournament=='AS-OC'
category_tournament=='others' AND region_tournament=='AS-OC'
category_tournament=='others' AND region_tournament=='SA'
region_tournament=='AS-OC' AND valid_rotation_at_tournament=='SSH-SIT'
category_tournament=='others' AND valid_rotation_at_tournament=='UPR-CEC'
category_tournament=='others' AND valid_rotation_at_tournament=='BST-SVI'
region_tournament=='SA'
category_tournament=='others' AND energy_type_card=='Psychic'
category_tournament=='others' AND region_tournament=='SA' AND valid_rotation_at_tournament=='UPR-CEC'


# PRECISA ANALISAR RESULTADO

Tentando rodar com DFS pra ver se aumenta confiança

In [13]:
df['top_position'] = (df['ranking_player_tournament'] <= 3).astype(int)

# Define the target for subgroup discovery
target = ps.BinaryTarget('top_position', 1)

# Define selectors for relevant features
namep_selectors = [ps.EqualitySelector('name_player', value) for value in df['name_player'].unique()]
energy_selectors = [ps.EqualitySelector('energy_type_card', value) for value in df['energy_type_card'].unique()]
type_selectors = [ps.EqualitySelector('type_card', value) for value in df['type_card'].unique()]
name_selectors = [ps.EqualitySelector('name_card', value) for value in df['name_card'].unique()]

# Combine all selectors to form the search space
search_space = namep_selectors + energy_selectors + name_selectors + type_selectors

search_space = ps.create_selectors(df, ignore=['top_position'])

# Create the Subgroup Discovery Task
task = ps.SubgroupDiscoveryTask(
    df, 
    target, 
    search_space, 
    result_set_size=10, 
    depth=6, 
    qf=ps.WRAccQF()  # Quality function to evaluate subgroups
)

# Run the SD-Map algorithm
result = ps.SimpleDFS().execute(task)

df_result = result.to_dataframe()

df_result

# Display the results
#for row in result.to_dataframe().itertuples():
    #print(row.subgroup)

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.00347,energy_type_card=='GX',3140,42447,376,3092,39307,0.073975,0.926025,0.121604,0.878396,0.119745,0.069097,0.072844,1.643863
1,0.00347,energy_type_card=='GX' AND type_card=='Pokémon',3140,42447,376,3092,39307,0.073975,0.926025,0.121604,0.878396,0.119745,0.069097,0.072844,1.643863
2,0.002838,energy_type_card=='EX',927,42447,188,3092,41520,0.021839,0.978161,0.060802,0.939198,0.202805,0.069942,0.072844,2.784105
3,0.002838,energy_type_card=='EX' AND type_card=='Pokémon',927,42447,188,3092,41520,0.021839,0.978161,0.060802,0.939198,0.202805,0.069942,0.072844,2.784105
4,0.001647,name_player=='Tord Reklev',262,42447,89,3092,42185,0.006172,0.993828,0.028784,0.971216,0.339695,0.071186,0.072844,4.663331
5,0.001647,name_player=='Tord Reklev' AND type_card=='Pok...,262,42447,89,3092,42185,0.006172,0.993828,0.028784,0.971216,0.339695,0.071186,0.072844,4.663331
6,0.00126,name_player=='Stéphane Ivanoff',158,42447,65,3092,42289,0.003722,0.996278,0.021022,0.978978,0.411392,0.071579,0.072844,5.647598
7,0.00126,name_player=='Stéphane Ivanoff' AND type_card=...,158,42447,65,3092,42289,0.003722,0.996278,0.021022,0.978978,0.411392,0.071579,0.072844,5.647598
8,0.000782,name_player=='Regan Retzloff',107,42447,41,3092,42340,0.002521,0.997479,0.01326,0.98674,0.383178,0.07206,0.072844,5.260265
9,0.000782,name_player=='Regan Retzloff' AND type_card=='...,107,42447,41,3092,42340,0.002521,0.997479,0.01326,0.98674,0.383178,0.07206,0.072844,5.260265


### Teste pra encontrar subgrupos de jogadores que melhoraram a pontuação no mesmo ano


In [83]:
df = df_original.copy()

years = df['year_tournament'].unique()

years

array([2023, 2022, 2018, 2016, 2015, 2014, 2013, 2020, 2019, 2017, 2012,
       2011, 2021])

In [72]:
for y in years:

    print(y, ': ', df[df['year_tournament'] == y].shape[0])

2023 :  70158
2022 :  14660
2018 :  6718
2016 :  1941
2015 :  1606
2014 :  960
2013 :  498
2020 :  2766
2019 :  11446
2017 :  2641
2012 :  115
2011 :  175
2021 :  607


In [99]:
# Create a dictionary of scores for 2022
scores_2022 = df[df['year_tournament'] == 2022].set_index('name_player')['all_time_score'].to_dict()

# Define a function to determine if a player's score improved
def has_improved(row):
    if row['year_tournament'] == 2023:
        previous_score = scores_2022.get(row['name_player'])
        if previous_score is not None:
            return row['all_time_score'] < previous_score
    return False

# Apply the function to create the new column
df['improved_2022_2023'] = df.apply(has_improved, axis=1)

df[df['improved_2022_2023'] == True][['name_player', 'year_tournament', 'all_time_score']]

Unnamed: 0,name_player,year_tournament,all_time_score
68172,Owyn Kamerman,2022,228
68173,Owyn Kamerman,2022,228
68174,Owyn Kamerman,2022,228
68175,Owyn Kamerman,2022,228
68176,Owyn Kamerman,2022,228
68177,Owyn Kamerman,2022,228
68178,Owyn Kamerman,2022,228
68179,Owyn Kamerman,2022,228
68180,Owyn Kamerman,2022,228
68181,Owyn Kamerman,2022,228


In [67]:
# see the unique names of players that improved their score
df[df['improved_2022_2023'] == True]['name_player'].unique()

array(['Owyn Kamerman', 'Fabien Pujol', 'Alberto Conti'], dtype=object)

In [90]:

# Define the target for subgroup discovery
target = ps.BinaryTarget('improved_2022_2023', True)

search_space = ps.create_selectors(df, ignore=['improved_2022_2023'])

# create one selector for card_name, combo_type_name, category_tournament, region_tournament and all_time_score
card_name_selector = [ps.EqualitySelector('name_card', value) for value in df['name_card'].unique()]
combo_type_selector = [ps.EqualitySelector('combo_type_name', value) for value in df['combo_type_name'].unique()]
category_selector = [ps.EqualitySelector('category_tournament', value) for value in df['category_tournament'].unique()]
region_selector = [ps.EqualitySelector('region_tournament', value) for value in df['region_tournament'].unique()]
all_score_selector = [ps.EqualitySelector('all_time_score', value) for value in df['all_time_score'].unique()]
rotation_selector = [ps.EqualitySelector('valid_rotation_at_tournament', value) for value in df['valid_rotation_at_tournament'].unique()]


# Combine all selectors to form the search space
search_space = card_name_selector + combo_type_selector + category_selector + region_selector + all_score_selector + rotation_selector

# Create the Subgroup Discovery Task
task = ps.SubgroupDiscoveryTask(
    df, 
    target, 
    search_space, 
    result_set_size=7, 
    depth=4, 
    qf=ps.WRAccQF()  # Quality function to evaluate subgroups
)

# Run the SD-Map algorithm
result = ps.SimpleDFS().execute(task)

df_result = result.to_dataframe()

df_result

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.000861,all_time_score==310,305,114291,99,218,113986,0.002669,0.997331,0.454128,0.545872,0.32459,0.001044,0.001907,170.173094
1,0.000853,all_time_score==228,269,114291,98,218,114022,0.002354,0.997646,0.449541,0.550459,0.364312,0.001052,0.001907,190.998227
2,0.000715,category_tournament=='regional' AND region_tou...,6469,114291,94,218,107822,0.056601,0.943399,0.431193,0.568807,0.014531,0.00115,0.001907,7.618092
3,0.000533,category_tournament=='regional' AND region_tou...,5310,114291,71,218,108981,0.04646,0.95354,0.325688,0.674312,0.013371,0.001349,0.001907,7.010022
4,0.000505,valid_rotation_at_tournament=='BST-PAL',8564,114291,74,218,105727,0.074932,0.925068,0.33945,0.66055,0.008641,0.001362,0.001907,4.530129
5,0.000504,region_tournament=='EU' AND valid_rotation_at_...,7012,114291,71,218,107279,0.061352,0.938648,0.325688,0.674312,0.010125,0.00137,0.001907,5.308502
6,0.000437,all_time_score==310 AND category_tournament=='...,50,114291,50,218,114241,0.000437,0.999563,0.229358,0.770642,1.0,0.001471,0.001907,524.270642


In [88]:
# Display the results
for row in result.to_dataframe().itertuples():
    print(row.subgroup)

category_tournament=='regional' AND region_tournament=='EU'
region_tournament=='EU'
category_tournament=='worlds' AND region_tournament=='JP'
combo_type_name=='Mew Genesect'
category_tournament=='worlds'
region_tournament=='JP'
combo_type_name=='Palkia Gardevoir' AND region_tournament=='JP'


## Resultado ta muito ruim e não retornou nada de útil