# 03 - Recommender System

In [1]:
# imports
import pandas as pd
import numpy as np
import time

from scipy import sparse # cut down on memory size
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

pd.options.display.max_columns = 35

___

In [2]:
# read in the data
df = pd.read_csv('../Data/cards_cleaned.csv')
df.head()

Unnamed: 0,name,layout,colors,color_identity,mana_cost,cmc,type_line,card_type,super_type,sub_type,oracle_text,oracle_text_token,legalities,rarity,power,toughness,loyalty,card_faces,activated_ability,triggered_ability,oracle_text_back,oracle_text_back_token,colors_back,power_back,toughness_back,loyalty_back,card_type_back,super_type_back,sub_type_back,mana_cost_back
0,Static Orb,normal,[],[],{3},3.0,Artifact,Artifact,NONE,NONE,"As long as Static Orb is untapped, players can...",as long as static orb is untapped players can'...,legacy vintage commander duel,rare,NONE,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
1,Sensory Deprivation,normal,['U'],['U'],{U},1.0,Enchantment — Aura,Enchantment,NONE,Aura,Enchant creature Enchanted creature gets -3/-0.,enchant creature enchanted creature gets -3/-0,pioneer modern legacy pauper vintage penny com...,common,NONE,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
2,Road of Return,normal,['G'],['G'],{G}{G},2.0,Sorcery,Sorcery,NONE,NONE,Choose one — • Return target permanent card fr...,choose one return target permanent card from y...,legacy vintage commander duel,rare,NONE,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
3,Storm Crow,normal,['U'],['U'],{1}{U},2.0,Creature — Bird,Creature,NONE,Bird,Flying (This creature can't be blocked except ...,flying this creature can't be blocked except b...,modern legacy pauper vintage penny commander duel,common,1,2,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
4,Walking Sponge,normal,['U'],['U'],{1}{U},2.0,Creature — Sponge,Creature,NONE,Sponge,{T}: Target creature loses your choice of flyi...,{t} target creature loses your choice of flyin...,legacy vintage commander duel,uncommon,1,1,NONE,NONE,1.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE


In [3]:
df.shape

(18108, 30)

___

## Oracle text recommender system
To start I want to build a recommender system that just looks at oracle text. In order to do that I need to combine both oracle_text and oracle_text_back into one single column

In [4]:
df['oracle_combined'] = df['oracle_text_token'] + " " + df['oracle_text_back_token']

In [5]:
# start by isolating the name of the card and it's oracle_text
oracle = df['oracle_combined']

# vectorize all our words
cvec = CountVectorizer(stop_words=['none'],
                      min_df=2,
                      max_df=.99,
                      ngram_range=(1,6),
                      token_pattern="[a-zA-Z{}+'0-9-/−]+") # we should use the same RegEx to keep certain characters together 

oracle_vec = cvec.fit_transform(oracle)

# convert to a dataframe so we can use this later on as well
converted_df = pd.DataFrame(oracle_vec.toarray(), columns=cvec.get_feature_names(), index=df['name'])

In [6]:
converted_df.shape

(18108, 130070)

This is a whole lot of features to examine so I'm going to use Variance Threshold to drop some n_gram columns that only exist in a small number of cards

In [7]:
selector = VarianceThreshold(.005)
new_array = selector.fit_transform(converted_df)

new_array.shape

(18108, 2652)

In [8]:
# get the list of columns kept
vt_list = converted_df.columns[selector.get_support()]

# update our converted_df with just the important columns
converted_df = converted_df[vt_list]

For the recommender system to work efficiently we need to convert the data back into a sparse matrix with the new names as the index

In [9]:
sparse_df = sparse.csr_matrix(converted_df)

In [10]:
# build the recommender system using cosine similarity
rec = pairwise_distances(sparse_df, metric='cosine')

In [11]:
rec.shape

(18108, 18108)

In [12]:
rec_df = pd.DataFrame(rec, index=converted_df.index, columns=converted_df.index)
rec_df.head()

name,Static Orb,Sensory Deprivation,Road of Return,Storm Crow,Walking Sponge,Ravnica at War,Torrent of Fire,Pteramander,Nantuko Elder,Vedalken Heretic,Waterknot,Ruthless Knave,"Hua Tuo, Honored Physician",Veil of Summer,Disposal Mummy,Marang River Prowler,Aura Graft,...,Trinket Mage,Skyshroud Blessing,"Omnath, Locus of the Roil",Harvest Hand // Scrounged Scythe,Polis Crusher,Test of Endurance,Venom Sliver,Borderland Ranger,Curse of Thirst,Temporary Truce,Clearwater Goblet,Quarry Beetle,Devoted Hero,Without Weakness,Firesong and Sunspeaker,"Samut, the Tested",Sinew Sliver
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
Static Orb,0.0,1.0,1.0,0.968901,1.0,0.8934,0.963963,1.0,1.0,1.0,0.849244,0.97151,0.973556,0.950055,1.0,0.707146,1.0,...,1.0,0.967102,0.983884,0.738884,0.976013,0.968218,0.968901,1.0,1.0,0.907681,1.0,1.0,1.0,0.97801,1.0,0.922472,1.0
Sensory Deprivation,1.0,0.0,1.0,0.929245,0.91556,1.0,1.0,0.946757,1.0,1.0,0.519804,0.93518,0.939834,1.0,1.0,1.0,0.95778,...,1.0,1.0,1.0,0.851478,0.945425,1.0,0.85849,1.0,0.939366,1.0,1.0,1.0,1.0,0.949969,0.941176,0.911805,1.0
Road of Return,1.0,1.0,0.0,1.0,0.903882,1.0,0.869336,0.878786,1.0,0.933741,0.968765,0.970486,0.671264,0.862028,0.797322,0.802804,0.903882,...,0.697391,0.94888,0.808,0.87602,0.913028,0.802454,0.951676,0.716994,0.903372,0.968121,0.873602,0.611822,1.0,0.886099,0.933041,0.789176,1.0
Storm Crow,0.968901,0.929245,1.0,0.0,0.898433,1.0,1.0,0.903935,1.0,1.0,0.938115,0.941524,0.963815,0.874708,1.0,0.839711,1.0,...,0.972434,0.90997,0.988974,0.717142,0.950767,0.978256,0.914894,1.0,1.0,0.978946,0.974315,1.0,1.0,0.954866,0.946934,0.960219,0.944868
Walking Sponge,1.0,0.91556,0.903882,0.898433,0.0,1.0,0.911727,0.942677,0.899496,1.0,0.926145,0.906952,0.784083,0.74173,0.960064,0.952177,0.909091,...,0.934205,0.623949,0.894728,0.928933,0.941244,0.8962,0.898433,0.950432,0.869442,1.0,0.892713,0.954674,1.0,0.730679,0.89445,0.69932,1.0


In [13]:
# now to test the recommender system
rec_df['Shock'].sort_values()[0:11]

name
Tarfire              0.000000
Unyaro Bee Sting     0.000000
Shock                0.000000
Magma Jet            0.039841
Moonglove Extract    0.064586
Shock Troops         0.064586
Perilous Myr         0.083485
Seal of Fire         0.083485
Ember Hauler         0.083485
Crackling Triton     0.083485
Scaldkin             0.101283
Name: Shock, dtype: float64

In [14]:
rec_df['Static Orb'].sort_values()[0:11]

name
Static Orb          0.000000
Winter Orb          0.090909
Imi Statue          0.384543
Damping Field       0.384543
Castle Raptors      0.430197
Stoic Angel         0.430197
Giant Tortoise      0.446088
Kill Switch         0.455051
Storage Matrix      0.476861
Wardscale Dragon    0.497481
Halam Djinn         0.510990
Name: Static Orb, dtype: float64

In [15]:
rec_df['Prized Amalgam'].sort_values()[0:11]

name
Prized Amalgam            0.000000
Etherium Abomination      0.304019
Wake the Dead             0.305999
Hell's Thunder            0.307745
Footsteps of the Goryo    0.309909
Kederekt Leviathan        0.311532
Fire-Field Ogre           0.312159
Shambling Remains         0.312159
Dregscape Zombie          0.314810
Hellspark Elemental       0.316281
Kathari Screecher         0.317430
Name: Prized Amalgam, dtype: float64

In [16]:
rec_df['Wrath of God'].sort_values()[0:11]

name
Wrath of God               0.000000e+00
Damnation                  1.110223e-16
Perish                     7.142857e-02
Winds of Rath              9.251479e-02
Shatterstorm               1.105008e-01
Plague Wind                1.835034e-01
Abu Ja'far                 2.022760e-01
Obliterate                 2.059333e-01
Retribution of the Meek    2.198105e-01
Child of Alara             2.440711e-01
Catastrophe                2.592563e-01
Name: Wrath of God, dtype: float64

In [17]:
rec_df['Grizzly Bears'].sort_values()[0:11] # vanilla creature

name
Grizzly Bears                   0.0
Static Orb                      1.0
Doomgape                        1.0
Ghitu Chronicler                1.0
Aerial Volley                   1.0
Explosive Apparatus             1.0
Spontaneous Mutation            1.0
Rakdos Locket                   1.0
Voltaic Brawler                 1.0
Jousting Lance                  1.0
Tatsumasa, the Dragon's Fang    1.0
Name: Grizzly Bears, dtype: float64

This is a great start! Now I want to add the numerical features and see the results
___

## Adding numerical data to our features

In [18]:
df.dtypes

name                       object
layout                     object
colors                     object
color_identity             object
mana_cost                  object
cmc                       float64
type_line                  object
card_type                  object
super_type                 object
sub_type                   object
oracle_text                object
oracle_text_token          object
legalities                 object
rarity                     object
power                      object
toughness                  object
loyalty                    object
card_faces                 object
activated_ability         float64
triggered_ability         float64
oracle_text_back           object
oracle_text_back_token     object
colors_back                object
power_back                 object
toughness_back             object
loyalty_back               object
card_type_back             object
super_type_back            object
sub_type_back              object
mana_cost_back

In [19]:
# we already have a df for the oracle text so we can just use that one and add to it.
converted_df['cmc'] = df['cmc'].values
converted_df['activated_ability'] = df['activated_ability'].values
converted_df['triggered_ability'] = df['triggered_ability'].values

# convert to a sparse matrix
sparse_df = sparse.csr_matrix(converted_df)

# build the recommender system using cosine similarity
rec = pairwise_distances(sparse_df, metric='cosine')

# turn into a dataframe for interpretability
rec_df = pd.DataFrame(rec, index=converted_df.index, columns=converted_df.index)
rec_df.head()

name,Static Orb,Sensory Deprivation,Road of Return,Storm Crow,Walking Sponge,Ravnica at War,Torrent of Fire,Pteramander,Nantuko Elder,Vedalken Heretic,Waterknot,Ruthless Knave,"Hua Tuo, Honored Physician",Veil of Summer,Disposal Mummy,Marang River Prowler,Aura Graft,...,Trinket Mage,Skyshroud Blessing,"Omnath, Locus of the Roil",Harvest Hand // Scrounged Scythe,Polis Crusher,Test of Endurance,Venom Sliver,Borderland Ranger,Curse of Thirst,Temporary Truce,Clearwater Goblet,Quarry Beetle,Devoted Hero,Without Weakness,Firesong and Sunspeaker,"Samut, the Tested",Sinew Sliver
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
Static Orb,0.0,0.873,0.883796,0.823952,0.825185,0.477907,0.629009,0.941557,0.551678,0.803252,0.675383,0.778921,0.79261,0.916106,0.699833,0.566612,0.822838,...,0.853654,0.814631,0.831495,0.633659,0.76293,0.703471,0.823952,0.85305,0.716019,0.775839,0.783606,0.707786,0.461184,0.873643,0.684502,0.755376,0.675082
Sensory Deprivation,0.873,0.0,0.949167,0.86798,0.847056,0.789181,0.847855,0.923304,0.803884,0.913934,0.482706,0.854935,0.863917,0.981651,0.868694,0.910197,0.883752,...,0.935982,0.930495,0.931959,0.793959,0.856408,0.880263,0.80197,0.935718,0.826084,0.934628,0.90534,0.872173,0.764298,0.905244,0.815983,0.839486,0.857866
Road of Return,0.883796,0.949167,0.0,0.939602,0.842565,0.807103,0.76334,0.859647,0.820555,0.862188,0.888631,0.893814,0.626456,0.848894,0.719663,0.739799,0.840452,...,0.658305,0.888706,0.758753,0.821948,0.835769,0.726104,0.894303,0.676502,0.806768,0.910278,0.800789,0.567244,0.784334,0.848274,0.821102,0.733799,0.869949
Storm Crow,0.823952,0.86798,0.939602,0.0,0.818276,0.74951,0.819225,0.878495,0.766979,0.897738,0.837302,0.844874,0.870648,0.858285,0.843985,0.75103,0.907918,...,0.89858,0.834832,0.909049,0.659982,0.843605,0.839948,0.843137,0.923621,0.852398,0.902908,0.865032,0.848118,0.719944,0.901487,0.79502,0.86888,0.7889
Walking Sponge,0.825185,0.847056,0.842565,0.818276,0.0,0.709809,0.727745,0.894428,0.640063,0.88153,0.811516,0.780351,0.681561,0.734793,0.789134,0.835183,0.813317,...,0.853132,0.569472,0.812683,0.842437,0.802347,0.752774,0.818276,0.867273,0.726406,0.910016,0.778491,0.788855,0.675557,0.673923,0.73087,0.613342,0.804354


In [20]:
# now to test the recommender system
rec_df['Shock'].sort_values()[0:11]

name
Shock                0.000000e+00
Tarfire              4.440892e-16
Magma Jet            4.983090e-02
Seal of Fire         9.732907e-02
Ember Hauler         1.047262e-01
Perilous Myr         1.047262e-01
Arc Trail            1.221543e-01
Moonglove Extract    1.224731e-01
Unyaro Bee Sting     1.237505e-01
Orcish Vandal        1.331549e-01
Crackling Triton     1.351000e-01
Name: Shock, dtype: float64

In [21]:
rec_df['Static Orb'].sort_values()[0:11]

name
Static Orb          0.000000
Winter Orb          0.084189
Damping Field       0.255331
Imi Statue          0.255331
Stoic Angel         0.278592
Castle Raptors      0.281003
Wardscale Dragon    0.315647
Halam Djinn         0.321904
Zanam Djinn         0.321904
Sulam Djinn         0.321904
Giant Tortoise      0.326480
Name: Static Orb, dtype: float64

In [22]:
rec_df['Prized Amalgam'].sort_values()[0:11]

name
Prized Amalgam            0.000000
Etherium Abomination      0.286393
Footsteps of the Goryo    0.287127
Wake the Dead             0.289008
Fire-Field Ogre           0.291475
Hell's Thunder            0.292998
Shambling Remains         0.293961
Kathari Screecher         0.298950
Dregscape Zombie          0.303927
Kederekt Leviathan        0.306657
Brackwater Elemental      0.309082
Name: Prized Amalgam, dtype: float64

In [23]:
rec_df['Wrath of God'].sort_values()[0:11]

name
Wrath of God               0.000000e+00
Damnation                  2.220446e-16
Winds of Rath              4.215851e-02
Perish                     4.826627e-02
Shatterstorm               5.071109e-02
Obliterate                 9.126107e-02
Plague Wind                9.612309e-02
Catastrophe                1.129929e-01
Child of Alara             1.192290e-01
Retribution of the Meek    1.608536e-01
Day of Judgment            1.633400e-01
Name: Wrath of God, dtype: float64

In [24]:
rec_df['Jace, the Mind Sculptor'].sort_values()[0:11]

name
Jace, the Mind Sculptor    0.000000
Voyage's End               0.331764
Coral Fighters             0.338409
Anchor to the Aether       0.351623
Select for Inspection      0.355678
Cavalier of Gales          0.366878
Brainstorm                 0.370717
Petals of Insight          0.372045
Riverwise Augur            0.377188
Scroll Rack                0.388868
Spin into Myth             0.391776
Name: Jace, the Mind Sculptor, dtype: float64

In [25]:
rec_df['Grizzly Bears'].sort_values()[0:11] # vanilla creature

name
Silverflame Squire // On Alert    0.0
Panther Warriors                  0.0
River Kaijin                      0.0
Dutiful Servants                  0.0
Loxodon Line Breaker              0.0
Craw Wurm                         0.0
Rhox Brute                        0.0
Bogstomper                        0.0
Earth Elemental                   0.0
Orazca Frillback                  0.0
Vorstclaw                         0.0
Name: Grizzly Bears, dtype: float64

___

## Non-numerical data
Now I have to convert the non-numerical data into numerical data

### Power / Toughness / Loyalty

Through some outside reasearch on ScryFall and how other sources interpret * s in power and toughness, I'm going to impute the * s as 0 and * +1s as 1 for both power and toughness. Same is true for loyalty

In [26]:
# power
zero_power_index = df.loc[(df['power'] == '*')].index
zero_power_back_index = df.loc[(df['power_back'] == '*')].index
df.loc[zero_power_index, 'power'] = 0
df.loc[zero_power_back_index, 'power_back'] = 0

one_power_index = df.loc[(df['power'] == '1+*')].index
df.loc[one_power_index, 'power'] = 1

# toughness
zero_toughness_index = df.loc[(df['toughness'] == '*')].index
zero_toughness_back_index = df.loc[(df['toughness_back'] == '*')].index
df.loc[zero_toughness_index, 'toughness'] = 0
df.loc[zero_toughness_back_index, 'toughness_back'] = 0

one_toughness_index = df.loc[(df['toughness'] == '1+*')].index
df.loc[one_toughness_index, 'toughness'] = 1

# loyalty
zero_loyalty_index = df.loc[(df['loyalty'] == 'X')].index
zero_loyalty_back_index = df.loc[(df['loyalty_back'] == 'X')].index
df.loc[zero_loyalty_index, 'loyalty'] = 0
df.loc[zero_loyalty_back_index, 'loyalty_back'] = 0

# # convert those columns to numerical data
# df['power_back'] = df['power_back'].astype(float)
# df['power'] = df['power'].astype(float)
# df['toughness'] = df['toughness'].astype(float)
# df['toughness_back'] = df['toughness_back'].astype(float)
# df['loyalty'] = df['loyalty'].astype(float)
# df['loyalty_back'] = df['loyalty_back'].astype(float)

In [27]:
df.dtypes

name                       object
layout                     object
colors                     object
color_identity             object
mana_cost                  object
cmc                       float64
type_line                  object
card_type                  object
super_type                 object
sub_type                   object
oracle_text                object
oracle_text_token          object
legalities                 object
rarity                     object
power                      object
toughness                  object
loyalty                    object
card_faces                 object
activated_ability         float64
triggered_ability         float64
oracle_text_back           object
oracle_text_back_token     object
colors_back                object
power_back                 object
toughness_back             object
loyalty_back               object
card_type_back             object
super_type_back            object
sub_type_back              object
mana_cost_back

In [28]:
# add the newly made numerical columns to the converted df
converted_df['power'] = df['power'].values
converted_df['power_back'] = df['power_back'].values
converted_df['toughness'] = df['toughness'].values
converted_df['toughness_back'] = df['toughness_back'].values
converted_df['loyalty'] = df['loyalty'].values
converted_df['loyalty_back'] = df['loyalty_back'].values

In [29]:
converted_df.head()

Unnamed: 0_level_0,+1,+1/+0,+1/+0 until,+1/+0 until end,+1/+0 until end of,+1/+0 until end of turn,+1/+1,+1/+1 and,+1/+1 and has,+1/+1 counter,+1/+1 counter from,+1/+1 counter on,+1/+1 counter on each,+1/+1 counter on it,+1/+1 counter on target,+1/+1 counter on target creature,+1/+1 counters,...,{t} add {g},{t} discard,{t} put,{t} sacrifice,{t} tap,{t} target,{t} target creature,{u},{w},{x},cmc,activated_ability,triggered_ability,power_back,toughness_back,loyalty,loyalty_back
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
Static Orb,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,3.0,0.0,0.0,NONE,NONE,NONE,NONE
Sensory Deprivation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1.0,0.0,0.0,NONE,NONE,NONE,NONE
Road of Return,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,2.0,0.0,0.0,NONE,NONE,NONE,NONE
Storm Crow,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,2.0,0.0,0.0,NONE,NONE,NONE,NONE
Walking Sponge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0,2.0,1.0,0.0,NONE,NONE,NONE,NONE


___

In [30]:
# set the index of df to be the same as the converted_df; the name of the card
df = df.set_index(df['name']).drop(columns='name')

In [31]:
# essentially makeing dummy variables for non-numerical data

# initialize some lists to check
wburg = ['B', 'G', 'R', 'U', 'W']

card_types = ['Creature', 'Instant', 'Enchantment', 'Sorcery', 'Artifact', 'Land', 'Planeswalker', 'Tribal']

subtypes = list(set(" ".join(df['sub_type'].value_counts().index).split()))
subtypes.remove('NONE')

super_types = list(set(" ".join(df['super_type'].value_counts().index).split()))
super_types.remove('NONE')

formats = list(set(" ".join(df['legalities'].value_counts().index).split()))
formats.remove('NONE')

rarities = ['common', 'uncommon', 'rare', 'mythic']


# timer and progress checker
t0 = time.time()
counter = 0

for counter, index in enumerate(converted_df.index):
    # colors
    for color in wburg:
        converted_df.loc[index, 'colors_' + color] = (color in df.loc[index, 'colors']) * 1
        converted_df.loc[index, 'color_identity_' + color] = (color in df.loc[index, 'color_identity']) * 1
    
    # card_types
    for ctype in card_types:
        converted_df.loc[index, 'card_type_' + ctype] = (ctype in df.loc[index, 'card_type']) * 1
        
    # sub_types
    for stype in subtypes:
        converted_df.loc[index, 'sub_type_' + stype] = (stype in df.loc[index, 'sub_type']) * 1
        
    # super_type
    for st in super_types:
        converted_df.loc[index, 'super_type_' + st] = (st in df.loc[index, 'super_type']) * 1
    
    # legalities
    for form in formats:
        converted_df.loc[index, 'legalities_' + form] = (form in df.loc[index, 'legalities']) * 1
    
    # rarity
    for r in rarities:
        converted_df.loc[index, 'rarity_' + r] = (r in df.loc[index, 'rarity']) * 1
    
    # progress checker
    if counter % 1000 == 0:
        print(f'Converted {counter} cards out of a total of {len(converted_df.index)}')
        print(f'mins: {(time.time() - t0)/60}')
        print('-------------------------------')

Converted 0 cards out of a total of 18108
mins: 0.2760611653327942
-------------------------------
Converted 1000 cards out of a total of 18108
mins: 4.330556384722391
-------------------------------
Converted 2000 cards out of a total of 18108
mins: 8.324195615450542
-------------------------------
Converted 3000 cards out of a total of 18108
mins: 12.301245331764221
-------------------------------
Converted 4000 cards out of a total of 18108
mins: 16.28107301791509
-------------------------------
Converted 5000 cards out of a total of 18108
mins: 20.297400669256845
-------------------------------
Converted 6000 cards out of a total of 18108
mins: 27.443024917443594
-------------------------------
Converted 7000 cards out of a total of 18108
mins: 31.51181941429774
-------------------------------
Converted 8000 cards out of a total of 18108
mins: 35.5830883026123
-------------------------------
Converted 9000 cards out of a total of 18108
mins: 39.655466198921204
---------------------

In [32]:
# convert 'NONE's to np.NaN's so we can scale our data then impute all the NaNs
converted_df = converted_df.replace('NONE', np.NaN)

# convert those columns to numerical data
converted_df['power_back'] = converted_df['power_back'].astype(float)
converted_df['power'] = converted_df['power'].astype(float)
converted_df['toughness'] = converted_df['toughness'].astype(float)
converted_df['toughness_back'] = converted_df['toughness_back'].astype(float)
converted_df['loyalty'] = converted_df['loyalty'].astype(float)
converted_df['loyalty_back'] = converted_df['loyalty_back'].astype(float)

Now that we have everything coverted to numerical data, we should scale the non-binarized features so everything will be weighted equally. This means we should just scale the columns that have a range of values, e.g. power, toughness, loyalty, and cmc.

In [33]:
range_col_list = ['power', 'power_back', 'toughness', 'toughness_back', 'loyalty', 'loyalty_back', 'cmc']
converted_df[range_col_list]

Unnamed: 0_level_0,power,power_back,toughness,toughness_back,loyalty,loyalty_back,cmc
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Static Orb,,,,,,,3.0
Sensory Deprivation,,,,,,,1.0
Road of Return,,,,,,,2.0
Storm Crow,1.0,,2.0,,,,2.0
Walking Sponge,1.0,,1.0,,,,2.0
...,...,...,...,...,...,...,...
Devoted Hero,1.0,,2.0,,,,1.0
Without Weakness,,,,,,,2.0
Firesong and Sunspeaker,4.0,,6.0,,,,6.0
"Samut, the Tested",,,,,4.0,,4.0


In [34]:
ss = StandardScaler()
range_col_sc = ss.fit_transform(converted_df[range_col_list])

In [35]:
sc_df = pd.DataFrame(range_col_sc, index = df.index.values, columns=range_col_list)

In [36]:
sc_df.head()

Unnamed: 0,power,power_back,toughness,toughness_back,loyalty,loyalty_back,cmc
Static Orb,,,,,,,-0.171137
Sensory Deprivation,,,,,,,-1.299672
Road of Return,,,,,,,-0.735404
Storm Crow,-0.901288,,-0.414735,,,,-0.735404
Walking Sponge,-0.901288,,-0.999139,,,,-0.735404


In [37]:
sc_df = sc_df.fillna(0)

In [38]:
sc_df.head()

Unnamed: 0,power,power_back,toughness,toughness_back,loyalty,loyalty_back,cmc
Static Orb,0.0,0.0,0.0,0.0,0.0,0.0,-0.171137
Sensory Deprivation,0.0,0.0,0.0,0.0,0.0,0.0,-1.299672
Road of Return,0.0,0.0,0.0,0.0,0.0,0.0,-0.735404
Storm Crow,-0.901288,0.0,-0.414735,0.0,0.0,0.0,-0.735404
Walking Sponge,-0.901288,0.0,-0.999139,0.0,0.0,0.0,-0.735404


In [39]:
converted_df = pd.concat([converted_df.fillna(0), sc_df], axis=1)

In [40]:
# convert to a sparse matrix
sparse_df = sparse.csr_matrix(converted_df)

In [44]:
converted_df.isnull().sum().sum()

88773

In [41]:
t0 = time.time()
# build the recommender system using cosine similarity
rec = pairwise_distances(sparse_df, metric='cosine')
print((time.time() - t0)/60)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# turn into a dataframe for interpretability
rec_df = pd.DataFrame(rec, index=converted_df.index, columns=converted_df.index)
rec_df.head()

In [None]:
# now to test the recommender system
rec_df['Shock'].sort_values()[1:11]

In [None]:
rec_df['Lightning Bolt'].sort_values()[1:11]

In [None]:
rec_df['Static Orb'].sort_values()[1:11]

In [None]:
rec_df['Prized Amalgam'].sort_values()[1:11]

In [None]:
rec_df['Wrath of God'].sort_values()[1:11]

In [None]:
rec_df['Jace, the Mind Sculptor'].sort_values()[1:11]

In [None]:
rec_df['Delver of Secrets // Insectile Aberration'].sort_values()[1:11]

In [None]:
rec_df['Grizzly Bears'].sort_values()[1:11] # vanilla creature