# 03 - Recommender System

In [1]:
# imports
import pandas as pd
import numpy as np

from scipy import sparse # cut down on memory size
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer

# show up to 35 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 35)

___

In [2]:
# load the cleaned card table and preview the first rows
cards_csv = '../Data/cards_cleaned.csv'
df = pd.read_csv(cards_csv)
df.head()

Unnamed: 0,name,layout,colors,color_identity,mana_cost,cmc,type_line,card_type,super_type,sub_type,oracle_text,oracle_text_token,legalities,rarity,power,toughness,loyalty,card_faces,activated_ability,triggered_ability,oracle_text_back,oracle_text_back_token,colors_back,power_back,toughness_back,loyalty_back,card_type_back,super_type_back,sub_type_back,mana_cost_back
0,Static Orb,normal,[],[],{3},3.0,Artifact,Artifact,NONE,NONE,"As long as Static Orb is untapped, players can...",as long as static orb is untapped players can'...,legacy vintage commander duel,rare,NONE,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
1,Sensory Deprivation,normal,['U'],['U'],{U},1.0,Enchantment — Aura,Enchantment,NONE,Aura,Enchant creature Enchanted creature gets -3/-0.,enchant creature enchanted creature gets -3/-0,pioneer modern legacy pauper vintage penny com...,common,NONE,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
2,Road of Return,normal,['G'],['G'],{G}{G},2.0,Sorcery,Sorcery,NONE,NONE,Choose one — • Return target permanent card fr...,choose one return target permanent card from y...,legacy vintage commander duel,rare,NONE,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
3,Storm Crow,normal,['U'],['U'],{1}{U},2.0,Creature — Bird,Creature,NONE,Bird,Flying (This creature can't be blocked except ...,flying this creature can't be blocked except b...,modern legacy pauper vintage penny commander duel,common,1,2,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
4,Walking Sponge,normal,['U'],['U'],{1}{U},2.0,Creature — Sponge,Creature,NONE,Sponge,{T}: Target creature loses your choice of flyi...,{t} target creature loses your choice of flyin...,legacy vintage commander duel,uncommon,1,1,NONE,NONE,1.0,0.0,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE


In [3]:
# sanity check: (rows, columns) of the table that was just loaded
df.shape

(18108, 30)

___

## Oracle text recommender system
To start, I want to build a recommender system that looks only at oracle text. To do that, I need to combine the tokenized front- and back-face text (`oracle_text_token` and `oracle_text_back_token`) into a single column.

In [4]:
# Join the front- and back-face token strings into one document per row.
# Missing values are replaced with '' first: `str + NaN` evaluates to NaN, and
# CountVectorizer refuses to vectorize a NaN document.
df['oracle_combined'] = (
    df['oracle_text_token'].fillna('')
    + " "
    + df['oracle_text_back_token'].fillna('')
)

In [5]:
# the documents to vectorize: one combined oracle-text string per card
oracle = df['oracle_combined']

# vectorize all our words
# 'none' is dropped as a stop word; the data uses NONE as its "no value" placeholder
# (presumably lowercased in the token columns / by the vectorizer -- verify)
cvec = CountVectorizer(stop_words=['none'],
                      min_df=2,
                      max_df=.98,
                      ngram_range=(2,5),
                      token_pattern="[a-zA-Z{}+'0-9-/−]+") # we should use the same RegEx to keep certain characters together 

oracle_vec = cvec.fit_transform(oracle)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of `get_feature_names_out`;
# prefer the new accessor and fall back to the old one on older installs
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# convert to a dataframe (rows labelled by card name) so we can use this later on as well
converted_df = pd.DataFrame(oracle_vec.toarray(), columns=feature_names, index=df['name'])

In [6]:
# preview the n-gram count table: one row per card name, one column per n-gram
converted_df.head()

Unnamed: 0_level_0,'{t} this,+0/+1 and,+0/+1 and has,+0/+1 counter,+0/+1 counter on,+0/+1 counters,+0/+1 counters on,+0/+1 for,+0/+1 for each,+0/+1 until,+0/+1 until end,+0/+1 until end of,+0/+1 until end of turn,+0/+1 whenever,+0/+1 whenever a,+0/+2 and,+0/+2 and assigns,...,−8 target opponent gets,−8 target opponent gets an,−8 you,−8 you get,−8 you get an,−8 you get an emblem,−9 gain,−9 gain control,−9 gain control of,−9 put,−9 you,−9 you get,−9 you get an,−9 you get an emblem,−x chandra,−x return,−x return target
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
Static Orb,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Sensory Deprivation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Road of Return,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Storm Crow,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Walking Sponge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


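For the recommender system to work efficiently, we need to convert the data back into a sparse matrix. The sparse matrix carries no labels, so the card names stay on `converted_df.index` and are re-attached once the distances are computed.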

In [7]:
# CSR copy of the feature table's values (the card-name labels are not carried over)
sparse_df = sparse.csr_matrix(converted_df.values)

In [8]:
# cosine distance between every pair of rows; smaller means more similar
rec = pairwise_distances(X=sparse_df, metric='cosine')

In [9]:
# the distance matrix should be square: one row and one column per card
rec.shape

(18108, 18108)

In [10]:
# label both axes with the card names so a card's distances can be looked up by name
card_names = converted_df.index
rec_df = pd.DataFrame(data=rec, index=card_names, columns=card_names)
rec_df.head()

name,Static Orb,Sensory Deprivation,Road of Return,Storm Crow,Walking Sponge,Ravnica at War,Torrent of Fire,Pteramander,Nantuko Elder,Vedalken Heretic,Waterknot,Ruthless Knave,"Hua Tuo, Honored Physician",Veil of Summer,Disposal Mummy,Marang River Prowler,Aura Graft,...,Trinket Mage,Skyshroud Blessing,"Omnath, Locus of the Roil",Harvest Hand // Scrounged Scythe,Polis Crusher,Test of Endurance,Venom Sliver,Borderland Ranger,Curse of Thirst,Temporary Truce,Clearwater Goblet,Quarry Beetle,Devoted Hero,Without Weakness,Firesong and Sunspeaker,"Samut, the Tested",Sinew Sliver
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
Static Orb,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.947247,1.0,...,1.0,1.0,1.0,0.963114,1.0,1.0,1.0,1.0,1.0,0.984595,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Sensory Deprivation,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.819813,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.976998,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Road of Return,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.991151,1.0,1.0,1.0,1.0,0.935876,1.0,0.982848,0.963614,1.0,...,0.957119,1.0,0.992186,1.0,1.0,0.985146,1.0,0.951921,1.0,1.0,1.0,0.856439,1.0,1.0,1.0,1.0,1.0
Storm Crow,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.986143,1.0,1.0,1.0,1.0,1.0,0.977249,1.0,0.94302,1.0,...,1.0,0.979562,1.0,0.867197,0.984492,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Walking Sponge,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.983648,0.933317,1.0,1.0,1.0,...,1.0,0.880192,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.914405,0.980698,0.913836,1.0


In [11]:
# now to test the recommender system
# Remove the query card by label rather than assuming it sorts to position 0: a card with
# identical features also sits at distance 0 and could otherwise take its place.
rec_df['Shock'].drop('Shock').nsmallest(10)

name
Tarfire              0.118083
Unyaro Bee Sting     0.147987
Moonglove Extract    0.222222
Goblin Test Pilot    0.242967
Ember Hauler         0.242967
Deadapult            0.242967
Crackling Triton     0.262135
Tar Pitcher          0.262135
Orcish Vandal        0.262135
Magma Jet            0.262135
Name: Shock, dtype: float64

In [12]:
# 10 closest cards; the query card is dropped by label so a distance-0 tie cannot displace it
rec_df['Static Orb'].drop('Static Orb').nsmallest(10)

name
Winter Orb        0.339828
Stoic Angel       0.533431
Damping Field     0.608003
Imi Statue        0.608003
Dovin Baan        0.636490
Mungha Wurm       0.828571
Storage Matrix    0.845697
Castle Raptors    0.847279
Stabilizer        0.857143
Giant Tortoise    0.870781
Name: Static Orb, dtype: float64

In [13]:
# 10 closest cards; the query card is dropped by label so a distance-0 tie cannot displace it
rec_df['Prized Amalgam'].drop('Prized Amalgam').nsmallest(10)

name
Bone Dragon               0.463599
Footsteps of the Goryo    0.527900
Reassembling Skeleton     0.528050
Despoiler of Souls        0.538991
Scrapheap Scrounger       0.544383
Drownyard Temple          0.547321
Ghoulsteed                0.547321
Advanced Stitchwing       0.564410
Apprentice Necromancer    0.589638
Wake the Dead             0.598464
Name: Prized Amalgam, dtype: float64

In [14]:
# 10 closest cards. Damnation ties with the query card at distance 0 here, so the query card
# is dropped by label instead of being assumed to sort first.
rec_df['Wrath of God'].drop('Wrath of God').nsmallest(10)

name
Damnation          0.000000
Perish             0.351819
Winds of Rath      0.469670
Kirtar's Wrath     0.498035
Planar Collapse    0.500000
Plague Wind        0.537090
March of Souls     0.539821
Decree of Pain     0.585977
Day of Judgment    0.591752
Rout               0.611078
Name: Wrath of God, dtype: float64

This is a great start! Now I want to add the numerical features and see how the results change.
___

## Adding numerical data to our features

In [15]:
# list the column dtypes to see which features are already numeric (float64)
df.dtypes

name                       object
layout                     object
colors                     object
color_identity             object
mana_cost                  object
cmc                       float64
type_line                  object
card_type                  object
super_type                 object
sub_type                   object
oracle_text                object
oracle_text_token          object
legalities                 object
rarity                     object
power                      object
toughness                  object
loyalty                    object
card_faces                 object
activated_ability         float64
triggered_ability         float64
oracle_text_back           object
oracle_text_back_token     object
colors_back                object
power_back                 object
toughness_back             object
loyalty_back               object
card_type_back             object
super_type_back            object
sub_type_back              object
mana_cost_back

In [16]:
# reuse the oracle-text feature table and append the columns that are already numeric
for numeric_col in ('cmc', 'activated_ability', 'triggered_ability'):
    # .values assigns by position, so no alignment on index labels takes place
    converted_df[numeric_col] = df[numeric_col].values

# NOTE(review): these columns are appended unscaled next to the n-gram counts -- confirm
# that their relative weight in the cosine distance is intended.

# sparse matrix -> cosine distances -> name-labelled frame for interpretability
sparse_df = sparse.csr_matrix(converted_df)
rec = pairwise_distances(sparse_df, metric='cosine')
rec_df = pd.DataFrame(rec, index=converted_df.index, columns=converted_df.index)
rec_df.head()

name,Static Orb,Sensory Deprivation,Road of Return,Storm Crow,Walking Sponge,Ravnica at War,Torrent of Fire,Pteramander,Nantuko Elder,Vedalken Heretic,Waterknot,Ruthless Knave,"Hua Tuo, Honored Physician",Veil of Summer,Disposal Mummy,Marang River Prowler,Aura Graft,...,Trinket Mage,Skyshroud Blessing,"Omnath, Locus of the Roil",Harvest Hand // Scrounged Scythe,Polis Crusher,Test of Endurance,Venom Sliver,Borderland Ranger,Curse of Thirst,Temporary Truce,Clearwater Goblet,Quarry Beetle,Devoted Hero,Without Weakness,Firesong and Sunspeaker,"Samut, the Tested",Sinew Sliver
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
Static Orb,0.0,0.89829,0.923837,0.88384,0.887452,0.628609,0.785099,0.964907,0.672239,0.873845,0.853421,0.874734,0.878754,0.971038,0.819784,0.818056,0.856161,...,0.900829,0.899128,0.881229,0.869147,0.854329,0.798255,0.916016,0.889801,0.798979,0.903114,0.844774,0.783809,0.606081,0.937124,0.761249,0.87807,0.789441
Sensory Deprivation,0.89829,0.0,0.950078,0.923861,0.926229,0.756568,0.859141,0.976998,0.785166,0.91731,0.743795,0.917893,0.920528,0.981017,0.881875,0.910557,0.905719,...,0.934997,0.933882,0.92215,0.914231,0.904518,0.867764,0.944952,0.927768,0.868238,0.945567,0.898255,0.858295,0.741801,0.958787,0.843508,0.92008,0.861987
Road of Return,0.923837,0.950078,0.0,0.942985,0.944758,0.81771,0.89452,0.974163,0.839125,0.938079,0.928055,0.938516,0.880978,0.985785,0.896802,0.899534,0.9294,...,0.910761,0.950489,0.934417,0.95183,0.9285,0.8886,0.958778,0.900836,0.901333,0.959239,0.92381,0.777162,0.806653,0.969138,0.882814,0.940153,0.896652
Storm Crow,0.88384,0.923861,0.942985,0.0,0.915747,0.721981,0.839128,0.960594,0.754642,0.905562,0.890272,0.906227,0.909237,0.956639,0.865092,0.846774,0.892324,...,0.925762,0.90561,0.911089,0.80409,0.877321,0.848976,0.937131,0.917506,0.849518,0.937833,0.883799,0.838161,0.705116,0.952931,0.821273,0.908725,0.842378
Walking Sponge,0.887452,0.926229,0.944758,0.915747,0.0,0.730626,0.84413,0.96182,0.72265,0.908498,0.893685,0.894,0.882745,0.915975,0.869287,0.901026,0.895672,...,0.92807,0.81709,0.913854,0.928818,0.881136,0.853672,0.939086,0.920071,0.854197,0.939766,0.887413,0.843194,0.714286,0.863185,0.8124,0.823126,0.847279


In [17]:
# now to test the recommender system
# Remove the query card by label rather than assuming it sorts to position 0: a card with
# identical features also sits at distance 0 and could otherwise take its place.
rec_df['Shock'].drop('Shock').nsmallest(10)

name
Tarfire                0.111477
Magma Jet              0.250731
Ember Hauler           0.250731
Unyaro Bee Sting       0.258323
Moonglove Extract      0.262957
Orcish Vandal          0.265870
Deadapult              0.275776
Explosive Apparatus    0.282453
Goblin Test Pilot      0.287948
Crackling Triton       0.287948
Name: Shock, dtype: float64

In [18]:
# 10 closest cards; the query card is dropped by label so a distance-0 tie cannot displace it
rec_df['Static Orb'].drop('Static Orb').nsmallest(10)

name
Winter Orb            0.305952
Stoic Angel           0.418682
Damping Field         0.499399
Imi Statue            0.499399
Dovin Baan            0.558457
Stabilizer            0.588946
Castle Raptors        0.600509
Armored Wolf-Rider    0.606081
Royal Falcon          0.606081
Alpha Myr             0.606081
Name: Static Orb, dtype: float64

In [19]:
# 10 closest cards; the query card is dropped by label so a distance-0 tie cannot displace it
rec_df['Prized Amalgam'].drop('Prized Amalgam').nsmallest(10)

name
Bone Dragon               0.438616
Footsteps of the Goryo    0.477674
Reassembling Skeleton     0.490087
Despoiler of Souls        0.504859
Scrapheap Scrounger       0.510044
Ghoulsteed                0.512485
Advanced Stitchwing       0.521772
Stitchwing Skaab          0.539938
Splendid Reclamation      0.544561
Chronosavant              0.549436
Name: Prized Amalgam, dtype: float64

In [20]:
# 10 closest cards. Damnation is at (numerically) zero distance from the query card, so the
# query card is dropped by label instead of being assumed to sort first.
rec_df['Wrath of God'].drop('Wrath of God').nsmallest(10)

name
Damnation              1.110223e-16
Perish                 2.111065e-01
Winds of Rath          2.232762e-01
Plague Wind            2.358599e-01
Day of Judgment        2.524550e-01
Extinguish All Hope    2.951452e-01
Soulscour              3.071714e-01
Plague Spores          3.124761e-01
Plasma Elemental       3.134562e-01
Final Judgment         3.134562e-01
Name: Wrath of God, dtype: float64

In [21]:
# 10 closest cards; the query card is dropped by label so a distance-0 tie cannot displace it
rec_df['Jace, the Mind Sculptor'].drop('Jace, the Mind Sculptor').nsmallest(10)

name
Brainstorm                0.450524
Riverwise Augur           0.454100
Coral Fighters            0.461954
Dream Cache               0.524370
Eye Spy                   0.525424
Voyage's End              0.528485
Cavalier of Gales         0.530216
Precognition              0.555188
Stormcaller of Keranos    0.568001
Watchful Automaton        0.568001
Name: Jace, the Mind Sculptor, dtype: float64

___

## Non-numerical data
Now I have to convert the non-numerical data into numerical data

In [22]:
# list the column dtypes again; the object columns are the ones that still need encoding
df.dtypes

name                       object
layout                     object
colors                     object
color_identity             object
mana_cost                  object
cmc                       float64
type_line                  object
card_type                  object
super_type                 object
sub_type                   object
oracle_text                object
oracle_text_token          object
legalities                 object
rarity                     object
power                      object
toughness                  object
loyalty                    object
card_faces                 object
activated_ability         float64
triggered_ability         float64
oracle_text_back           object
oracle_text_back_token     object
colors_back                object
power_back                 object
toughness_back             object
loyalty_back               object
card_type_back             object
super_type_back            object
sub_type_back              object
mana_cost_back

### colors

In [23]:
# Index the card table by name so a row can be fetched with df.loc[<card name>].
# Guarded so that re-running the cell is a no-op instead of a KeyError on the
# already-removed 'name' column.
if 'name' in df.columns:
    df = df.set_index('name')

In [24]:
# essentially making dummy (0/1) variables for colors and color identity
# I'm going to repeat this method for other columns as well
wburg = ['B', 'G', 'R', 'U', 'W']
for color in wburg:
    for source_col in ('colors', 'color_identity'):
        # One vectorized substring test per column instead of one .loc write per cell.
        # Assumes the source columns hold strings such as "['U']" (as read from the CSV).
        # .values assigns by position: converted_df was built from df in the same row order,
        # so the result does not depend on card names being unique in the index.
        converted_df[source_col + '_' + color] = (
            df[source_col].str.contains(color, regex=False, na=False).astype(int).values
        )

### card type

In [25]:
# dummy (0/1) column per card type: 1 when the type name occurs in the card_type string
card_types = ['Creature', 'Instant', 'Enchantment', 'Sorcery', 'Artifact', 'Land', 'Planeswalker', 'Tribal']
for ctype in card_types:
    # One vectorized substring test per type instead of one .loc write per cell.
    # .values assigns by position: converted_df was built from df in the same row order,
    # so the result does not depend on card names being unique in the index.
    converted_df['card_type_' + ctype] = (
        df['card_type'].str.contains(ctype, regex=False, na=False).astype(int).values
    )

### subtype

In [27]:
# frequency of each sub_type string; the recorded output shows over a thousand distinct
# values, many of them multi-word combinations such as "Human Wizard"
df['sub_type'].value_counts()

NONE                     6786
Aura                      840
Human Wizard              364
Spirit                    309
Human Soldier             300
                         ... 
Cephalid Wizard Scout       1
Zombie Satyr                1
Oyster                      1
Satyr Beast                 1
Frog Mutant                 1
Name: sub_type, Length: 1193, dtype: int64

### super_type

In [28]:
# frequency of each super_type string; only a handful of values in the recorded output,
# including the combined value "Legendary Snow"
df['super_type'].value_counts()

NONE              16915
Legendary          1112
Snow                 57
World                16
Basic                 6
Legendary Snow        2
Name: super_type, dtype: int64

### legalities

In [29]:
# frequency of each legalities string; each value is a space-separated list of formats
df['legalities'].value_counts()

modern legacy vintage penny commander duel                                3040
modern legacy pauper vintage penny commander duel                         2309
legacy vintage commander duel                                             2168
pioneer modern legacy vintage penny commander duel                        1820
pioneer modern legacy pauper vintage penny commander duel                 1449
                                                                          ... 
pioneer modern legacy commander duel                                         1
historic modern legacy vintage penny commander duel                          1
standard future historic pioneer modern legacy vintage commander brawl       1
pioneer modern legacy vintage duel                                           1
legacy commander duel                                                        1
Name: legalities, Length: 66, dtype: int64

### rarity

In [30]:
# frequency of each rarity level (four levels in the recorded output)
df['rarity'].value_counts()

common      6566
uncommon    5631
rare        5114
mythic       797
Name: rarity, dtype: int64

### power

In [31]:
# frequency of each power value; stored as strings, and the recorded output contains
# non-numeric entries (NONE, '*', '1+*') that need handling before numeric conversion
df['power'].value_counts()

NONE    8357
2       3038
1       2140
3       1796
4       1019
5        603
0        457
6        331
7        123
*        119
8         56
10        23
9         23
12         7
11         5
13         3
1+*        3
15         2
-1         2
16         1
Name: power, dtype: int64

### toughness

In [32]:
# frequency of each toughness value; stored as strings, and the recorded output contains
# non-numeric entries (NONE, '*', '1+*') that need handling before numeric conversion
df['toughness'].value_counts()

NONE    8357
2       2557
1       2333
3       1964
4       1311
5        691
6        366
0        151
7        138
*        103
8         62
9         24
10        22
11         7
12         7
13         5
1+*        5
15         2
16         1
14         1
-1         1
Name: toughness, dtype: int64

### loyalty

In [33]:
# frequency of each loyalty value; stored as strings, and the recorded output contains
# non-numeric entries (NONE, 'X') that need handling before numeric conversion
df['loyalty'].value_counts()

NONE    17919
5          64
4          62
3          42
6           9
7           7
2           4
X           1
Name: loyalty, dtype: int64