<a href="https://colab.research.google.com/github/11shoe/mtg_cards/blob/main/Unit_2_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install category_encoders==2.*

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

  import pandas.util.testing as tm


In [3]:
url = 'https://raw.githubusercontent.com/11shoe/mtg_cards/main/cards.csv'
df = pd.read_csv(url,
                 index_col = 'name')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def wrangle(df):

  # removing cards that are reprints
  df = df[df['isReprint'] != 1]

  # getting rid of what are considered "joke" sets
  df = df[df['setCode'] != 'UST']
  df = df[df['setCode'] != 'UND']
  df = df[df['setCode'] != 'UNH']
  df = df[df['setCode'] != 'UGL']

  # I do not want to include any "special" or "bonus" rarity in the data because 
  # those are not normal rarities
  df = df[(df['rarity']!='special') & (df['rarity']!='bonus')]

  # The NaN values in the target column are actual colorless cards
  df['colorIdentity'].fillna('C', inplace=True)


  # I am narrowing the data to only those cards that have a specific color identity
  # instead of a mixure of colors or no color identity
  df = df[(df['colorIdentity'] == 'W') | (df['colorIdentity'] == 'U') | (df['colorIdentity'] == 'B') | (df['colorIdentity'] == 'R') | (df['colorIdentity'] == 'G') | (df['colorIdentity'] == 'C')]

  # I just want to look at creatures for this project
  df= df[df['types'].str.contains('Creature')]

  # The features that will be most important from this data set
  df = df[['types', 'subtypes','rarity','power', 'toughness', 'keywords', 'manaValue', 'colorIdentity']]



  return df

In [5]:
df = wrangle(df)

In [6]:
df

Unnamed: 0_level_0,types,subtypes,rarity,power,toughness,keywords,manaValue,colorIdentity
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Auriok Champion,Creature,"Human,Cleric",rare,1,1,Protection,2.0,W
Auriok Salvagers,Creature,"Human,Soldier",uncommon,2,4,,4.0,W
Auriok Windwalker,Creature,"Human,Wizard",rare,2,3,Flying,4.0,W
Leonin Squire,Creature,"Cat,Soldier",common,2,2,,2.0,W
Loxodon Anchorite,Creature,"Elephant,Cleric",common,2,3,,4.0,W
...,...,...,...,...,...,...,...,...
Oran-Rief Ooze,Creature,Ooze,rare,2,2,,3.0,G
Swarm Shambler,Creature,"Fungus,Beast",rare,0,0,,1.0,G
Tajuru Paragon,Creature,Elf,rare,3,2,Kicker,2.0,G
Myriad Construct,"Artifact,Creature",Construct,rare,4,4,Kicker,4.0,C


In [7]:
# This will be my target. I want to see if I can predict a card's color identity 
# from the information on the card itself
target = 'colorIdentity'
X = df.drop(columns = target)
y = df[target]

In [8]:
# splitting the data into my traing and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)

In [9]:
y_train.value_counts(normalize= True)

G    0.202273
W    0.195730
B    0.191482
R    0.186546
U    0.172196
C    0.051774
Name: colorIdentity, dtype: float64

In [10]:
# 'G' is the most frequently occuring color so we will use that as the baseline
baseline = y_train.value_counts(normalize= True).max()
print('The baseline for this data is:', baseline)

The baseline for this data is: 0.20227298817586958


In [11]:
model_dt = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    DecisionTreeClassifier(random_state=42)
)

In [12]:
model_dt.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['types', 'subtypes', 'rarity', 'power',
                                      'toughness', 'keywords'],
                                mapping=[{'col': 'types',
                                          'data_type': dtype('O'),
                                          'mapping': Creature                1
Artifact,Creature       2
Enchantment,Creature    3
Land,Creature           4
Instant,Creature        5
NaN                    -2
dtype: int64},
                                         {'col': 'subtypes',
                                          'data_type': dtype('O'),
                                          'mapping': Beast                 1
Kithkin               2
Unicorn               3
Human,Wa...
dtype: int64},
                                         {'col': 'keywords',
                                          'data_type': dtype('O'),
                                          'mapping': NaN          

In [13]:
print(accuracy_score(y_test, model_dt.predict(X_test)))

0.5977961432506887


In [14]:
print(classification_report(y_test, model_dt.predict(X_test)))

              precision    recall  f1-score   support

           B       0.54      0.53      0.54       420
           C       0.89      0.89      0.89       118
           G       0.58      0.63      0.60       449
           R       0.54      0.56      0.55       413
           U       0.61      0.57      0.59       375
           W       0.65      0.61      0.63       403

    accuracy                           0.60      2178
   macro avg       0.63      0.63      0.63      2178
weighted avg       0.60      0.60      0.60      2178



In [15]:
model_lr = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    StandardScaler(),
    LogisticRegression(max_iter= 200)
)

In [16]:
model_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['types', 'subtypes', 'rarity', 'power',
                                     'toughness', 'keywords'],
                               use_cat_names=True)),
                ('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=200))])

In [17]:
accuracy_score(y_test, model_lr.predict(X_test))

0.6965105601469238

In [18]:
print(classification_report(y_test, model_lr.predict(X_test)))

              precision    recall  f1-score   support

           B       0.72      0.62      0.67       420
           C       0.87      0.92      0.90       118
           G       0.72      0.72      0.72       449
           R       0.62      0.62      0.62       413
           U       0.69      0.70      0.70       375
           W       0.68      0.76      0.72       403

    accuracy                           0.70      2178
   macro avg       0.72      0.72      0.72      2178
weighted avg       0.70      0.70      0.70      2178

