In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

# Vanilla WoW ninjaloot predictor

## What is ninjalooting?
In an MMORPG, the act of looting and/or attempting to loot items (usually of significant rarity and value) without the permission of the group/raid.
## Abstract
This project attempts to predict if an item in World of Warcraft (Patch 1.7) is worth ninjalooting. Our data contains over 14000 items from the game's database with 128 attributes each. The data doesn't contain labels that say if an item is ninjalooted or not. We'll have to add that label ourselves.

#### We will:
* todo

In [3]:
# Loading the data

In [4]:
items = pd.read_csv('data/vanilla_wow_items.csv', sep = ";")

In [5]:
items.head()

Unnamed: 0,entry,class,subclass,name,displayid,Quality,Flags,BuyCount,BuyPrice,SellPrice,...,area,Map,BagFamily,ScriptName,DisenchantID,FoodType,minMoneyLoot,maxMoneyLoot,Duration,ExtraFlags
0,25,2,7,'Worn Shortsword',1542,1,0,1,35,7,...,0,0,0,'',0,0,0,0,0,0
1,35,2,10,'Bent Staff',472,1,0,1,47,9,...,0,0,0,'',0,0,0,0,0,0
2,36,2,4,'Worn Mace',5194,1,0,1,38,7,...,0,0,0,'',0,0,0,0,0,0
3,37,2,0,'Worn Axe',14029,1,0,1,38,7,...,0,0,0,'',0,0,0,0,0,0
4,38,4,0,'Recruit\'s Shirt',9891,1,0,1,1,1,...,0,0,0,'',0,0,0,0,0,0


In [6]:
items.shape

(14402, 128)

In [7]:
# This dataframe doesn't have labels so we need to create our own based on common knowledge and personal experience...

In [8]:
most_commonly_ninjad = ["'Shadowfang'","'Assassin\'s Blade'", "'Corpsemaker'", "'The Rockpounder'", "'Blackstone Ring'",
                        "'Hand of Justice'", "'Ironfoe'", "'Savage Gladiator Chain'", "'Cape of the Black Baron'",
                        "'Deathcharger\'s Reins'", "'Recipe: Flask of Distilled Wisdom'", "'Recipe: Flask of the Titans'",
                        "'Recipe: Flask of Supreme Power'", "'Righteous Orb'", "'Pattern: Truefaith Vestments'",
                        "'Tarnished Elven Ring'", "'Rod of the Ogre Magi'", "'Foror\'s Compendium of Dragon Slaying'",
                        "'Dark Rune'", "'Pattern: Robe of the Void'", "'Felstriker'", "'Dal\'Rend\'s Sacred Charge'",
                        "'Dal\'Rend\'s Tribal Guardian'", "'Truestrike Shoulders'", "'Briarwood Reed'",
                        "'Pristine Hide of the Beast'", "'Ace of Beasts'", "'Alcor\'s Sunrazor'", "'Edgemaster\'s Handguards'",
                        "'Freezing Band'", "'Krol Blade'", "'Stockade Pauldrons'", "'Pristine Black Diamond'"]
items['ninjalooted'] = np.where(items['name'].isin(most_commonly_ninjad), 1, 0)

In [9]:
# We see that almost all items are not worth ninjalooting
items['ninjalooted'].value_counts(normalize=True) * 100

0    99.81947
1     0.18053
Name: ninjalooted, dtype: float64

In [10]:
items_attributes = items.drop('ninjalooted', axis = 1)
items_labels = items.ninjalooted

In [11]:
# Exploring the data
# While making the csv file out of an sql, I noted a couple of attributes that might be interesting

In [12]:
interesting_attributes = items_attributes[['name', 'class', "subclass", "Quality", "BuyPrice", "SellPrice", "InventoryType",
                                           "ItemLevel", "RequiredLevel", "RequiredSkill",
                                           "dmg_min1", "dmg_max1", "bonding", "MaxDurability"]]

In [13]:
interesting_attributes.loc[interesting_attributes['name'].isin(most_commonly_ninjad)]

Unnamed: 0,name,class,subclass,Quality,BuyPrice,SellPrice,InventoryType,ItemLevel,RequiredLevel,RequiredSkill,dmg_min1,dmg_max1,bonding,MaxDurability
217,'Freezing Band',4,0,4,18000,4500,11,52,47,0,0.0,0.0,2,0
444,'Shadowfang',2,7,3,14822,2964,21,24,19,0,29.0,55.0,2,75
875,'Krol Blade',2,7,4,259289,51857,21,56,51,0,80.0,149.0,2,105
4169,'Corpsemaker',2,1,3,49652,9930,17,34,29,0,88.0,132.0,1,100
5458,'The Rockpounder',2,5,3,161244,32248,17,49,44,0,126.0,190.0,1,100
6893,'Ironfoe',2,4,4,315430,63086,21,60,55,0,73.0,136.0,1,105
6902,'Savage Gladiator Chain',4,3,4,167666,33533,5,57,52,0,0.0,0.0,1,140
6951,'Hand of Justice',4,0,3,40000,10000,12,58,53,0,0.0,0.0,1,0
7407,'Felstriker',2,15,4,378124,75624,13,63,58,0,54.0,101.0,1,75
7508,'Pristine Hide of the Beast',15,0,4,0,0,0,1,0,0,0.0,0.0,0,0


In [14]:
interesting_attributes.loc[interesting_attributes['name'].isin(most_commonly_ninjad)].describe()

Unnamed: 0,class,subclass,Quality,BuyPrice,SellPrice,InventoryType,ItemLevel,RequiredLevel,RequiredSkill,dmg_min1,dmg_max1,bonding,MaxDurability
count,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0
mean,5.923077,3.115385,3.153846,100722.923077,21413.5,8.115385,49.653846,30.384615,34.884615,21.769231,36.076923,0.769231,36.923077
std,4.184955,3.787632,0.7317,108823.300966,21403.610957,7.99163,20.087692,25.90147,73.1302,40.120127,64.569914,0.710363,49.418153
min,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.5,0.0,3.0,40000.0,10000.0,0.0,52.5,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,2.0,3.0,59282.5,14820.5,8.0,59.0,45.5,0.0,0.0,0.0,1.0,0.0
75%,9.0,5.75,4.0,117956.25,28505.0,15.25,60.75,54.5,0.0,21.75,41.25,1.0,93.75
max,15.0,15.0,4.0,378124.0,75624.0,21.0,63.0,58.0,197.0,126.0,190.0,2.0,140.0


In [15]:
interesting_attributes.describe()

Unnamed: 0,class,subclass,Quality,BuyPrice,SellPrice,InventoryType,ItemLevel,RequiredLevel,RequiredSkill,dmg_min1,dmg_max1,bonding,MaxDurability
count,14402.0,14402.0,14402.0,14402.0,14402.0,14402.0,14402.0,14402.0,14402.0,14402.0,14402.0,14402.0,14402.0
mean,5.960978,2.161505,1.929524,59919.18,12067.6,6.622136,34.805305,22.333009,16.905221,6.283811,10.50989,1.405916,29.772184
std,4.225339,3.199676,1.059492,244103.0,42305.23,7.150818,25.12265,23.938199,76.410063,23.227454,37.445977,1.232305,35.220783
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.0,1.0,60.0,9.0,0.0,10.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,4.0,1.0,2.0,7364.5,1439.0,5.0,36.0,13.0,0.0,0.0,0.0,1.0,16.0
75%,9.0,3.0,3.0,51389.0,10476.75,11.0,58.0,47.0,0.0,0.0,0.0,2.0,55.0
max,15.0,20.0,6.0,10000000.0,1632328.0,28.0,100.0,60.0,762.0,289.0,435.0,5.0,165.0


In [16]:
# From the descriptions we can see our sought after items have a 
# higher mean Quality, ItemLevel, min/max damage and buy/sell price.
# I think the models we will train have to take those attributes into consideration

In [17]:
# Preparing the data for training

In [18]:
interesting_attributes = interesting_attributes.drop('name', axis = 1)
really_interesting_attributes = interesting_attributes.drop(['InventoryType', 'MaxDurability'], axis = 1)

In [19]:
scaler = MinMaxScaler(feature_range = (0, 1))
interesting_attributes = scaler.fit_transform(interesting_attributes)
really_interesting_attributes = scaler.fit_transform(really_interesting_attributes)

In [None]:
# We should test our models on all the data first, before removing attributes

In [41]:
item_attributes_train, item_attributes_test, item_labels_train, item_labels_test = train_test_split(
    pd.get_dummies(items_attributes), items_labels, train_size = 0.7, test_size = 0.3, stratify = items_labels)

In [42]:
k_fold = StratifiedKFold(n_splits = 5)
k_fold = k_fold.get_n_splits(item_attributes_train, item_labels_train)

In [43]:
# Training the models

In [44]:
# Decision Tree

In [45]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(item_attributes_train, item_labels_train)
decision_tree_baseline_score = decision_tree_model.score(item_attributes_test, item_labels_test)
decision_tree_baseline_score

0.9981485767183522

In [46]:
# The scoring method for our baseline decision tree is useless due to the data being highly imbalanced

In [49]:
item_labels_pred = decision_tree_model.predict(item_attributes_test)
decision_tree_f1_score = f1_score(item_labels_pred, item_labels_test)
decision_tree_f1_score

  'recall', 'true', average, warn_for)


0.0

In [82]:
# Random Forest

In [50]:
random_forest_model = RandomForestClassifier()
random_forest_model.fit(item_attributes_train, item_labels_train)
random_forest_baseline_score = random_forest_model.score(item_attributes_test, item_labels_test)
random_forest_f1_train_score = f1_score(random_forest_model.predict(item_attributes_train), item_labels_train)
print(random_forest_baseline_score)
print(random_forest_f1_train_score)

0.9979171488081463
0.9714285714285714


In [51]:
item_labels_pred = random_forest_model.predict(item_attributes_test)
random_forest_f1_score = f1_score(item_labels_pred, item_labels_test)
random_forest_f1_score

0.0

In [85]:
# Both trees fair equally poorly

In [86]:
# AdaBoost

In [87]:
tree = DecisionTreeClassifier()
ada = AdaBoostClassifier(base_estimator = tree, n_estimators = 100, learning_rate = 0.1)
ada.fit(item_attributes_train, item_labels_train)
train_pred = accuracy_score(item_labels_train, ada.predict(item_attributes_train))
test_pred = accuracy_score(item_labels_test, ada.predict(item_attributes_test))
item_labels_pred = ada.predict(item_attributes_test)
adaBoost_f1_score = f1_score(item_labels_pred, item_labels_test)
print(train_pred, test_pred, adaBoost_f1_score)

0.9997024104751513 0.9974542929877344 0.0


In [88]:
# Unfortunately none of the classifiers show good results.
# I have tried the regression and svm algorithms but they have yielded simular results.
# In conclusion the phrase garbage in, garbage out cannot be more true. I'm very sorry.

In [89]:
logistic_model = LogisticRegression(C = 1e6)
logistic_model.fit(item_attributes_train, item_labels_train)

LogisticRegression(C=1000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [90]:
logistic_model.score(item_attributes_test, item_labels_test)

0.9981485767183522