In [1]:
import os
import pandas as pd

allcards = pd.read_json('AllCards.json')

In [152]:
import numpy as np
import matplotlib.pyplot as plt
import itertools
import math

decklists_path = 'data/decks/'

class ArchetypeWorkshop():
        def __init__(self):
            self.deckfiles = []
            self.loadAllDeckFilenames()
            self.decks = []
            self.loadAllDecks()
            self.cards = []
            self.loadAllUsedCards()
            self.used_cards = []
            self.loadAllUsedCardsData()
            self.allcards =  allcards#[]
            #self.loadAllCardsData()
            self.deck_and_name=[]
            self.df = []
            self.decknames = []
            
            self.all_params = [ 
                'AverageCMC',
                'AverageCMCCreatures',
                'CreatureDensity',
                'PowerToToughness',
                'PowerToCMC',
                'ToughnessToCMC',
                'MaxCMC',
                'AveragePower',
                'AverageToughness',
                'PlaneswalkerCreatureDensity']
            
        def loadAllDeckFilenames(self):
            if len(self.deckfiles) == 0:
                tmp = os.listdir(decklists_path)
                deckfiles = []
                for i in tmp:
                    deckfiles.append(i.replace('~', ''))
                self.deckfiles = list(set(deckfiles))
                
        def loadDecks(self, filename):
            data = pd.read_json('data/decks/'+filename)
            return data['cards']
    
        def loadAllDecks(self):
            self.loadAllDeckFilenames()
            for i in self.deckfiles:
                tmp_decks = self.loadDecks(i)
                for d in tmp_decks:
                    self.decks.append(d)
    
        def loadAllUsedCards(self):
            if len(self.cards) == 0:
                self.loadAllDecks()
                for i in self.decks:
                    for j in i:
                        if "(ORI)" not in j:
                            self.cards.append(j)
                self.cards = sorted(list(set(self.cards)))
                
        def loadAllCardsData(self):
            if os.path.isfile('AllCards.json') == False:
                self.downloadAllCardsData()
            if len(self.allcards) == 0:
                self.allcards = pd.read_json('AllCards.json')
                
        def loadAllUsedCardsData(self):
            self.used_cards = pd.read_json('used_cards.json')
                
        def prepareAllUsedCardsData(self):
            used_cards = []
            for i in self.cards:
                used_cards.append(self.allcards[i])
            self.used_cards = used_cards
            
        def saveUsedCardsAsJson(self):
            df = pd.DataFrame(self.used_cards)
            df.to_json('used_cards.json')
            
        # actual archetype detection methods start here
        
        def loadDecksWithNames(self, filename):
            data = pd.read_json('data/decks/'+filename)
            return data['cards'], data['name']
        
        def loadAllDecksWithNames(self):
            self.loadAllDeckFilenames()
            for i in self.deckfiles:
                tmp_decks, tmp_names = self.loadDecksWithNames(i)
                for i in range(0, len(tmp_decks)):
                    self.deck_and_name.append((tmp_decks[i], tmp_names[i]))
                    
        def loadDeckWithAllInfo(self, filename):
            data = pd.read_json('data/decks/'+filename)
            return data['cards'], data['name'], data['player'], data['draft']
        
        def loadAllDecksWithAll(self):
            self.loadAllDeckFilenames()
            for i in self.deckfiles:
                tmp_decks, tmp_names, tmp_players, tmp_drafts = self.loadDeckWithAllInfo(i)
                for i in range(0, len(tmp_decks)):
                    self.deck_and_name.append([tmp_decks[i], tmp_names[i], tmp_players[i], tmp_drafts[i]])
            self.prepareDecknames()
                    
        def showCardTypes(self, deck):
            for i in deck:
                print self.allcards[i]['types']
                
        def getNonlandsOnly(self, deck):
            nonlands = []
            for i in deck:
                if 'Land' not in self.allcards[i]['types']:
                    nonlands.append(i)
            return nonlands
        
        def getNumberOfCardType(self, deck, cardtype):
            acc = 0
            for i in deck:
                if cardtype in self.allcards[i]['types']:
                    acc+=1
            return acc
        
        def getAverageCMC(self, deck):
            acc = 0
            for i in deck:
                acc+=self.allcards[i]['convertedManaCost']
                
            return round((acc+0.0)/len(deck), 4)
        
        def getAverageCMCForType(self, deck, cardtype):
            acc = 0
            counter = 0
            for i in deck:
                if cardtype in self.allcards[i]['types']:
                    acc+=self.allcards[i]['convertedManaCost']
                    counter+=1
            return round((acc+0.0)/counter, 4)
        
        def getMinMaxMedian(self, deck):
            cmcs = []
            for i in deck:
                cmcs.append(self.allcards[i]['convertedManaCost'])
                
            npcmcs = np.array(cmcs)
            return {
                'min': np.min(npcmcs),
                'max': np.max(npcmcs),
                'median': np.median(npcmcs)
            }
        
        def getAveragePowerToughness(self, deck):
            total_power = 0
            total_tough = 0
            total_cmc=0

            number = self.getNumberOfCardType(deck, 'Creature')
            for i in deck:
                if 'Creature' in self.allcards[i]['types']:
                    if self.allcards[i]['power'] not in ['X', '*', '1+*']:
                        total_power += int(self.allcards[i]['power'])
                    if self.allcards[i]['toughness'] not in ['X', '*', '1+*']:
                        total_tough += int(self.allcards[i]['toughness'])
                    total_cmc+=int(self.allcards[i]['convertedManaCost'])
            return {
                'ave_power': round((total_power+0.0)/number, 4),
                'ave_tough': round((total_tough+0.0)/number, 4),
                'total_power': total_power,
                'total_tough': total_tough,
                'power_cmc': round(total_power/(0.0+total_cmc), 4),
                'tough_cmc': round(total_tough/(0.0+total_cmc), 4)
            }
            
        def examineDeck(self, i, chosen_params):
            deck = self.deck_and_name[i][0]
            nonlands = self.getNonlandsOnly(deck)
            deck_name = self.deck_and_name[i][1]
            player_name = self.deck_and_name[i][2]
            draft_num = self.deck_and_name[i][3]
            tmp = deck_name.split(' ')
            simple_name = tmp[1]
                    
            min_max_median = self.getMinMaxMedian(deck)
            average_power_toughness = self.getAveragePowerToughness(nonlands)
            creature_density = round(self.getNumberOfCardType(nonlands, 'Creature')/(0.0 + len(nonlands)), 4)
            pwcr_density = round((self.getNumberOfCardType(nonlands, 'Planeswalker')+
                                self.getNumberOfCardType(nonlands, 'Creature'))/(0.0 + len(nonlands)), 4)    
            power_to_toughness = round(average_power_toughness['total_power']/(0.0+average_power_toughness['total_tough']), 4)
                     
            param_dict = {'Name': simple_name, 'Player and draft':player_name+'_'+draft_num}
            
            if 'AverageCMC' in chosen_params: 
                param_dict['AverageCMC'] = self.getAverageCMC(nonlands)
            
            if 'AverageCMCCreatures' in chosen_params: 
                param_dict['AverageCMCCreatures'] = self.getAverageCMCForType(deck, 'Creature')
                
            if 'CreatureDensity' in chosen_params: 
                param_dict['CreatureDensity'] = creature_density
                
            if 'PlaneswalkerCreatureDensity' in chosen_params:
                param_dict['PlaneswalkerCreatureDensity'] = pwcr_density 
                
            if 'PowerToToughness' in chosen_params: 
                param_dict['PowerToToughness'] = power_to_toughness
                
            if 'PowerToCMC' in chosen_params: 
                param_dict['PowerToCMC'] = average_power_toughness['power_cmc']
                
            if 'ToughnessToCMC' in chosen_params: 
                param_dict['ToughnessToCMC'] = average_power_toughness['tough_cmc']
                
            if 'MaxCMC' in chosen_params: 
                param_dict['MaxCMC'] = min_max_median['max']
                
            if 'AveragePower' in chosen_params: 
                param_dict['AveragePower'] = average_power_toughness['ave_power']
                
            if 'AverageToughness' in chosen_params: 
                param_dict['AverageToughness'] = average_power_toughness['ave_tough']
        
            return param_dict
        
        def examineAllDecks(self, chosen_params, chosen_tags):
            self.results_all_decks = []
            for i in range(len(self.deck_and_name)):
                self.results_all_decks.append(self.examineDeck(i, chosen_params))
                
            filename = 'relevant_cards.csv'
            my_file = open(filename, 'r')
            my_string = my_file.read()
            my_string = my_string.replace('#', '')
            my_string = my_string.replace('\r', '')
            my_list = my_string.split('\n')
            
            self.cardtag_dict = {}
            for i in my_list:
                tmp = i.split("|")
                self.cardtag_dict[tmp[0]] = tmp[1:]
            
            self.convertAllDecksToTags(chosen_tags)
                    
        def convertResultsToDataFrame(self):
            my_keys = self.results_all_decks[0].keys()
            self.pre_df = {}
            for i in my_keys:
                self.pre_df[i] = []
            
            for i in self.results_all_decks:
                for j in my_keys:
                    self.pre_df[j].append(i[j])
            
            self.df = pd.DataFrame.from_dict(self.pre_df)
            
        def showDistributionForEachParameter(self):
            my_keys = self.results_all_decks[0].keys()
            self.pre_df = {}
            for i in my_keys:
                self.pre_df[i] = []
            
            for i in self.results_all_decks:
                for j in my_keys:
                    self.pre_df[j].append(i[j])
                        
            for i in my_keys:
                if i not in ['Name', 'Player and draft']:
                    print i,'Max: ', max(self.pre_df[i]), 'Min: ', min(self.pre_df[i])
                    hist = self.df[i].hist(bins = 20)
                    plt.style.use('ggplot')
                    plt.show()
                    
        def getAllNumericalKeyPairs(self):
            my_keys = self.results_all_decks[0].keys()
            numeric_keys = []
            for i in my_keys:
                if i not in ['Name', 'Player and draft']:
                    numeric_keys.append(i)
                    
            pairs = []
            for i in range (0, len(numeric_keys)):
                for j in range (i+1, len(numeric_keys)):
                    pairs.append((numeric_keys[i], numeric_keys[j]))
                
            return pairs
        
        def getColorsByArchetypes(self):
            archetype_color_dict = {
                'Aggro': 'red',
                'Control': 'blue',
                'Mid-Range': 'orange',
                'Ramp': 'green'
            }
            colors = []
            for i in self.pre_df['Name']:
                colors.append(archetype_color_dict[i])
            
            return colors   
                    
        def showScatterPlotsForParameterPairs(self):
            key_pairs = self.getAllNumericalKeyPairs()
            colors = self.getColorsByArchetypes()
            
            for i in range(0, len(key_pairs)):          
                x_key = key_pairs[i][0]
                y_key = key_pairs[i][1]
            
                x = self.pre_df[x_key]
                y = self.pre_df[y_key]
          
                plt.scatter(x, y, c=colors, alpha=0.5)
                plt.title('Possible correlation between '+x_key+' and '+y_key)
                plt.xlabel(x_key)
                plt.ylabel(y_key)
                 
                plt.show()
                
        def normalizeSingleValue(self, val, p_min, p_max):
             return round((val - p_min + 0.0) / (p_max - p_min), 4)
                
        def normalizeArray(self, my_array):
            tmp_arr = np.array(my_array)
            p_min = min(tmp_arr) 
            p_max = max(tmp_arr)
            
            normalized_array = []
            for i in my_array:
                normalized_value = self.normalizeSingleValue(i, p_min, p_max)
                normalized_array.append(normalized_value)
            
            return normalized_array
        
        def normalizeDataFrame(self):
            self.normalized_pre_df = {}
            my_keys = self.results_all_decks[0].keys()
            for k in my_keys:
                if k not in ['Name', 'Player and draft']:
                    norm_param = self.normalizeArray(self.pre_df[k])
                    self.normalized_pre_df[k] = norm_param
                else:
                    self.normalized_pre_df[k] = self.pre_df[k]
            
            self.normalized_df = pd.DataFrame.from_dict(self.normalized_pre_df)
            
        def convertNormalizedPreDfToCoordinates(self):
            my_keys = self.results_all_decks[0].keys()
            converted = {}
            for i in range(0, len(self.normalized_pre_df['Name'])):
                tmp_array = [] 
                tmp_key = self.normalized_pre_df['Player and draft'][i]+' '+self.normalized_pre_df['Name'][i]
                for j in my_keys:
                    if j not in ['Player and draft', 'Name']:
                        tmp_array.append(self.normalized_pre_df[j][i])

                converted[tmp_key]=tmp_array
            self.deck_coordinates = converted
            
        def calculateDistanceBetweenTwoDecks(self, co1, co2):
            sum = 0
            for i in range(0, len(co1)):
                sum+=(co1[i]-co2[i])**2
            distance = np.sqrt(sum)
            return distance
            
        def findClosestDeck(self, deck_name, show_option):
            my_keys = self.deck_coordinates.keys()
            my_deck_co = self.deck_coordinates[deck_name]
            min_dist = 100
            close_deck = 'None'
            for i in my_keys:
                if (i != deck_name):
                    tmp_dist = self.calculateDistanceBetweenTwoDecks(my_deck_co, self.deck_coordinates[i])
                    if tmp_dist < min_dist:
                        min_dist = tmp_dist
                        close_deck = i
                        
            tmp1 = deck_name.split(' ')
            tmp2 = close_deck.split(' ')
            if tmp1[1] == tmp2[1]:
                same_archetype = True
            else:
                same_archetype = False
            prepared_info = {
                "name": tmp1[0],
                "original": tmp1[1], 
                "classified":tmp2[1]   
            }
            if show_option:
                print 'Closest deck to '+deck_name+' is : '+close_deck+' with distance = '+str(min_dist)+' Archetype: '+str(same_archetype)
            return(same_archetype, prepared_info)  
            
        def findClosestDeckForEachDeck(self, show_option):
            correct = 0
            wrong = 0
            my_keys = self.deck_coordinates.keys()
            gathered_info = []
            for i in my_keys:
                result, info = self.findClosestDeck(i, show_option)
                gathered_info.append(info)
                if result:
                    correct +=1
                else:
                    wrong +=1
            #print(gathered_info) ---> gahtered info is here
            return round((correct+0.0)/(correct+wrong), 4)*100, gathered_info
        
        def findKNN(self, k, deck_name, show_option):
            my_keys = self.deck_coordinates.keys()
            
            my_deck_co = self.deck_coordinates[deck_name]
        
            all_distances = []
            
            for i in my_keys:
                if (i != deck_name):
                    tmp_dist = self.calculateDistanceBetweenTwoDecks(my_deck_co, self.deck_coordinates[i])
                    
                    tmp_name = i.split(' ')
                        
                    all_distances.append([tmp_dist, tmp_name[1]]) 
           
            nns = sorted(all_distances)
            
            vote_lib = { 'Aggro': 0.0, 'Ramp': 0.0, 'Control': 0.0, 'Mid-Range': 0.0}
            knns = nns[0:k]
            for i in knns:
                vote_lib[i[1]]+=1
            
            top_vote = 0.0
            top_style = ""
            for a in vote_lib:
                if vote_lib[a] > top_vote:
                    top_vote = vote_lib[a]
                    top_style = a

            tmp = deck_name.split(' ')
            archetype_deckname = tmp[1]
            
            if archetype_deckname == top_style:
                same_archetype = True
            else:
                same_archetype = False
            prepared_info = {
                "name": tmp[0],
                "original": tmp[1], 
                "classified":top_style
            }

            return(same_archetype, prepared_info)   
 
        
        def findKNNForAll(self, k, show_option):
            correct = 0
            wrong = 0
            my_keys = self.deck_coordinates.keys()
            gathered_info = []
            for i in my_keys:
                result, info = self.findKNN(k, i, show_option)
                gathered_info.append(info)
                if result:
                    correct +=1
                else:
                    wrong +=1
                    
            return round((correct+0.0)/(correct+wrong), 4)*100, gathered_info

            
        def getDeckByName(self, deckname):
            for i in self.deck_and_name:
                tmp = i[1].split(' ')
                n = i[2]+'_'+i[3]+' '+tmp[1]
                if n == deckname:
                    return i
                
        def prepareDecknames(self):
            for i in self.deck_and_name:
                tmp = i[1].split(' ')
                n = i[2]+'_'+i[3]+' '+tmp[1]
                self.decknames.append(n)
                
        def getArchetypeLibForCard(self, cardname, deckname):
            archetype_lib = { 'Aggro': 0, 'Ramp': 0, 'Control': 0, 'Mid-Range': 0}
            for i in self.deck_and_name:
                tmp = i[1].split(' ')
                n = i[2]+'_'+i[3]+' '+tmp[1]
                if (cardname in i[0] and n != deckname):
                    archetype_lib[tmp[1]]+=1
                    
            return archetype_lib
        
        def normalizeArchetypeCardLib(self, archetype_card_lib):
            total = 0
            for i in archetype_card_lib.keys():
                total+=archetype_card_lib[i]
                
            if total ==0:
                return { 'Aggro': 0.0, 'Ramp': 0.0, 'Control': 0.0, 'Mid-Range': 0.0}
            
            else:
                return { 'Aggro': round((archetype_card_lib['Aggro']+0.0)/total, 4), 
                        'Ramp': round((archetype_card_lib['Ramp']+0.0)/total, 4),
                        'Control':round((archetype_card_lib['Control']+0.0)/total, 4),
                        'Mid-Range':round((archetype_card_lib['Mid-Range']+0.0)/total, 4)
                       }
                
        def findArchetypeByCardVote(self, deckname):
            my_deck = self.getDeckByName(deckname)
            card_lib_deck = {}
            for i in my_deck[0]:
                card_lib_deck[i] = self.getArchetypeLibForCard(i, deckname)
                
            normalized_lib = {}
            for i in card_lib_deck.keys():
                normalized_lib[i] = self.normalizeArchetypeCardLib(card_lib_deck[i])
                
            vote_lib = { 'Aggro': 0.0, 'Ramp': 0.0, 'Control': 0.0, 'Mid-Range': 0.0}
            for n in normalized_lib:
                for a in normalized_lib[n]:
                    vote_lib[a]+=normalized_lib[n][a]
                    
            top_vote = 0.0
            top_style = ""
            for a in vote_lib:
                if vote_lib[a] > top_vote:
                    top_vote = vote_lib[a]
                    top_style = a
                    
            #print(top_vote, top_style)

            tmp = deckname.split(' ')
            archetype_deckname = tmp[1]
            
            if archetype_deckname == top_style:
                same_archetype = True
            else:
                same_archetype = False
            prepared_info = {
                "name": tmp[0],
                "original": tmp[1], 
                "classified":top_style
            }

            return(same_archetype, prepared_info)
        
        def findArchetypeByCardVoteAll(self):         
            correct = 0
            wrong = 0
            gathered_info = []
            for i in self.decknames:
                result, info = self.findArchetypeByCardVote(i)
                gathered_info.append(info)
                if result:
                    correct +=1
                else:
                    wrong +=1
            # print(gathered_info) 
            return round((correct+0.0)/(correct+wrong), 4)*100, gathered_info
        
        def getCenterParameter(self, my_array):
            my_array = sorted(my_array)
            l = len(my_array)
            if l % 2 == 1:
                return round(my_array[l/2 -1], 4)
            else:
                return round((my_array[l/2 -1] + my_array[l/2])/2, 4)
        
        def findArchetypeByCenterPointAll(self):
            styles = list(set(self.normalized_df['Name']))
            styles_dict = {}
            
            # build styles library with separated data frames
            
            for style in styles:
                is_style = (self.normalized_df['Name'] == style) 
                styles_dict[style] = self.normalized_df[is_style]
                
            # find parameters of center point for each style
            
            center_points = {}
            for i in styles_dict:
                center_points[i] = {}
                for k in styles_dict[i]:
                    if k not in ['Name', 'Player and draft']:
                        tmp = list(styles_dict[i][k])
                        center_points[i][k] = self.getCenterParameter(tmp)
            # print(center_points)
            
            coordinates_seq = self.normalized_pre_df.keys()
            
            center_point_coordinates = {}
            for i in center_points:
                center_point_coordinates[i] = []
                for k in coordinates_seq:
                    if k not in ['Name', 'Player and draft']:
                        center_point_coordinates[i].append(center_points[i][k])
                
            #print(center_point_coordinates)
            
            # classify by comparison to this point
            
            correct = 0
            wrong = 0
            gathered_info = []
            
            for deck in self.deck_coordinates:
                min_dist = 100.0
                closest_style = 'None'
                for point in center_point_coordinates:
                    co_deck = self.deck_coordinates[deck]
                    co_point = center_point_coordinates[point]
                    tmp_dist = self.calculateDistanceBetweenTwoDecks(co_deck, co_point)
                    if tmp_dist < min_dist:
                        min_dist = tmp_dist
                        closest_style = point
                tmp = deck.split(' ')
                prepared_info = {
                "name": tmp[0],
                "original": tmp[1], 
                "classified":closest_style
                }
                gathered_info.append(prepared_info)
                if tmp[1] == closest_style:
                    correct+=1
                else:
                    wrong+=1
            #print(gathered_info)        
            return round((correct+0.0)/(correct+wrong), 4)*100, gathered_info
        
        def extractArchetypesForParam(self, name_list):
            archetype_lib = { 'Aggro': 0, 'Ramp': 0, 'Control': 0, 'Mid-Range': 0}
            l = len(name_list)
            for i in name_list:
                tmp = i.split(' ')
                archetype_lib[tmp[1]]+=1
            
            for key in archetype_lib:
                archetype_lib[key] = round((archetype_lib[key]+0.0)/l, 4)
            
            return archetype_lib
            
        
        def findArchetypeParametersVoteMethod(self, deck):
            co_deck = self.deck_coordinates[deck]
            param_num = len(co_deck)
            
            archetype_lib = { 'Aggro': 0, 'Ramp': 0, 'Control': 0, 'Mid-Range': 0}
            
            param_vote = []
            for i in range(0, param_num):
                param_vote.append([[], 100.0])
                
            for d in self.decknames:
                if d !=deck:
                    for p in range(0, param_num):
                        d_param = self.deck_coordinates[d][p]
                        c_param = co_deck[p]
                        dist = abs(d_param - c_param)
                        if dist < param_vote[p][1]:
                            param_vote[p] = [[d], round(dist, 4)]
                        if dist == param_vote[p][1]:
                            param_vote[p][0].append(d)
                            
            for p in param_vote:
                archetype_lib_param = self.extractArchetypesForParam(p[0])
                for key in archetype_lib:
                    archetype_lib[key] += archetype_lib_param[key]
            
            # find archetype with most votes
            max_style = ''
            max_style_value = 0.0
            for key in archetype_lib:
                if archetype_lib[key] > max_style_value:
                    max_style_value = archetype_lib[key]
                    max_style = key
            
            tmp = deck.split(' ')
            archetype_deckname = tmp[1]
            
            if archetype_deckname == max_style:
                same_archetype = True
            else:
                same_archetype = False
            prepared_info = {
                "name": tmp[0],
                "original": tmp[1], 
                "classified":max_style
            }
                
            return(same_archetype, prepared_info)
        
        def findArchetypeByParametersVoteAll(self):         
            correct = 0
            wrong = 0
            gathered_info=[]
            for i in self.decknames:
                result, info = self.findArchetypeParametersVoteMethod(i)
                gathered_info.append(info)
                if result:
                    correct +=1
                else:
                    wrong +=1
            #print(gathered_info)        
            return round((correct+0.0)/(correct+wrong), 4)*100, gathered_info
        
        def convertCardToTags(self, cardname):
            return self.cardtag_dict[cardname]
        
        def convertDeckToTags(self, deck, chosen_tags):
            converted = []
            for d in deck:
                tmp = self.convertCardToTags(d)
                for t in tmp:
                    if t in chosen_tags:
                        converted.append(t)
                    
            return sorted(converted)
        
        def convertTagCloudToDict(self, tags):
            tags_for_deck = {}
            for t in tags:
                if t not in tags_for_deck:
                    tags_for_deck[t] = 1
                else:
                    tags_for_deck[t] +=1
                    
            return(tags_for_deck)
        
        def preNormalizeDeckTagDict(self, tags_for_deck, number_of_cards):
            prenormalized_dict = {}
            for t in tags_for_deck.keys():
                prenormalized_dict[t] = round((tags_for_deck[t] +0.0) / number_of_cards, 2)
            return prenormalized_dict
        
        def normalizeDeckTagDict(self, prenormalized_dict, tag_min_max):
            normalized_dict = {}
            for t in prenormalized_dict.keys():
                x = prenormalized_dict[t]
                max_val = tag_min_max[t]["max_val"]
                min_val = tag_min_max[t]["min_val"]
                x1 = x - min_val
                max1 = max_val - min_val
                normalized_dict[t] = round(x1/max1, 2)
            return normalized_dict     
        
        def convertAllDecksToTags(self, chosen_tags):
            self.name_and_tags = {}
            for deck in self.deck_and_name:
                my_tags = self.convertDeckToTags(deck[0], chosen_tags)
                tags_for_deck = self.convertTagCloudToDict(my_tags)
                pn = self.preNormalizeDeckTagDict(tags_for_deck, len(deck[0]))
                self.name_and_tags[deck[2]+'_'+deck[3]] = pn
                
            tag_min_max = {}
            for ct in chosen_tags:
                tag_min_max[ct] = { "min_val": 1.0, "max_val": 0.0}
                   
            for deck in self.name_and_tags.keys():
                tags = self.name_and_tags[deck]
                for t in tags.keys():
                    if self.name_and_tags[deck][t] > tag_min_max[t]["max_val"]:
                        tag_min_max[t]["max_val"] = self.name_and_tags[deck][t]
                    if t not in tag_min_max.keys():
                        tag_min_max[t]["min_val"] = 0.0
                    if self.name_and_tags[deck][t] < tag_min_max[t]["min_val"]:
                        tag_min_max[t]["min_val"] = self.name_and_tags[deck][t]
                        
            self.name_and_normalized_tags = {}
            for deck in self.name_and_tags.keys():
                n = self.normalizeDeckTagDict(self.name_and_tags[deck], tag_min_max)
                self.name_and_normalized_tags[deck] = n    
                
        def calculateTagDistance(self, deck1, deck2):
            deck1_tags = deck1.keys()
            deck2_tags = deck2.keys()
            
            my_sum = 0.0 #Euclidean
            
            for t in deck1_tags:
                if t not in deck2_tags:
                    my_sum +=(deck1[t])**2
                    
            for t in deck2_tags:
                if t not in deck1_tags:
                    my_sum +=(deck2[t])**2
            
            inter = list(set(deck1_tags).intersection(deck2_tags))
            
            for t in inter:
                t_dist =((deck1[t] - deck2[t]))**2
                my_sum+=t_dist
                
            return math.sqrt(my_sum)
                
                
        def findTagKNN(self, k, deck_name):
            tmp = deck_name.split(' ')
            
            my_keys = self.deck_coordinates.keys()
            
            my_deck_co = self.name_and_normalized_tags[tmp[0]]
        
            all_distances = []
            
            for i in my_keys:
                if (i != deck_name):
                    tmp_name = i.split(' ')
                    tmp_dist = self.calculateTagDistance(my_deck_co, self.name_and_normalized_tags[tmp_name[0]])   
                    all_distances.append([tmp_dist, tmp_name[1]]) 
           
            nns = sorted(all_distances)
            
            vote_lib = { 'Aggro': 0.0, 'Ramp': 0.0, 'Control': 0.0, 'Mid-Range': 0.0}
            knns = nns[0:k]
            for i in knns:
                vote_lib[i[1]]+=1
            
            top_vote = 0.0
            top_style = ""
            for a in vote_lib:
                if vote_lib[a] > top_vote:
                    top_vote = vote_lib[a]
                    top_style = a

            archetype_deckname = tmp[1]
            
            if archetype_deckname == top_style:
                same_archetype = True
            else:
                same_archetype = False
            prepared_info = {
                "name": tmp[0],
                "original": tmp[1], 
                "classified":top_style
            }

            return(same_archetype, prepared_info)
        
        def findArchetypeByTaggedKnnAll(self, k):
            correct = 0
            wrong = 0
            my_keys = self.deck_coordinates.keys()
            gathered_info = []
            for i in my_keys:
                result, info = self.findTagKNN(k, i)
                gathered_info.append(info)
                if result:
                    correct +=1
                else:
                    wrong +=1
                    
            return round((correct+0.0)/(correct+wrong), 4)*100, gathered_info
        
        def getTagCenterPoint(self, style):
            selected_decks = []
            for d in self.deck_and_name:
                tmp = d[1].split(' ')
                if tmp[1] == style:
                    selected_decks.append(self.name_and_normalized_tags[d[2]+"_"+d[3]])
                    
            number_of_decks = len(selected_decks)
            
            summed_tags = {}
            for sd in selected_decks:
                for t in sd.keys():
                    if t not in summed_tags:
                        summed_tags[t] = sd[t]
                    else:
                        summed_tags[t] +=sd[t]
                        
            average_tags = {}
            for st in summed_tags.keys():
                average_tags[st] = round(summed_tags[st] / number_of_decks, 2)
                
            return average_tags
        
        def findArchetypeByTagCenterPointAll(self):
            styles = list(set(self.normalized_df['Name']))
            styles_dict = {}
            
            # build styles library with separated data frames
            
            for style in styles:
                is_style = (self.normalized_df['Name'] == style) 
                styles_dict[style] = self.normalized_df[is_style]
                
            # find parameters of center point for each style
            
            center_point = {}
            
            for i in styles_dict:
                center_point[i] = self.getTagCenterPoint(i)
            
            # classify by comparison to this point
            
            correct = 0
            wrong = 0
            gathered_info = []
            
            for deck in self.deck_coordinates.keys():
                min_dist = 100.0
                closest_style = 'None'
                tmp = deck.split(' ')
                
                for point in center_point.keys():
                    tag_deck = self.name_and_normalized_tags[tmp[0]]
                    tag_point = center_point[point]
                    tmp_dist = self.calculateTagDistance(tag_deck, tag_point)
                    if tmp_dist < min_dist:
                        min_dist = tmp_dist
                        closest_style = point
                
                prepared_info = {
                "name": tmp[0],
                "original": tmp[1], 
                "classified":closest_style
                }
                gathered_info.append(prepared_info)
                if tmp[1] == closest_style:
                    correct+=1
                else:
                    wrong+=1
            #print(gathered_info)      
            return round((correct+0.0)/(correct+wrong), 4)*100, gathered_info
        
        def getMinMaxMedianDistance(self):
            all_distances = []
            decks = self.deck_coordinates.keys()
            
            for i in range(0, len(decks)):
                for j in range(i+1, len(decks)):
                    d1 = self.deck_coordinates[decks[i]]
                    d2 = self.deck_coordinates[decks[j]]
                    distance = self.calculateDistanceBetweenTwoDecks(d1, d2)
                    if distance == 0.0:
                        print(decks[i], decks[j])
                    all_distances.append(distance)
                    
            return min(all_distances), max(all_distances), np.median(all_distances)
        
        def findMinimumDistance(self, clusters):
            min_dist = 100
            cluster_id1 = "None"
            cluster_id2 = "None"
            
            for c1 in clusters.keys():
                for c2 in clusters.keys():
                    if c1 != c2:
                        cp1 = clusters[c1]['center_point']
                        cp2 = clusters[c2]['center_point']
                        distance = self.calculateDistanceBetweenTwoDecks(cp1, cp2)
                        if distance < min_dist:
                            min_dist = distance
                            cluster_id1 = c1
                            cluster_id2 = c2
                            
            return min_dist, cluster_id1, cluster_id2
              
        def calculateCenterPointForDecks(self, decknames):
            
            l = len(decknames)
            all_coordinates = []
            for d in decknames:
                all_coordinates.append(self.deck_coordinates[d])
                
            sum_coordinates = all_coordinates[0]
            for i in range(1, l):
                for j in range(0, len(sum_coordinates)):
                    sum_coordinates[j] += all_coordinates[i][j]
                    
            for i in range(0, len(sum_coordinates)):
                sum_coordinates[i] = round(sum_coordinates[i] / l, 4)
                
            return sum_coordinates
        
        def mergeClusters(self, cluster_id1, cluster_id2, clusters):
            chosen_id = ""
            cl1 = int(cluster_id1)
            cl2 = int(cluster_id2)
            if cl1 < cl2:
                chosen_id = str(cl1)
                to_delete = str(cl2)
            else:
                chosen_id = str(cl2)
                to_delete = str(cl1)
            
            new_decks = []
            for i in clusters[cluster_id1]['decks']:
                new_decks.append(i)
            for i in clusters[cluster_id2]['decks']:
                new_decks.append(i)
                
            new_cp = self.calculateCenterPointForDecks(new_decks)
            new_depth = 1+ self.getMaxDepth(clusters, cluster_id1, cluster_id2)
            clusters[chosen_id] = { 'decks': new_decks, 'center_point': new_cp, 
                                   'depth': new_depth}
            
            del clusters[to_delete]

            return clusters
        
        def getMaxDepth(self, clusters, cluster_id1, cluster_id2):
            d1 = clusters[cluster_id1]['depth']
            d2 = clusters[cluster_id2]['depth']
            if d1 >= d2: return d1
            else: return d2
            
        def getMinId(self, cl1, cl2):
            if int(cl1) < int(cl2): return cl1
            else: return cl2
            
        def getNodeColorStyle(self, decks):
            style_color = {
                'Aggro': 'firebrick1',
                'Ramp': 'darkolivegreen2',
                'Mid-Range': 'darkorchid1',
                'Control': 'deepskyblue1',
                'Not defined': 'gray67'
            }
            
            ramp = 0
            control = 0
            midrange = 0
            aggro = 0
            
            for d in decks:
                tmp = d.split(' ')
                if tmp[1] == 'Ramp': ramp+=1
                if tmp[1] == 'Control': control+=1
                if tmp[1] == 'Mid-Range': midrange+=1
                if tmp[1] == 'Aggro': aggro+=1
                    
            style = 'Not defined'
                    
            if ramp > control and ramp > midrange and ramp > aggro:
                style = 'Ramp'
            if control > ramp and control > midrange and control > aggro:
                style = 'Control'
            if midrange > ramp and midrange > control and midrange > aggro:
                style = 'Mid-Range'
            if aggro > ramp and aggro > control and aggro > midrange:
                style = 'Aggro'
                
            return style, style_color[style]
            
        def getNodeColor(self, decks, mode):
            if mode =='style':
                return self.getNodeColorStyle(decks)
            else:
                return self.getNodeColorPlayer(decks)
            
        
        def saveEdgesAndNodes(self, my_edges, my_nodes, filename_edges, filename_nodes):
            edge_string = ""
            node_string = ""
            for e in my_edges:
                tmp = "|".join(e)
                edge_string+=tmp+"\n"
                
            for n in my_nodes.keys():
                tmp = n+'|'+'|'.join(my_nodes[n])
                node_string+=tmp+"\n"
                
            edge_file = open(filename_edges, 'w')
            node_file = open(filename_nodes, 'w')
            edge_file.write(edge_string)
            node_file.write(node_string)
            edge_file.close()
            node_file.close()
            
        def getAllPlayers(self):
            decknames = self.deck_coordinates.keys()
            players = []
            for d in decknames:
                tmp = d.split('_')
                players.append(tmp[0])
                
            all_players = sorted(list(set(players)))
            return all_players
            
        def getPlayerColorDict(self):
            players = self.getAllPlayers()
            color_filename = 'node_colors.txt'
            my_file = open(color_filename, 'r')
            my_string = my_file.read()
            my_list = my_string.split('\n')
            self.player_color = {'Not defined': 'gray67'}
            for i in range(0, len(players)):
                self.player_color[players[i]] = my_list[i]
                
        def getNodeColorPlayer(self, decks):

            players = {}
            
            for d in decks:
                tmp = d.split('_')
                if tmp[0] not in players:
                    players[tmp[0]] = 1
                else:
                    players[tmp[0]] += 1
                    
            max_decks = 0
            max_player = []
            
            for p in players.keys():
                if players[p] == max_decks:
                    max_player.append(p)
                elif players[p] > max_decks:
                    max_player = []
                    max_player.append(p)
                    max_decks = players[p]
                else:
                    pass
                
            if len(max_player) == 1:
                player = max_player[0]
            else:
                player = 'Not defined'
                
            return player, self.player_color[player]
                             
        def unsupervisedClusteringByParameters(self, max_number_of_clusters, mode):
            clusters = {}
            decknames = self.deck_coordinates.keys()
            self.getPlayerColorDict()
            
            for i in range(0, len(decknames)):
                clusters[str(i)] = { 'decks': [decknames[i]], 
                                    'center_point': self.deck_coordinates[decknames[i]],
                                   'depth': 1}
                
            min_dist, max_dist, median_dist = self.getMinMaxMedianDistance()
            current_min = min_dist
            current_clusters = len(decknames)
            
            # graph material

            edges = []
            nodes = {}
            
            for c in clusters.keys():
                style, color = self.getNodeColor(clusters[c]['decks'], mode)
                nodes[c+'_'+str(1)] = [clusters[c]['decks'][0], style, color, str(1)]
            
            while(current_clusters > 1):
                current_min, cluster_id1, cluster_id2 = self.findMinimumDistance(clusters)
                                
                st1, col1 = self.getNodeColor(clusters[cluster_id1]['decks'], mode)
                st2, col2 = self.getNodeColor(clusters[cluster_id2]['decks'], mode)
                ld1 = len(clusters[cluster_id1]['decks'])
                ld2 = len(clusters[cluster_id2]['decks'])
                nld = ld1+ld2
                
                new_log = 'Cluster '+cluster_id1+' '+st1+' ('+str(ld1)+' decks)'+' joined with cluster '
                new_log+= cluster_id2+' '+st2+' ('+str(ld2)+' decks) with distance '+str(round(current_min, 4))+'. Total decks in new cluster: '+str(nld)
                
                print(new_log)
                
                new_depth = 1 + self.getMaxDepth(clusters, cluster_id1, cluster_id2)
                node_id1 = cluster_id1+'_'+str(clusters[cluster_id1]['depth'])
                node_id2 = cluster_id2+'_'+str(clusters[cluster_id2]['depth'])
                edges.append([node_id1, node_id2, 'none', str(round(current_min, 4)), 'crimson'])
                
                clusters = self.mergeClusters(cluster_id1, cluster_id2, clusters)
                current_clusters = len(clusters.keys())
                
                # generate graphviz data
                new_id = self.getMinId(cluster_id1, cluster_id2)
                new_node_id = new_id+'_'+str(new_depth)
                style, color = self.getNodeColor(clusters[new_id]['decks'], mode)
                edges.append([node_id1, new_node_id, 'arrow', 'no label', 'black'])
                edges.append([node_id2, new_node_id, 'arrow', 'no label', 'black'])
                nodes[new_node_id] = [new_id+' '+style+' ('+str(nld)+')', style, color, str(new_depth)]
                
            self.saveEdgesAndNodes(edges, nodes, 'clustering_edges.txt', 'clustering_nodes.txt')
                
        def getMinMaxMedianDistanceTags(self):
            all_distances = []
            decks = self.deck_coordinates.keys()
            
            for i in range(0, len(decks)):
                itmp = decks[i].split(' ')
                for j in range(i+1, len(decks)):
                    jtmp = decks[j].split(' ')
                    d1 = self.name_and_normalized_tags[itmp[0]]
                    d2 = self.name_and_normalized_tags[jtmp[0]]
                    distance = self.calculateTagDistance(d1, d2)
                    if distance == 0.0:
                        print(decks[i], decks[j])
                    all_distances.append(distance)
                    
            return min(all_distances), max(all_distances), np.median(all_distances)
        
        def findMinimumTagDistance(self, clusters):
            min_dist = 100
            cluster_id1 = "None"
            cluster_id2 = "None"
            
            for c1 in clusters.keys():
                for c2 in clusters.keys():
                    if c1 != c2:
                        cp1 = clusters[c1]['center_point']
                        cp2 = clusters[c2]['center_point']
                        distance = self.calculateTagDistance(cp1, cp2)
                        if distance < min_dist:
                            min_dist = distance
                            cluster_id1 = c1
                            cluster_id2 = c2
                            
            return min_dist, cluster_id1, cluster_id2
        
        
        def calculateCenterPointForDecksTag(self, decknames):
            number_of_decks = len(decknames)
            summed_tags = {}
            for d in decknames:
                tmp = d.split(' ')
                for t in self.name_and_normalized_tags[tmp[0]].keys():
                    if t not in summed_tags:
                        summed_tags[t] = self.name_and_normalized_tags[tmp[0]][t]
                    else:
                        summed_tags[t] += self.name_and_normalized_tags[tmp[0]][t]
                        
            average_tags = {}
            for st in summed_tags.keys():
                average_tags[st] = round(summed_tags[st] / number_of_decks, 2)
                
            return average_tags
        
        def mergeClustersTag(self, cluster_id1, cluster_id2, clusters):
            chosen_id = ""
            cl1 = int(cluster_id1)
            cl2 = int(cluster_id2)
            if cl1 < cl2:
                chosen_id = str(cl1)
                to_delete = str(cl2)
            else:
                chosen_id = str(cl2)
                to_delete = str(cl1)
            
            new_decks = []
            for i in clusters[cluster_id1]['decks']:
                new_decks.append(i)
            for i in clusters[cluster_id2]['decks']:
                new_decks.append(i)
                
            new_cp = self.calculateCenterPointForDecksTag(new_decks)
            new_depth = 1+ self.getMaxDepth(clusters, cluster_id1, cluster_id2)
            clusters[chosen_id] = { 'decks': new_decks, 'center_point': new_cp, 'depth': new_depth }
            
            del clusters[to_delete]

            return clusters
            
        def unsupervisedClusteringByTags(self, max_number_of_clusters, mode):
            clusters = {}
            decknames = self.deck_coordinates.keys()
            self.getPlayerColorDict()
            
            for i in range(0, len(decknames)):
                tmp = decknames[i].split(' ')
                clusters[str(i)] = { 'decks': [decknames[i]], 
                                    'center_point': self.name_and_normalized_tags[tmp[0]],
                                   'depth': 1}
            min_dist, max_dist, median_dist = self.getMinMaxMedianDistanceTags()
            
            current_min = min_dist
            current_clusters = len(decknames)
            
            # graph material
            edges = []
            nodes = {}
            
            for c in clusters.keys():
                style, color = self.getNodeColor(clusters[c]['decks'], mode)
                nodes[c+'_'+str(1)] = [clusters[c]['decks'][0], style, color, str(1)]
            
            while(current_clusters > 1):
                current_min, cluster_id1, cluster_id2 = self.findMinimumTagDistance(clusters)
                
                st1, col1 = self.getNodeColor(clusters[cluster_id1]['decks'], mode)
                st2, col2 = self.getNodeColor(clusters[cluster_id2]['decks'], mode)
                ld1 = len(clusters[cluster_id1]['decks'])
                ld2 = len(clusters[cluster_id2]['decks'])
                nld = ld1+ld2
                
                new_log = 'Cluster '+cluster_id1+' '+st1+' ('+str(ld1)+' decks)'+' joined with cluster '
                new_log+= cluster_id2+' '+st2+' ('+str(ld2)+' decks) with distance '+str(round(current_min, 4))+'. Total decks in new cluster: '+str(nld)
                
                print(new_log)
                
                new_depth = 1 + self.getMaxDepth(clusters, cluster_id1, cluster_id2)
                node_id1 = cluster_id1+'_'+str(clusters[cluster_id1]['depth'])
                node_id2 = cluster_id2+'_'+str(clusters[cluster_id2]['depth'])
                edges.append([node_id1, node_id2, 'none', str(round(current_min, 4)), 'crimson'])
                
                clusters = self.mergeClustersTag(cluster_id1, cluster_id2, clusters)
                current_clusters = len(clusters.keys())
                
                # generate graphviz data
                new_id = self.getMinId(cluster_id1, cluster_id2)
                new_node_id = new_id+'_'+str(new_depth)
                style, color = self.getNodeColor(clusters[new_id]['decks'], mode)
                edges.append([node_id1, new_node_id, 'arrow', 'no label', 'black'])
                edges.append([node_id2, new_node_id, 'arrow', 'no label', 'black'])
                nodes[new_node_id] = [new_id+' '+style+' ('+str(nld)+')', style, color, str(new_depth)]
                
            self.saveEdgesAndNodes(edges, nodes, 'tag_clustering_edges.txt', 'tag_clustering_nodes.txt')
                
        def findsubsets(self, m):
            return list(set(itertools.combinations(set(self.all_params), m)))
        
        def saveAllGatheredInfo(self, all_info):
            to_save = {}
            onInfo = all_info["onInfo"]
            for i in onInfo:
                to_save[i["name"]] = {"original":i["original"], "on_class":i["classified"]}
                
            cvInfo = all_info["cvInfo"]
            cpInfo = all_info["cpInfo"]
            pvInfo = all_info["pvInfo"]
            knnInfo = all_info["knnInfo"]
            tknnInfo = all_info["tknnInfo"]
            tagcpInfo = all_info["tagcpInfo"]
            
            for i in cvInfo:
                to_save[i["name"]]["cv_class"]=i["classified"]
                
            for i in cpInfo:
                to_save[i["name"]]["cp_class"]=i["classified"]
            
            for i in pvInfo:
                to_save[i["name"]]["pv_class"]=i["classified"]
                
            for i in knnInfo:
                to_save[i["name"]]["knn_class"]=i["classified"]
                
            for i in tknnInfo:
                to_save[i["name"]]["tknn_class"]=i["classified"]
                
            for i in tagcpInfo:
                to_save[i["name"]]["tagcp_class"]=i["classified"]
            
            header="Name,Original,OneNeighbour,CardVote,CenterPoint,"
            header+="ParameterVote,KNN11,TagKNN11,TagCenterPoint\n"
            for i in to_save.keys():
                line=""
                line+=i+","+to_save[i]["original"]+','+to_save[i]["on_class"]+','+to_save[i]["cv_class"]+','
                line+=to_save[i]["cp_class"]+','+to_save[i]["pv_class"]+','+to_save[i]["knn_class"]
                line+=','+to_save[i]["tknn_class"]+','+to_save[i]["tagcp_class"]+'\n'
                header+=line
            
            my_file=open("all_info_classified.csv", "w")
            my_file.write(header)
            my_file.close()
            
            
        def compareMethodsAccuracy(self):
            
            # Method 1 - Closest neighbour 
            oneNeighbour, onInfo = self.findClosestDeckForEachDeck(False)
            print(oneNeighbour)
            
            # Mehod 2 - Card Vote Method - no parameters needed
            cardVote, cvInfo = self.findArchetypeByCardVoteAll()
            print(cardVote)
            
            # Method 3 - Center point method
            closestCenter, cpInfo = self.findArchetypeByCenterPointAll()
            print(closestCenter)
            
            # Method 4 - Parameters Vote method
            parameterVote, pvInfo = self.findArchetypeByParametersVoteAll()
            print(parameterVote)
            
            #Method 5 - K-Nearest-Neighbours
            knn, knnInfo = self.findKNNForAll(11, True)
            print(knn)
            
            #Method 6 - Tagged K-Nearest-Neighbours
            tknn, tknnInfo = self.findArchetypeByTaggedKnnAll(11)
            print(tknn)
            
            #Method 7 - Tagged Center Point 
            tagcp, tagcpInfo = workshop.findArchetypeByTagCenterPointAll()
            print(tagcp)
            
            
            self.saveAllGatheredInfo({
                "onInfo":onInfo,
                "cvInfo":cvInfo,
                "cpInfo":cpInfo,
                "pvInfo":pvInfo,
                "knnInfo":knnInfo,
                "tknnInfo":tknnInfo,
                "tagcpInfo":tagcpInfo
            })
                                
workshop = ArchetypeWorkshop()

workshop.loadAllDecksWithAll() # load all deck data

#workshop.loadAllCardsData() - not used due to issues with handling this data frame
#workshop.prepareAllUsedCardsData()  - not used due to issues with handling this data frame
#workshop.saveUsedCardsAsJson()  - not used due to issues with handling this data frame

# Preparation of parameters:

selected_params =  ['AverageCMC',
                'AverageCMCCreatures',
                'CreatureDensity',
                'PowerToToughness',
                'PowerToCMC',
                'ToughnessToCMC',
                'MaxCMC',
                'AveragePower',
                'AverageToughness']


count_and_tag = [[269, 'aggressive'], [204, 'evasive'], [157, 'fixing'], 
                 [126, 'tokens'], [126, 'land'], [122, 'artifact'], [114, 'etb'],
                 [106, 'burn-player'], [105, 'draw'], [92, 'ramp'], [92, 'burn-creature'],
                 [86, 'removal-creature'], [75, 'durable'], [73, 'pump'], 
                 [67, 'destroy-artifact'], [67, 'card-selection'], [64, 'recursion'],
                 [59, 'counters'], [57, 'mass-removal-creature'], [57, 'lifegain'],
                 [53, 'planeswalker'], [51, 'enchantment'], [50, 'destroy-enchantment'],
                 [42, 'destroy-planeswalker'], [41, 'huge'], [38, 'artifact-creature'],
                 [37, 'disruptive'], [37, 'discard-self'], [34, 'counterspell'], 
                 [33, 'destroy-land'], [32, 'discard-opponent'], [30, 'tutor'],
                 [27, 'defensive'], [26, 'sacrifice-outlet'], [22, 'bounce'],
                 [21, 'reanimation'], [21, 'cheat'], [19, 'equipment'], [17, 'creature-land'],
                 [16, 'steal-creature'], [14, 'tap'], [13, 'edict'], [12, 'mill'], [12, 'anthem'],
                 [11, 'clone-creature'], [10, 'mass-removal-artifact'], [7, 'mass-removal-lands'],
                 [7, 'mass-removal-enchantment'], [6, 'blink'], [5, 'mass-removal-planeswalker'],
                 [5, 'clone-artifact'], [4, 'steal-planeswalker'], [4, 'mass-bounce'],
                 [3, 'mass-tap'], [2, 'turn'], [2, 'transform'], [2, 'steal-artifact'],
                 [2, 'redirect'], [2, 'clone-sorcery'], [2, 'clone-planeswalker'],
                 [2, 'clone-instant'], [1, 'steal-turn'], [1, 'steal-land'], 
                 [1, 'steal-enchantment'], [1, 'restart-game'], [1, 'mass-reanimation'], 
                 [1, 'enchantement'], [1, 'clone-enchantment'], [1, 'clone-ability']]

selected_tags = []
for j in range(0, 17):
    selected_tags.append(count_and_tag[j][1])
    
#three_params = (workshop.findsubsets(10))
"""
for i in three_params:
    print list(i)
    workshop.examineAllDecks(selected_params, selected_tags)
    #workshop.examineAllDecks(list(i)) # prepare parameters for each deck
    workshop.convertResultsToDataFrame() # prepare data frame for further usage
    workshop.normalizeDataFrame()
    workshop.convertNormalizedPreDfToCoordinates()
    workshop.compareMethodsAccuracy()
"""

workshop.examineAllDecks(selected_params, selected_tags)
workshop.convertResultsToDataFrame() # prepare data frame for further usage
workshop.normalizeDataFrame()
workshop.convertNormalizedPreDfToCoordinates()
#workshop.compareMethodsAccuracy()

#workshop.unsupervisedClusteringByParameters(1, 'player')
workshop.unsupervisedClusteringByTags(1, 'player')

# Parameters visual analysis:
#workshop.showDistributionForEachParameter() # show parameters distribution
#workshop.showScatterPlotsForParameterPairs() # show parameteres possible correlations

# unsupervised classification - parameters and players
# unsupervised classification - tags and players



Cluster 323 Casc (1 decks) joined with cluster 325 Casc (1 decks) with distance 0.3198. Total decks in new cluster: 2
Cluster 216 Kasia (1 decks) joined with cluster 198 Wielki (1 decks) with distance 0.3572. Total decks in new cluster: 2
Cluster 283 Marek (1 decks) joined with cluster 183 Wojtek (1 decks) with distance 0.4024. Total decks in new cluster: 2
Cluster 53 Wielki (1 decks) joined with cluster 254 Wielki (1 decks) with distance 0.4204. Total decks in new cluster: 2
Cluster 159 JacekK (1 decks) joined with cluster 77 Bartek (1 decks) with distance 0.4204. Total decks in new cluster: 2
Cluster 155 Kuba (1 decks) joined with cluster 71 Kuba (1 decks) with distance 0.421. Total decks in new cluster: 2
Cluster 139 Bartek (1 decks) joined with cluster 282 Bartek (1 decks) with distance 0.426. Total decks in new cluster: 2
Cluster 88 Lukasz (1 decks) joined with cluster 122 Aneta (1 decks) with distance 0.4266. Total decks in new cluster: 2
Cluster 193 Casc (1 decks) joined with cl

Cluster 188 Casc (1 decks) joined with cluster 78 Casc (1 decks) with distance 0.5121. Total decks in new cluster: 2
Cluster 84 MichalL (1 decks) joined with cluster 198 Not defined (3 decks) with distance 0.5134. Total decks in new cluster: 4
Cluster 179 Wielki (1 decks) joined with cluster 225 Casc (1 decks) with distance 0.5139. Total decks in new cluster: 2
Cluster 30 Not defined (4 decks) joined with cluster 13 Casc (15 decks) with distance 0.5143. Total decks in new cluster: 19
Cluster 13 Casc (19 decks) joined with cluster 47 MichalM (1 decks) with distance 0.5081. Total decks in new cluster: 20
Cluster 150 Kuba (1 decks) joined with cluster 236 Jozef (1 decks) with distance 0.515. Total decks in new cluster: 2
Cluster 65 Casc (1 decks) joined with cluster 314 Casc (1 decks) with distance 0.517. Total decks in new cluster: 2
Cluster 34 Not defined (2 decks) joined with cluster 176 Kasia (1 decks) with distance 0.518. Total decks in new cluster: 3
Cluster 21 Mateusz (1 decks) joi

Cluster 234 Casc (1 decks) joined with cluster 46 Casc (5 decks) with distance 0.5696. Total decks in new cluster: 6
Cluster 10 Casc (5 decks) joined with cluster 46 Casc (6 decks) with distance 0.5616. Total decks in new cluster: 11
Cluster 10 Casc (11 decks) joined with cluster 153 Casc (1 decks) with distance 0.5726. Total decks in new cluster: 12
Cluster 111 Kamil (1 decks) joined with cluster 243 Casc (3 decks) with distance 0.5738. Total decks in new cluster: 4
Cluster 21 Not defined (32 decks) joined with cluster 49 Not defined (4 decks) with distance 0.5741. Total decks in new cluster: 36
Cluster 21 Not defined (36 decks) joined with cluster 103 Wielki (1 decks) with distance 0.5589. Total decks in new cluster: 37
Cluster 4 Not defined (7 decks) joined with cluster 32 Bartek (5 decks) with distance 0.5745. Total decks in new cluster: 12
Cluster 4 Bartek (12 decks) joined with cluster 239 Bartek (1 decks) with distance 0.5563. Total decks in new cluster: 13
Cluster 4 Bartek (13 

Cluster 3 Bartek (18 decks) joined with cluster 71 Kuba (2 decks) with distance 0.6257. Total decks in new cluster: 20
Cluster 111 Casc (4 decks) joined with cluster 3 Bartek (20 decks) with distance 0.6033. Total decks in new cluster: 24
Cluster 118 Not defined (2 decks) joined with cluster 96 TomekP (1 decks) with distance 0.6291. Total decks in new cluster: 3
Cluster 35 Not defined (2 decks) joined with cluster 17 Not defined (6 decks) with distance 0.6294. Total decks in new cluster: 8
Cluster 17 Wojtek (8 decks) joined with cluster 1 Bartek (85 decks) with distance 0.629. Total decks in new cluster: 93
Cluster 157 Not defined (2 decks) joined with cluster 1 Bartek (93 decks) with distance 0.6277. Total decks in new cluster: 95
Cluster 167 Casc (1 decks) joined with cluster 146 Casc (6 decks) with distance 0.6299. Total decks in new cluster: 7
Cluster 298 Casc (1 decks) joined with cluster 10 Casc (12 decks) with distance 0.6319. Total decks in new cluster: 13
Cluster 97 MichalL (1

Cluster 29 Kuba (1 decks) joined with cluster 102 Kuba (1 decks) with distance 0.7559. Total decks in new cluster: 2
Cluster 135 Not defined (3 decks) joined with cluster 29 Kuba (2 decks) with distance 0.7367. Total decks in new cluster: 5
Cluster 14 Wielki (1 decks) joined with cluster 143 Not defined (2 decks) with distance 0.7567. Total decks in new cluster: 3
Cluster 172 Mateusz (1 decks) joined with cluster 202 Suld (1 decks) with distance 0.7595. Total decks in new cluster: 2
Cluster 172 Not defined (2 decks) joined with cluster 14 Wielki (3 decks) with distance 0.7418. Total decks in new cluster: 5
Cluster 19 Casc (1 decks) joined with cluster 207 TomekP (1 decks) with distance 0.7598. Total decks in new cluster: 2
Cluster 65 Casc (3 decks) joined with cluster 16 SirK (1 decks) with distance 0.7624. Total decks in new cluster: 4
Cluster 69 Casc (1 decks) joined with cluster 73 Casc (1 decks) with distance 0.7694. Total decks in new cluster: 2
Cluster 69 Casc (2 decks) joined wi

In [140]:
# Using the file with saved data all_info_classified.csv:
# 1. find decks that are most miss-classified and re-examine these decks manually
# 2. re-run the classifier to see if it helps

data = pd.read_csv('all_info_classified.csv')

org_vs_on = data["Original"]==data["OneNeighbour"]
org_vs_cv = data["Original"]==data["CardVote"]
org_vs_cp = data["Original"]==data["CenterPoint"]
org_vs_pv = data["Original"]==data["ParameterVote"]
org_vs_knn = data["Original"]==data["KNN11"]
org_vs_tknn = data["Original"]==data["TagKNN11"]
org_vs_tagcp = data["Original"]==data["TagCenterPoint"]

how_many_false = []
for i in range(0, len(org_vs_on)):
    fv = 0
    if org_vs_on[i] == False:
        fv+=1
    if org_vs_cv[i] == False:
        fv+=1
    if org_vs_cp[i] == False:
        fv+=1
    if org_vs_pv[i] == False:
        fv+=1
    if org_vs_knn[i] == False:
        fv+=1
    if org_vs_tknn[i] == False:
        fv+=1
    if org_vs_tagcp[i] == False:
        fv+=1
    how_many_false.append(fv)
    
#print(how_many_false)
    
four_false = []
for i in range(0, len(how_many_false)):
    if how_many_false[i] == 7:
        four_false.append(data["Name"][i])
        
print(four_false)

# At this moment it would be a good step to manually examine all decks that were never classified accurately
# as there is a big chance, that these ones are causing the issues with classifiers accuracy - done in three steps

['Bartek_20_3']


|Data Point / Method | oneNeighbour | cardVote | centerPoint | parameterVote | kNN_11 | Tag kNN11 | tagCenterPoint |
|---|---|---|---|---|---|---|---|
| 19_1|50,2|57,83|58,63|null|null|||
| 19_2|51,36|58,37|60,31|41,25|null|||
| 19_3|51,34|58,62|58,62|41,76|null|||
| 19_4|50,19|58,87|60,0|41,89|null|||
| 19_5|48.89|60,0|60,37|42,96|null|||
| 19_6|49,27|60,22|60,58|40,88|null|||
| 19_7|48,92|60,43|61,5|42,09|null|||
| 19_8|47,89|60,21|58,45|41,9|null|||
| 19_9|47,93|59,66|60,0|42,07|null|||
| 19_10|48,3|60,2|60,2|42,18|null|||
|1st Rev|65,99|69,05|71,09|54,76|null|||
|2nd Rev|72,45|69,05|74,15|58,5|null|||
|3rd Rev|75,17|70,41|76,53|60,54|null|||
| 19_11|73,75|71,43|76,41|60,8|null|||
|4th Rev|74,75|71,1|76,08|61,46|null|||
| 19_12|75,08|71,48|77,05|60,98|null|||
| 19_13|73,95|70,42|75,24|60,45|null|||
|5th Rev|74,6|70,42|76,85|61,09|77,17|||
| 19_14|73,33|70,79|76,51|60,95|77,14|||
| 20_1|73,44|70,31|76,25|61,88|77,55|||
| 20_2|73,23|69,23|75,38|61,23|76,0|||
|6th Rev|74,77|69,23|76,62|62,77|77,23|||
| 20_3|74,85|69,7|76,67|62,73|76,06|||
| 20_4|73,35|70,36|75,75|63,17|75,15|||
|7th Rev|75,15|69,46|75,75|63,77|75,75|69,16|64,97|
|no 15_6|74,55|69,7|75,76|63,33|75,76|70,0|64,85|
|UC Rev|75,15|66,06|75,45|62,42|74,55|65,15|63,33|
|8th Rev|75,76|66,67|76,06|63,33|75,45|66,97|64,85|
|Mis.Deck|76,05|67,37|76,65|64,37|76,35|67,07|64,67|
|Mis.Deck2|75,15|65,87|76,65|61,68|75,75|65,27|66,47|
|9th Rev|76,05|66,17|77,25|62,57|76,35|66,47|65,87|

For all (10) parameters used
For (17) most popular tags used

In [138]:
# 3. see accuracy for each method and archetype combination, for example: how many aggro decks method X 
# identifies as aggro and also save it somwhere for weighted vote method

def showMethodForStyleAccuracy(method, style):
    total = len(data.loc[data['Original'] == style])
    m_data = (data.loc[data['Original'] == style])
    guessed = len(m_data.loc[m_data[method] == style])
    print(method, style, round((guessed+0.0)/total, 2))
    
styles = ['Aggro', 'Mid-Range', 'Control', 'Ramp']
methods = ['OneNeighbour', 'CardVote', 'CenterPoint', 'ParameterVote', 
           'KNN11', 'TagKNN11', 'TagCenterPoint']

for m in methods:
    for s in styles:
        showMethodForStyleAccuracy(m, s)
        
# To be used for Weighted method vote

# 3. Add Method Vote (all possible methods used, results 'vote' for final result, ties are broken by strongest method)
# 3A. Simple Method Vote (each vote has same wieght)
# 3B. Weighted Method Vote (each vote as weight related to accuracy for this archetype)
# 3C. Best Method Vote (vote from method that has highest accuracy for selected archetype)

('OneNeighbour', 'Aggro', 0.89)
('OneNeighbour', 'Mid-Range', 0.57)
('OneNeighbour', 'Control', 0.76)
('OneNeighbour', 'Ramp', 0.61)
('CardVote', 'Aggro', 0.89)
('CardVote', 'Mid-Range', 0.0)
('CardVote', 'Control', 0.88)
('CardVote', 'Ramp', 0.25)
('CenterPoint', 'Aggro', 0.81)
('CenterPoint', 'Mid-Range', 0.73)
('CenterPoint', 'Control', 0.68)
('CenterPoint', 'Ramp', 0.89)
('ParameterVote', 'Aggro', 0.76)
('ParameterVote', 'Mid-Range', 0.23)
('ParameterVote', 'Control', 0.78)
('ParameterVote', 'Ramp', 0.28)
('KNN11', 'Aggro', 0.88)
('KNN11', 'Mid-Range', 0.43)
('KNN11', 'Control', 0.82)
('KNN11', 'Ramp', 0.63)
('TagKNN11', 'Aggro', 0.84)
('TagKNN11', 'Mid-Range', 0.0)
('TagKNN11', 'Control', 0.88)
('TagKNN11', 'Ramp', 0.32)
('TagCenterPoint', 'Aggro', 0.71)
('TagCenterPoint', 'Mid-Range', 0.57)
('TagCenterPoint', 'Control', 0.66)
('TagCenterPoint', 'Ramp', 0.67)


In [153]:
# Visualise Clustering

import graphviz as gv
from graphviz import Source

def prepareRanksAndNodes(filename):
    my_file = open(filename, 'r')
    my_string = my_file.read()
    my_list = my_string.split('\n')
    ranks_str = ""
    nodes_str = ""

    ranks = {}
    
    for i in my_list:
        tmp = i.split('|')
        if len(tmp) == 5:
            nodes_str+='node'+tmp[0]+' [label= "'+tmp[1]+'", fillcolor ="'+tmp[3]+'"];\n'       
            if str(tmp[4]) in ranks.keys():
                ranks[str(tmp[4])].append(tmp[0])
            else:
                ranks[str(tmp[4])] = [tmp[0]]
                
    int_ranks = []
    for r in ranks.keys(): 
        int_ranks.append(int(r))
    int_ranks = sorted(int_ranks)
                
    for r in int_ranks:
        tmp = '{ranks=same;'
        for n in ranks[str(r)]:
            tmp+=' node'+n
        tmp+='}\n'
        ranks_str+=tmp
        
    return nodes_str, ranks_str

def prepareEdges(filename):
    my_file = open(filename, 'r')
    my_string = my_file.read()
    my_list = my_string.split('\n')
    
    edges_str = ''
    
    for i in my_list:
        tmp = i.split('|')
        if len(tmp) == 5:
            if tmp[2] == 'none':
                edges_str+='node'+tmp[0]+' -> node'+tmp[1]+' [label="'+tmp[3]+'", color="'+tmp[4]+'", dir=none];\n'

            if tmp[2] == 'arrow':
                edges_str+='node'+tmp[0]+' -> node'+tmp[1]+' [color="'+tmp[4]+'", dir=forward];\n'
    return edges_str
        
def prepareGraphSource(template_file, label, ranks_str, nodes_str, edges_str, output_file):
    my_file = open(template_file, 'r')
    template_string = my_file.read()
    my_file.close()
    
    graph_str = template_string.replace('[LABEL]', '"'+label+'"')
    graph_str = graph_str.replace('[RANKS]', ranks_str)
    graph_str = graph_str.replace('[NODES]', nodes_str)
    graph_str = graph_str.replace('[EDGES]', edges_str)
    
    out_file = open(output_file, 'w')
    out_file.write(graph_str)
    out_file.close()

nodes_str, ranks_str = prepareRanksAndNodes('tag_clustering_nodes.txt')
edges_str = prepareEdges('tag_clustering_edges.txt')

prepareGraphSource('graph_template.gv', 'Archetype Clustering - Tags', ranks_str, nodes_str, edges_str, 'output.gv')

s = Source.from_file('output.gv', format='svg')
s.view()


'output.gv.svg'