In diesem Notebook geschieht die eigentliche Arbeit. Verschiedene Localizer können hinzugefügt werden. Durch den Voter werden die einzelnen Localizer gewichtet. Die Resultate werden als game_patterns-Objekte gespeichert (konvertiert zu JSON). Zusätzlich werden Gephi-Graphen gesichert.

In [None]:
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
import os
import spacy
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

import time

%run ./REST.ipynb
%run ./GamePatterns.ipynb
%run ./nlpProcessedGame.ipynb

%run ./DistanceCalculator.ipynb
%run ./MatcherLocalizer.ipynb
%run ./RegexLocalizer.ipynb
%run ./GamePatternsVoter.ipynb
%run ./BayesLocalizer.ipynb
%run ./SVMLocalizer.ipynb
%matplotlib inline




## Parameter

In diesem Abschnitt werden Parameter für das gesamte Notebook eingestellt.

### Datenbank

In [None]:
# Cassandra connection settings. Each value can be overridden through an
# environment variable so that credentials do not have to be edited into the
# notebook; the fallbacks are the values this notebook has always used.
db_username = os.environ.get('ITP_MINING_DB_USER', 'itp_mining')
db_password = os.environ.get('ITP_MINING_DB_PASSWORD', 'mining-data')
db_keySpace = os.environ.get('ITP_MINING_DB_KEYSPACE', 'itp_mining')
cluster_address = os.environ.get('ITP_MINING_DB_HOST', 'big1.informatik.fh-nuernberg.de')


# CQL statement that selects the game manuals to process.
query = "select * from working_text limit 5"
#query = "select * from working_text where  sourceuid = '6b05437860f54f19f6eca4b38825807c'" # empty manual

### Ausgabe

In [None]:
# Output switches: build a graph per game, write that graph as GraphML, and
# write the sentences in which patterns were found.
create_graph = export_graph = export_sentences = True

# Output directories and file-name prefixes, one pair per artefact.
object_output_directory, object_filename_prefix = "Output/GamePattern", "GamePatterns-"
graph_output_directory, graph_filename_prefix = "Output/GamePatternGraphs", "Graph-"
sentences_output_directory, sentences_filename_prefix = "Output/Sentences", "Sentences-"

### Localizer Parameter

In [None]:
# Minimum sentence length in tokens; shorter sentences are ignored while localizing.
min_length = 3

# Matcher localizer (deprecated; its switches stay commented out).
#use_matcher_localizer = False
#pattern_terms_file_name = "pattern_terms.json"
#weight_matcher = 1

# Regex localizer: switch, voting weight, and the JSON file with the expressions.
use_regex_localizer, weight_regex = True, 2
pattern_expressions_file_name = "pattern_expressions.json"

# SVM localizer: switch, voting weight, model directory, stop-word flag.
use_svm_localizer, weight_svm = True, 1
svm_models_directory = "SVM_Models"
svm_remove_stopwords = True

# Bayes localizer: switch, voting weight, model directory, preprocessing flags.
use_bayes_localizer, weight_bayes = True, 1
bayes_models_directory = "Bayes"
bayes_remove_stopwords, bayes_lemmatize = True, False



### Voter

    use_uniform:
        True: Demokratie der einzelnen Localizer mit einfacher Mehrheit. Übrige Parameter werden ignoriert
        False: Verwendung der einzelnen Gewichte
    
    use_weight_Ratio: 
        True: Threshold proportional zur Summe der Gewichte. weight_threshold wird ignoriert
        False: Fester Wert für Threshold. weight_Ratio wird ignoriert
    
    weight_Ratio: 
        Nur bei use_weight_Ratio = True. Threshold als Anteil von Summe der Gewichte
    
    weight_threshold:
        Nur bei use_weight_Ratio = False. Fester Wert für Threshold

In [None]:
# Voting mode: uniform majority vote vs. weighted vote, and whether the
# threshold is derived from the weight sum (ratio) or taken as a fixed value.
use_uniform, use_weight_Ratio = False, False

# Ratio of the weight sum (ratio mode) and fixed threshold (fixed mode).
weight_Ratio, weight_threshold = 0.5, 2

### Abstandsberechnung<br>
`distance_fkt` wird für jede Anleitung ausgewertet; `distance_threshold` kommt nur bei *create_graph = True* zum Einsatz

In [None]:
# Distance function handed to calculate_distance; enable exactly one.
#distance_fkt = distance_minimum
#distance_fkt = distance_average
#distance_fkt = distance_hausdorff
distance_fkt = distance_cuddle

# Largest distance for which two different patterns still get a graph edge.
distance_threshold = 0.3

### Graphische Darstellung

In [None]:
# Edge appearance: width given to an edge at distance 0, and the edge colour.
maximum_edge_size, edge_color = 3, '#303060A0'

# Node appearance.
node_size, node_color, alpha = 1000, 'blue', 0.5

# Text sizes for node labels and the plot title.
font_size, title_size = 20, 20

### Debug

In [None]:
# When True, the processing loop prints every localizer's result, the voter
# result, and the distance tables for all four distance functions.
verbose = True

## Initialisierung

REST Client

In [None]:

# REST client from the REST notebook loaded via %run.
client = empamos_rest_client()
# Lookup used later as pattern_names[pattern_id] when labelling graph nodes;
# presumably maps pattern IDs to display names - TODO confirm against REST.ipynb.
pattern_names = client.get_patterns_name_dictionary()

Ausgabeverzeichnis erstellen

In [None]:
# Create every output directory up front; directories that already exist are fine.
for output_dir in (graph_output_directory,
                   object_output_directory,
                   sentences_output_directory):
    os.makedirs(output_dir, exist_ok=True)



Tokenizer Initialisieren

In [None]:
# Load the German spaCy pipeline. The "de" shortcut link only works on
# spaCy 2.x; spaCy 3 rejects shortcut names with an OSError, so fall back to
# an installed package name in that case.
# NOTE(review): the fallback picks the small model - confirm this matches the
# model the "de" link used to point to.
try:
    nlp = spacy.load("de")
except OSError:
    nlp = spacy.load("de_core_news_sm")

Alle Arten von Pattern-Lokalisierern initialisieren<br><br>

An dieser Stelle können beliebige Localizer hinzugefügt werden.<br>
Es wird empfohlen, einen Schalterparameter und einen Gewichtungsparameter einzuführen. Diese und weitere Parameter können im Abschnitt „Parameter“ unter „Localizer Parameter“ verwaltet werden.<br>
Nach der Initialisierung müssen die Localizer in die Liste `localizers` eingetragen werden, die Bezeichnung in die Liste `localizers_names` und das Gewicht in die Liste `weights`.<br><br>
Z.B.:<br>
```
localizers.append(myLocalizer)
localizers_names.append("My Localizer Name")
weights.append(weight_myLocalizer)
```
<br>

In [None]:
# Parallel lists filled by the localizer cells below: the localizer objects,
# their display names, and their voting weights (same index = same localizer).
localizers, localizers_names, weights = [], [], []

~~PatternMatcher-Lokalisierer~~

# Deprecated: matcher-based localizer, kept for reference only.
# NOTE(review): use_matcher_localizer, pattern_terms_file_name and
# weight_matcher are commented out in the parameter cell, and this notebook
# has no visible `import json`, so this snippet fails if executed as is.
if use_matcher_localizer:
    patloc = matcher_localizer()
    # NOTE(review): the file handle is never closed.
    myFile = open(pattern_terms_file_name, "r")
    pattern_terms = myFile.read()
    pattern_terms = json.loads(pattern_terms)
    patloc.init_matcher(nlp,pattern_terms)
    # Register the localizer together with its display name and voting weight.
    localizers.append(patloc)
    localizers_names.append("Matcher Lower")
    weights.append(weight_matcher)

~~PatternMatcher-Lokalisierer basierend auf Lemmata~~

# Deprecated: matcher-based localizer initialised via init_matcher_lemma,
# kept for reference only.
# NOTE(review): use_matcher_localizer, pattern_terms_file_name and
# weight_matcher are commented out in the parameter cell, and this notebook
# has no visible `import json`, so this snippet fails if executed as is.
if use_matcher_localizer:
    lemloc = matcher_localizer()
    # NOTE(review): the file handle is never closed.
    myFile = open(pattern_terms_file_name, "r")
    pattern_terms = myFile.read()
    pattern_terms = json.loads(pattern_terms)
    lemloc.init_matcher_lemma(nlp,pattern_terms)
    # Register the localizer together with its display name and voting weight.
    localizers.append(lemloc)
    localizers_names.append("Matcher Lemma")
    weights.append(weight_matcher)

**Regex Lokalisierer**

In [None]:
if use_regex_localizer:
    # Imported locally because no `import json` is visible in this notebook;
    # the cell must not depend on another notebook leaking the module.
    import json

    regloc = regex_localizer()
    # JSON is UTF-8 by specification; an explicit encoding keeps non-ASCII
    # expressions intact on platforms with a different default encoding.
    with open(pattern_expressions_file_name, "r", encoding="utf-8") as myFile:
        pattern_expressions = json.load(myFile)
    # NOTE(review): the meaning of the second argument (True) is defined in
    # RegexLocalizer.ipynb - confirm there.
    regloc.init(pattern_expressions, True)
    regloc.set_min_sentence_length(min_length)

    # Register the localizer together with its display name and voting weight.
    localizers.append(regloc)
    localizers_names.append("Regex")
    weights.append(weight_regex)

**Bayes Lokalisierer**

In [None]:

if use_bayes_localizer:
    # Build the Bayes localizer with its preprocessing flags, then load the models.
    bayloc = BayesLocalizer(bayes_remove_stopwords, bayes_lemmatize)
    bayloc.load_all_models(bayes_models_directory)

    # Register weight, display name and localizer at the same index.
    weights.append(weight_bayes)
    localizers_names.append("Bayes")
    localizers.append(bayloc)

Bayes-Lokalisierer Handselected initialisieren

```
bayloc = BayesLocalizer()
bayloc.load_all_models()
localizers.append(bayloc)
localizers_names.append("Bayes")
weights.append(100)
```

**SVM Lokalisierer**

In [None]:
if use_svm_localizer:
    # Load the SVM models first, then apply the stop-word setting.
    svmloc = SVMLocalizer()
    svmloc.load_all_models(svm_models_directory)
    svmloc.remove_stopwords = svm_remove_stopwords

    # Register weight, display name and localizer at the same index.
    weights.append(weight_svm)
    localizers_names.append("SVM")
    localizers.append(svmloc)

**Localizer hier einfügen**

Gegebenenfalls Zielgewicht berechnen

In [None]:
# In ratio mode the threshold is a fraction of the summed localizer weights;
# otherwise the fixed weight_threshold from the parameter cell stays in effect.
if use_weight_Ratio == True:
    total_weight = sum(weights)
    weight_threshold = total_weight * weight_Ratio

Verbindung mit Datenbank herstellen

In [None]:

# Open an authenticated Cassandra session on the configured keyspace, using
# the connection settings from the "Datenbank" parameter cell.
ap = PlainTextAuthProvider(
    username=db_username,
    password=db_password,
)
cluster = Cluster(
    contact_points=[cluster_address],
    auth_provider=ap,
)
session = cluster.connect(keyspace=db_keySpace)

## Anleitungen verarbeiten

Gewünschte Anleitungen aus der Datenbank laden.

In [None]:
# Run the configured CQL query; the processing loop reads .sourceuid and .text
# from every returned row.
rows = session.execute(query)


# Alternative queries for individual manuals, kept for manual testing.
#rows = session.execute("select * from working_text limit 4")

#rows = session.execute("select * from working_text where  sourceuid = '61352503f164d56a7d013f29cbe330fb'") #Hanabi
#rows = session.execute("select * from working_text where  sourceuid = 'ece99495e009ec6037ef86220d880436'") #Tzolkin
#rows = session.execute("select * from working_text where  sourceuid = '46a0e7d814e78b2978f6c703fefefabd'") #7Wonders
#rows = session.execute("select * from working_text where  sourceuid = '6b05437860f54f19f6eca4b38825807c'") #Bug!!




# Look at the text of the first returned manual.
rows[0].text

*Debug*<br>
Dummy-Klasse, um ein Cassandra-ResultSet zu simulieren. Wird verwendet, um eigene Texte zu Testzwecken zu analysieren.<br>
Für den Produktivbetrieb auskommentieren!

class dummy_class:
    """Bare stand-in for a database row; attributes are attached after creation."""

    def dummy(self):
        print("Dummy")


# Replace the query result with one hand-written manual for debugging.
rows = [dummy_class()]
rows[0].sourceuid = '12345'
#rows[0].text =  "Siegpunkt! Punkt! Siegpunkt! Dies ist ein toller Test. Siegpunkte! Ganz toller Test. Punkte werden gepunktet. Keine Siegpunkte. Nimm den Wertungsblock. Das ein wertungsblock. Das Wetter ist Schön. EMPAMOS RULZ HARDCORE Bestes IT Projekt evar!"
rows[0].text =  "Jeder Spieler stellt seinen Meeple auf sein Startfeld. Der rote Spieler erhält 5 Eisen. Der blaue Spieler erhält 2 Holz und 2 Getreide. Jeder nimmt sich zwei Aktionskarten. Alle Spieler stellen ihre Superkühe auf die jeweiligen Anfangsfelder. OCRMÜLL. OCRSUPERMÜLL. Der Blaue Spieler beginnt."

rows[0].text

Alle Anleitungen verarbeiten

Je Anleitung:<br>
-Anleitung tokenisieren<br>
-Tokenisierte Anleitung mit allen Pattern-Lokalisierern durchsuchen<br>
-Dabei entstandene GamePattern Objekte mittels Voter in ein einzelnes GamePattern Objekt überführen<br>
-Abstände zwischen den einzelnen Patterns berechnen<br>

In [None]:
def _build_edge_frame(pattern_ids, distances, names, threshold, max_edge_size):
    """Return one game's pattern graph as an edge-list DataFrame.

    Every pattern is paired with itself and with each pattern after it in
    ``pattern_ids``. A pair becomes an edge when its distance is at most
    ``threshold``; the pair of a pattern with itself always becomes an edge
    (presumably so that isolated patterns still appear as nodes).

    pattern_ids   -- list of pattern keys, in the order they should be paired
    distances     -- nested mapping, read as distances[id_a][id_b]
    names         -- mapping from pattern key to the name used in node labels
    threshold     -- largest distance that still yields an edge
    max_edge_size -- edge value assigned to distance 0

    The rows are collected in a list and turned into a DataFrame once,
    because DataFrame.append was removed in pandas 2.0.
    """
    edge_rows = []
    for i, s in enumerate(pattern_ids):
        for j in range(i, len(pattern_ids)):
            t = pattern_ids[j]
            d = distances[s][t]
            if d <= threshold or i == j:
                # Linear scale: distance 0 -> max_edge_size; the 0.01 offset
                # keeps the value positive at d == threshold.
                value = max_edge_size * (threshold + 0.01 - d) / (threshold + 0.01)
                edge_rows.append({'source': str(s) + ":" + names[s],
                                  'target': str(t) + ":" + names[t],
                                  'value': value})
    return pd.DataFrame(edge_rows, columns=['source', 'target', 'value'])


def _draw_pattern_graph(graph, title):
    """Plot ``graph`` on a circular layout using the notebook's plot parameters."""
    #pos = nx.kamada_kawai_layout(graph, weight='value')
    pos = nx.circular_layout(graph)
    plt.figure(figsize=(20, 20))
    nx.draw(graph, pos,
            node_size=node_size,
            node_color=node_color,
            font_size=font_size,
            alpha=alpha,
            with_labels=True)
    plt.title(title, size=title_size)

    # Draw every edge again with a width taken from its 'value' attribute.
    # NOTE(review): the edge alpha is fixed at 0.5 and does not follow `alpha`.
    for node1, node2, data in graph.edges(data=True):
        nx.draw_networkx_edges(graph, pos,
                               edgelist=[(node1, node2)],
                               width=data['value'],
                               edge_color=edge_color,
                               alpha=0.5)
    plt.show()


all_games_patterns = []
start = time.time()
game_count = 0
for row in rows:
    print("Processing game manual " + str(row.sourceuid))
    game_count += 1

    # Run the manual through spaCy and wrap the result for the localizers.
    processed_game = nlp_processed_game()
    processed_game.set_ID(row.sourceuid)
    processed_game.set_spacy_doc(nlp(row.text))

    # Let every registered localizer produce its own game-patterns object.
    current_game_patterns = []
    for idx, localizer in enumerate(localizers):
        gp = localizer.read_nlp_processed_game_to_game_patterns(processed_game)
        current_game_patterns.append(gp)
        if verbose:
            # A localizer registered without a name is reported as "undefined".
            print(localizers_names[idx] if idx < len(localizers_names) else "undefined")
            gp.print_verbose(pattern_names)

    # Merge the per-localizer results through the voter.
    if use_uniform:
        merged_game_patterns = merge_game_patterns_uniform(current_game_patterns)
    else:
        merged_game_patterns = merge_game_patterns(current_game_patterns, weights, weight_threshold)

    if verbose:
        print()
        print("Voter result")
        merged_game_patterns.print_verbose(pattern_names)

        # Show the distance table for every available distance function.
        for label, fkt in (("Minimum", distance_minimum),
                           ("Average", distance_average),
                           ("Hausdorff", distance_hausdorff),
                           ("Cuddle", distance_cuddle)):
            print(label)
            print(calculate_distance(merged_game_patterns.patterns, fkt))
            print("")
        print("")

    # A game without patterns is reported but still stored and exported.
    if len(merged_game_patterns.patterns) == 0:
        print("no pattern found in this game")

    all_games_patterns.append(merged_game_patterns)
    object_path = os.path.join(object_output_directory,
                               object_filename_prefix + str(row.sourceuid) + ".json")
    with open(object_path, "w") as f:
        f.write(merged_game_patterns.toJSON())

    if export_sentences:
        sentences_path = os.path.join(sentences_output_directory,
                                      sentences_filename_prefix + str(row.sourceuid) + ".txt")
        with open(sentences_path, "w") as f:
            f.write(merged_game_patterns.get_sentences_with_patterns(pattern_names))

    dists = calculate_distance(merged_game_patterns.patterns, distance_fkt)
    print(dists)

    if not create_graph:
        continue

    # Build the graph from the thresholded pattern distances.
    df = _build_edge_frame(list(merged_game_patterns.patterns.keys()),
                           dists,
                           pattern_names,
                           distance_threshold,
                           maximum_edge_size)
    G = nx.from_pandas_edgelist(df,
                                source='source',
                                target='target',
                                edge_attr=True)

    # Write the graph to a GraphML file.
    if export_graph:
        nx.write_graphml(G, os.path.join(graph_output_directory,
                                         graph_filename_prefix + str(row.sourceuid) + ".graphml"))

    _draw_pattern_graph(G, 'Graph Visualization for ' + str(merged_game_patterns.get_ID()))
    

In [None]:

# Report how many manuals were processed and how long the run took.
end = time.time()
print("Processed " + str(game_count) + " games.")
print("Elapsed Time: " + str(end - start))

# Show the first result as a sample; guard against a query that returned no
# rows, where indexing the empty list would raise IndexError.
if all_games_patterns:
    print(all_games_patterns[0].toJSON())
else:
    print("No game manuals were processed.")