__<font size="6" weight="bold">SVM Localizer</font>__

Diese Klasse verwaltet ein Dictionary mit den trainierten Modellen für die einzelnen Pattern. <br>
Bei der Initialisierung ist das Dictionary leer; die Modelle werden anschließend mit train() neu erstellt oder mit load_all_models() bzw. load_model() geladen, falls sie bereits vorhanden sind. <br>

## Einbinden der Bibliotheken

In [1]:
import spacy
import pickle
import os 
import os.path as path
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

## Rest Client erstellen

In [2]:
%run ./REST.ipynb
# Create the REST client instance used by this notebook.
# NOTE(review): empamos_rest_client is presumably defined by the REST.ipynb
# notebook executed by the %run line above - confirm.
client = empamos_rest_client()

## Game_Pattern Klasse laden

In [3]:
%run ./GamePatterns.ipynb

## nlp_processed_game Klasse laden

In [4]:
%run ./nlpProcessedGame.ipynb

## Klasse SVMLocalizer

__Parameter:__ <br>
__directoryname__: Name des Verzeichnisses, in dem die Modelle liegen <br>
__minSentenceLength__: Mindestanzahl der Wörter pro Satz <br>
__trueValue__: Gibt an, welchen Wert der Klassifikator ausgibt, wenn ein Satz das gesuchte Pattern enthält (0 bei den Modellen von Matthias Kriegbaum | 1 bei unseren Modellen)  <br>
__remove_stopwords__: Gibt an, ob Stopwörter entfernt werden sollen (True = ja | False = nein)


__Funktionen:__

__ctor__: Initialisieren <br>
__load_all_models(filepath)__: Lädt alle trainierten Modelle. Filepath ist optional. Ist keiner angegeben, werden die Modelle aus dem Verzeichnis mit dem Namen directoryname geladen <br>
__load_model(id, filepath)__: Lädt ein trainiertes Modell über id und filepath. Filepath ist optional <br>
__train(id, dataframe)__: Trainiert ein Modell mit dem übergebenen Dataframe und speichert es unter der angegebenen id <br>
__read_nlp_processed_game_to_game_patterns(processedManual)__: Erhält eine NLP-verarbeitete Anleitung als Übergabeparameter und gibt die Game_Pattern für diese Anleitung zurück.

In [8]:
class SVMLocalizer:
    """Train, persist, load and apply one SVM sentence classifier per pattern.

    Class-level settings (may be overridden on the class or on an instance):

    * ``directoryname``: folder, relative to the current working directory,
      in which trained models are stored and looked up.
    * ``minSentenceLength``: minimum number of whitespace-separated words a
      sentence must have to be reported as a match.
    * ``trueValue``: label a model emits when a sentence contains the pattern.

    Stop-word switch: the method ``remove_stopwords`` occupies the class
    attribute of that name, so a class-level boolean with the same name
    cannot coexist with it (the ``def`` would silently replace it). Stop-word
    removal is therefore enabled by default; assign
    ``instance.remove_stopwords = False`` to disable it. Assigning ``True``
    is accepted as well and keeps removal enabled.
    """

    # Class-level default only; every instance gets its own dict in __init__.
    modelDictionary = {}
    directoryname = "SVM_Models"
    minSentenceLength = 3
    trueValue = 1

    # Single source of truth for the custom stop-word list, so that the
    # existence check and the open() call can never disagree on the name.
    # NOTE(review): the original code checked 'Stopwords/Stopword.txt' but
    # opened 'Stopwords/Stopwords.txt'; the opened name is assumed to be the
    # intended one - confirm against the repository layout.
    _STOPWORD_FILE = "Stopwords/Stopwords.txt"

    def __init__(self):
        """Start with an empty model dictionary; no model is loaded here."""
        self.modelDictionary = {}

    def _default_model_folder(self):
        """Return ``<current working directory>/<directoryname>``."""
        return f'{os.getcwd()}/{self.directoryname}'

    def load_all_models(self, filepath=''):
        """Load every ``*.pkl`` model found in a folder.

        Args:
            filepath: Folder to scan. When empty, the folder named
                ``directoryname`` below the current working directory is used.

        Raises:
            OSError: If the folder cannot be listed (e.g. it does not exist).
        """
        if filepath == '':
            filepath = self._default_model_folder()

        for file in os.listdir(filepath):
            if file.endswith(".pkl"):
                # 'Pattern-<id>.pkl' -> '<id>'
                pattern_id = str(file).replace('Pattern-', '').replace('.pkl', '')
                self.load_model(pattern_id, filepath)

    def load_model(self, id, filepath=''):
        """Load the model of one pattern into ``modelDictionary[id]``.

        A missing file is skipped silently. A file that exists but cannot be
        loaded is reported on stdout and otherwise ignored (best effort).

        Args:
            id: Pattern id; the file name is derived via ``get_filename``.
            filepath: Folder containing the model. When empty, the folder
                named ``directoryname`` below the current working directory
                is used.
        """
        if filepath == '':
            filepath = self._default_model_folder()

        file = self.get_filename(filepath, id)
        if not path.exists(file):
            return

        try:
            # NOTE: joblib.load unpickles the file; only load model files
            # from trusted sources.
            self.modelDictionary[id] = load(file)
        except Exception as error:
            # Keep the best-effort behaviour, but no longer hide the cause.
            print('Fehler beim Dateizugriff. Prüfen Sie die Eigenschaft <directoryName>! ', error)

    def train(self, id, dataframe):
        """Grid-search a LinearSVC text pipeline and store the fitted search.

        The fitted ``GridSearchCV`` object is dumped to the file returned by
        ``get_filename`` inside ``directoryname``. Accuracy, precision, recall
        and F1 on a 10 % hold-out split are written to
        ``<directoryname>/SVM_Output/Precision_Accuracy_Recall_F1_<id>.txt``.

        Args:
            id: Pattern id used for the model and report file names.
            dataframe: Indexable object; ``dataframe[0]`` is used as the
                classifier input and ``dataframe[1]`` as the labels
                (presumably sentence texts and 0/1 labels - verify against
                the caller).
        """
        folder = self._default_model_folder()
        model_file = self.get_filename(folder, id)

        output_folder = self.directoryname + "/SVM_Output"
        # makedirs also creates the parent model folder if it is missing.
        os.makedirs(output_folder, exist_ok=True)

        X = dataframe[0]
        y = dataframe[1]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=42)

        # Bag-of-n-grams -> TF-IDF weighting -> linear SVM.
        lsvc = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LinearSVC(random_state=None)),
        ])

        parameters = {
            "vect__ngram_range": [(1, 1), (1, 2), (1, 3), (1, 4)],
            "tfidf__use_idf": (True, False),
            "clf__loss": ['hinge', 'squared_hinge'],
            "clf__tol": [1e-6, 1e-5, 1e-4],
        }

        # Select the metric to optimise via scoring=<metric name>.
        gs_clf = GridSearchCV(lsvc, parameters, cv=5, n_jobs=-1, scoring='accuracy')
        gs_clf = gs_clf.fit(X_train, y_train)

        predictions = gs_clf.predict(X_test)

        # NOTE(review): pos_label=0 reports precision/recall/F1 for label 0,
        # while trueValue defaults to 1 - confirm which class is meant to be
        # the positive one before comparing these numbers.
        report_file = output_folder + "/Precision_Accuracy_Recall_F1_" + str(id) + ".txt"
        with open(report_file, "w") as report:
            report.write("Accuracy score: " + str(accuracy_score(y_test, predictions)) + "\n")
            report.write('Precision score: ' + str(precision_score(y_test, predictions, pos_label=0)) + "\n")
            report.write('Recall score: ' + str(recall_score(y_test, predictions, pos_label=0)) + "\n")
            report.write('f1_score score: ' + str(f1_score(y_test, predictions, pos_label=0)) + "\n")

        # Close the handle deterministically instead of leaking it.
        with open(model_file, 'wb') as handle:
            dump(gs_clf, handle)

    def create_sentence_index_list(self, prediction, sentences):
        """Return the indices of sentences predicted to contain the pattern.

        An index is kept when its prediction equals ``trueValue`` and the
        sentence at that index has at least ``minSentenceLength``
        whitespace-separated words.

        Args:
            prediction: Iterable of predicted labels, aligned with sentences.
            sentences: Sequence of sentence strings.

        Returns:
            List of matching indices in ascending order.
        """
        return [
            index
            for index, label in enumerate(prediction)
            if label == self.trueValue
            and len(sentences[index].split()) >= self.minSentenceLength
        ]

    def get_filename(self, filepath, id):
        """Return the model file path ``<filepath>/Pattern-<id>.pkl``."""
        return path.join(filepath, f'Pattern-{str(id)}.pkl')

    def _load_stopwords(self):
        """Return the stop-word set used for filtering.

        Reads one stop word per line from ``_STOPWORD_FILE`` when that file
        exists. Otherwise falls back to ``stopwords.words('german')``; the
        name ``stopwords`` is not imported in this file and must be provided
        by the surrounding notebook namespace (presumably NLTK's corpus
        reader - confirm).
        """
        if os.path.isfile(self._STOPWORD_FILE):
            # NOTE(review): the platform default encoding is used, as before;
            # consider pinning it if the list contains umlauts.
            with open(self._STOPWORD_FILE) as handle:
                return {line.strip() for line in handle}
        return set(stopwords.words('german'))

    @staticmethod
    def _strip_stopwords(sentence, stop_set):
        """Lower-case ``sentence`` and drop every word found in ``stop_set``.

        Each remaining word is followed by one space, so a non-empty result
        ends with a trailing space.
        """
        kept = [word.lower() for word in sentence.split()
                if word.lower() not in stop_set]
        return "".join(word + " " for word in kept)

    def _stopword_removal_enabled(self):
        """Tell whether sentences should be stop-word filtered.

        ``remove_stopwords`` normally resolves to the method below, which
        means "enabled". If a caller replaced it with a plain value on the
        instance or the class, that value's truthiness decides.
        """
        flag = getattr(self, "remove_stopwords", True)
        return True if callable(flag) else bool(flag)

    def remove_stopwords(self, sentence):
        """Return ``sentence`` lower-cased and without stop words.

        The stop-word list is loaded on every call; see ``_load_stopwords``
        for where it comes from.

        Args:
            sentence: Text whose words are separated by whitespace.

        Returns:
            The remaining words, each followed by a single space.
        """
        return self._strip_stopwords(sentence, self._load_stopwords())

    def read_nlp_processed_game_to_game_patterns(self, processedManual):
        """Classify every sentence of a processed manual with every model.

        Args:
            processedManual: Object providing ``get_spacy_doc()`` and
                ``get_ID()``.

        Returns:
            A ``game_patterns`` object carrying the manual's id, its sentence
            texts and, for each loaded model, the indices of matching
            sentences (added via ``add_pattern(int(key), indices)``). When
            the document has no sentences, no pattern is added.
        """
        doc = processedManual.get_spacy_doc()
        gamePattern = game_patterns()
        gamePattern.set_ID(processedManual.get_ID())

        sentences = list(doc.sents)
        gamePattern.set_sentences([str(sent) for sent in sentences])

        # The models are applied to the lemmatised form of each sentence.
        lemma_sentences = [str(sent.lemma_) for sent in sentences]
        if not lemma_sentences:
            return gamePattern

        if self._stopword_removal_enabled():
            # Load the list once instead of once per sentence.
            stop_set = self._load_stopwords()
            lemma_sentences = [self._strip_stopwords(sent, stop_set)
                               for sent in lemma_sentences]

        for key, model in self.modelDictionary.items():
            if not model:
                continue
            prediction = model.predict(lemma_sentences)
            matches = self.create_sentence_index_list(prediction, lemma_sentences)
            gamePattern.add_pattern(int(key), matches)

        return gamePattern
                
            