In [20]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.layers import Input, Embedding, Dense, Flatten
from keras.layers import LSTM, Dropout
from keras.models import Model
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import accuracy_score


In [21]:
def save_pickle(filename, objeto):
    """Serialize ``objeto`` with pickle and write it to ``filename``.

    Args:
        filename: Path of the file to create or overwrite (opened in binary mode).
        objeto: Any picklable Python object.

    Returns:
        None.
    """
    # The context manager closes the file even when pickle.dump raises,
    # so a failed dump no longer leaks an open handle.
    with open(filename, 'wb') as outfile:
        pickle.dump(objeto, outfile)

def load_pickle(filename):
    """Read ``filename`` and return the object pickled inside it.

    Args:
        filename: Path of an existing pickle file (opened in binary mode).

    Returns:
        The unpickled Python object.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only use this on files from a trusted source.
    # The context manager closes the file even when unpickling fails.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

def create_model(vocabulary_size, seq_len):
    """Build and compile the character-level LSTM classifier.

    The function also reads three module-level names that must exist before
    it is called: ``maxlen`` (input length), ``embedding_weights`` (initial
    embedding matrix) and ``num_of_classes`` (width of the softmax output).

    Args:
        vocabulary_size: Number of rows of the embedding table (first
            positional argument of ``Embedding``).
        seq_len: Width of each embedding vector (second positional argument
            of ``Embedding``). Despite its name this is not the sequence
            length; that comes from the global ``maxlen``.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Input(shape=(maxlen,)),
        Embedding(vocabulary_size, seq_len, weights=[embedding_weights], input_length=maxlen),
        # return_sequences=True keeps one output per time step, flattened below
        LSTM(64, return_sequences=True, activation='relu'),
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_of_classes, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [22]:
# Fixed sequence length: used as the pad_sequences target length and as the
# input length of the model built by create_model.
maxlen = 25

In [23]:
# Load the pickled word base: a mapping whose values expose 'Gender' and 'ORTH'
base = load_pickle('Worter.p')

# Extract the article ('Gender') and the written word ('ORTH') of every entry,
# both in the iteration order of the mapping so the two lists stay aligned
artikels = [entry['Gender'] for entry in base.values()]
worter = [entry['ORTH'] for entry in base.values()]

df = pd.DataFrame({'artikel': artikels, 'wort': worter})
df

Unnamed: 0,artikel,wort
0,Die,Zeit
1,Der,Man
2,Die,Hand
3,Die,Tag
4,Der,Weg
...,...,...
2605,Die,Haupt
2606,Das,Erlebnis
2607,Der,Datensatz
2608,Das,Geheimnis


In [24]:
# Cleaning dataset: keep only the rows whose article is Der, Die or Das.
# .copy() makes the filtered frame an independent object, so the column
# assignment below no longer raises SettingWithCopyWarning.
df = df.loc[df['artikel'].isin(['Der', 'Die', 'Das'])].copy()

# Lower-case every word so the character vocabulary has no upper-case entries
df['wort'] = df['wort'].str.lower()
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wort'] = df['wort'].str.lower()


Unnamed: 0,artikel,wort
0,Die,zeit
1,Der,man
2,Die,hand
3,Die,tag
4,Der,weg
...,...,...
2605,Die,haupt
2606,Das,erlebnis
2607,Der,datensatz
2608,Das,geheimnis


In [25]:
# Hold out 20% of the rows as the test split; the fixed random_state makes
# the shuffled partition repeatable
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Raw word strings of each split
train_texts, test_texts = train_df['wort'].values, test_df['wort'].values

In [26]:
# Character-level tokenizer fitted on the training words only; 'UNK' is the
# out-of-vocabulary token
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)
print(tk.word_index)

# Convert every word to a list of character indices.
# The test sequences get their own name: the original overwrote test_texts,
# so the name changed meaning (strings -> index lists) and re-running this
# cell tokenized already-tokenized data.
train_sequences = tk.texts_to_sequences(train_texts)
test_sequences = tk.texts_to_sequences(test_texts)

# Pad (or truncate) at the front so every sequence has exactly maxlen entries
train_data = pad_sequences(train_sequences, maxlen=maxlen, padding='pre')
test_data = pad_sequences(test_sequences, maxlen=maxlen, padding='pre')

# Convert to float32 numpy arrays
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

{'UNK': 1, 'e': 2, 'n': 3, 'r': 4, 't': 5, 'a': 6, 'i': 7, 's': 8, 'l': 9, 'h': 10, 'u': 11, 'g': 12, 'o': 13, 'c': 14, 'k': 15, 'm': 16, 'f': 17, 'b': 18, 'd': 19, 'p': 20, 'z': 21, 'w': 22, 'v': 23, 'ü': 24, 'ä': 25, 'ö': 26, 'j': 27, 'y': 28, 'ß': 29, 'x': 30, 'q': 31}


In [27]:
# Encode the article strings as integer class ids. The encoder is fitted on
# the full cleaned frame so both splits share one label -> id mapping.
le = LabelEncoder()
le = le.fit(df['artikel'])

train_class_ids = le.transform(train_df['artikel'].values)
test_class_ids = le.transform(test_df['artikel'].values)

# One-hot encode with an explicit width. Without num_classes, to_categorical
# sizes the matrix from the largest id present in that split, so a split that
# lacks the last class would get a narrower matrix than the other one.
n_classes = len(le.classes_)
train_classes = to_categorical(train_class_ids, num_classes=n_classes)
test_classes = to_categorical(test_class_ids, num_classes=n_classes)

In [28]:
vocab_size = len(tk.word_index)

# One-hot initial embedding matrix of shape (vocab_size + 1, vocab_size):
# row 0 (the padding index) is all zeros and row i has a single 1 in column
# i - 1. Tokenizer indices run from 1 to vocab_size, so an identity matrix
# stacked under a zero row reproduces the original per-character loop.
embedding_weights = np.vstack([np.zeros((1, vocab_size)), np.eye(vocab_size)])

# Parameters, derived from the data instead of hard-coded (previously 31 and
# 3) so they stay consistent with embedding_weights and the label encoder if
# the dataset changes
embedding_size = vocab_size
num_of_classes = len(le.classes_)

# NOTE(review): both callbacks watch the *training* loss, not 'val_loss', and
# the validation data passed to fit() below is the test split. Confirm this
# is intended.
es = EarlyStopping(monitor='loss', patience=10, verbose=1, restore_best_weights=True)
rlr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=3, verbose=1)

model = create_model(vocab_size + 1, embedding_size)
model.summary()

# Model training
history = model.fit(train_data, train_classes,
                    validation_data=(test_data, test_classes),
                    batch_size=32,
                    epochs=30,
                    verbose=1,
                    callbacks=[es, rlr])

#model.load_weights('pesos.h5')

# model.evaluate returns [loss, accuracy] for the compiled metrics
accuracy = model.evaluate(test_data, test_classes)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accuracy[0], accuracy[1]))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 31)            992       
                                                                 
 lstm_1 (LSTM)               (None, 25, 64)            24576     
                                                                 
 flatten_1 (Flatten)         (None, 1600)              0         
                                                                 
 dense_2 (Dense)             (None, 256)               409856    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 3)                 771       
                                                                 
Total params: 436195 (1.66 MB)
Trainable params: 43619

In [29]:
# Class scores -> index of the highest score per row -> article label
class_scores = model.predict(test_data)
predicted_ids = np.argmax(class_scores, axis=1)
predictions = le.inverse_transform(predicted_ids)

# Side-by-side table of each test word, its true article and the prediction
results = pd.DataFrame({
    'Wort': test_df['wort'].values,
    'Real Artikel': test_df['artikel'].values,
    'Predicted Artikel': predictions,
})
results



Unnamed: 0,Wort,Real Artikel,Predicted Artikel
0,schal,Der,Das
1,iris,Die,Die
2,gegenleistung,Die,Die
3,zeitung,Die,Die
4,karriere,Die,Die
...,...,...,...
513,familie,Die,Die
514,kran,Der,Der
515,ausrüstung,Die,Die
516,ende,Das,Die


In [30]:
# Split the results table by the true article of each word
real_artikel = results['Real Artikel']
results_der = results[real_artikel.eq('Der')]
results_die = results[real_artikel.eq('Die')]
results_das = results[real_artikel.eq('Das')]

In [31]:
# Share of the words whose true article is 'Der' that were predicted as 'Der'
accuracy_der = accuracy_score(
    y_true=results_der['Real Artikel'],
    y_pred=results_der['Predicted Artikel'],
)
accuracy_der


0.6235955056179775

In [32]:
# Share of the words whose true article is 'Die' that were predicted as 'Die'
accuracy_die = accuracy_score(
    y_true=results_die['Real Artikel'],
    y_pred=results_die['Predicted Artikel'],
)
accuracy_die

0.7370689655172413

In [33]:
# Share of the words whose true article is 'Das' that were predicted as 'Das'
accuracy_das = accuracy_score(
    y_true=results_das['Real Artikel'],
    y_pred=results_das['Predicted Artikel'],
)
accuracy_das

0.42592592592592593

In [34]:
def custom_ending(wort_ending):
    """Print the prediction accuracy for the test words ending in ``wort_ending``.

    Reads the module-level ``results`` frame and uses its 'Wort',
    'Real Artikel' and 'Predicted Artikel' columns.

    Args:
        wort_ending: Suffix matched against the 'Wort' column with
            ``str.endswith``.

    Returns:
        None; the accuracy, rounded to two decimals, is only printed.
    """
    result_textpart = results.loc[results['Wort'].str.endswith(wort_ending)]
    if result_textpart.empty:
        # No test word has this ending, so an accuracy over zero samples is
        # undefined; report that instead of scoring an empty selection.
        print(f'Accuracy: "{wort_ending}": n/a (no matching words)')
        return
    accuracy_textpart = accuracy_score(result_textpart['Real Artikel'], result_textpart['Predicted Artikel'])
    print(f'Accuracy: "{wort_ending}": {round(accuracy_textpart,2)}')

# Word endings to report on; the trailing comment is the article noted for
# each ending in the original cell
endings_to_check = (
    'er',    # Der
    'en',    # Der
    'keit',  # Die
    'heit',  # Die
    'tät',   # Die
    'e',     # Die
    'chen',  # Das
    'a',     # Das
)
for ending in endings_to_check:
    custom_ending(ending)

Accuracy: "er": 0.62
Accuracy: "en": 0.46
Accuracy: "keit": 1.0
Accuracy: "heit": 0.75
Accuracy: "tät": 1.0
Accuracy: "e": 0.77
Accuracy: "chen": 0.75
Accuracy: "a": 0.8
