In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import string
from matplotlib import pyplot as plt
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split

In [None]:
def plot_scores(train):
    """Plot training and validation accuracy against the epoch index.

    Args:
        train: Object exposing a ``history`` mapping with the keys
            ``'accuracy'`` and ``'val_accuracy'`` (one value per epoch),
            such as the value returned by ``model.fit``.

    Returns:
        None. The figure is drawn with the pyplot state machine and shown.
    """
    history = train.history
    train_curve = history['accuracy']
    val_curve = history['val_accuracy']
    epoch_axis = range(len(train_curve))

    # (values, matplotlib format string, legend label) for each curve.
    curves = (
        (train_curve, 'b', 'Score apprentissage'),
        (val_curve, 'r', 'Score validation'),
    )
    for values, fmt, legend_label in curves:
        plt.plot(epoch_axis, values, fmt, label=legend_label)

    plt.title('Scores')
    plt.legend()
    plt.show()

In [None]:
# Load the semicolon-separated dataset. `sep` must be passed by keyword:
# positional use is deprecated in pandas 1.3+ and rejected by pandas 2.x.
df = pd.read_csv('generated.csv', sep=';')
# Shuffle all rows; a fixed seed keeps the order identical across re-runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Class balance: number of rows per label.
df["label"].value_counts()

In [None]:
# Raw texts (features) and their labels (targets).
X, y = df["text"], df["label"]

In [None]:
def preprocessing(X, y=None, max_words=200):
    """Clean, tokenize and pad raw texts; optionally one-hot encode labels.

    Each text is lower-cased, has its newline characters removed, and has
    every character of ``string.punctuation`` replaced by a space. The cleaned
    texts are converted to integer sequences by the pickled tokenizer and
    padded (``padding='post'``) to ``max_words`` entries.

    Args:
        X: Iterable of strings to encode.
        y: Optional labels exposing ``.map`` / boolean indexing (e.g. a pandas
            Series) whose values are 'address', 'contact' or 'other'.
        max_words: Length every encoded sequence is padded to.

    Returns:
        The padded sequences when ``y`` is None, otherwise the tuple
        ``(padded_sequences, one_hot_labels)``.

    Raises:
        ValueError: If ``y`` contains a label outside the known set.
    """
    label_ids = {'address': 0, 'contact': 1, 'other': 2}

    # NOTE: pickle.load can execute arbitrary code; only load a tokenizer
    # file that comes from a trusted source.
    with open('../txt-classification/tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    cleaned = [
        sentence.lower().replace('\n', '').translate(translator)
        for sentence in X
    ]

    X = tokenizer.texts_to_sequences(cleaned)
    X = sequence.pad_sequences(X, maxlen=max_words, padding='post')

    if y is None:
        return X

    encoded = y.map(label_ids)
    unmapped = encoded.isna()
    if unmapped.any():
        # Unknown labels map to NaN; fail loudly instead of encoding garbage.
        raise ValueError(
            'Unknown label(s): {}'.format(y[unmapped].unique().tolist())
        )
    # Fix the width so the one-hot matrix always has one column per known
    # class, even when a class is absent from this particular batch.
    y = to_categorical(encoded, num_classes=len(label_ids))
    return X, y

In [None]:
# Encode texts and labels. X and y are rebound here, so this cell must be
# run only once after the cell that selects them from `df`.
X, y = preprocessing(X, y=y)

In [None]:
# Hold out 20% of the samples; a fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Conv1D, MaxPool1D, Flatten, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Text classifier: two parallel convolution branches over a shared embedding,
# concatenated and reduced by a small convolution stack before a softmax.
SEQ_LEN = 200     # must equal the padding length (`max_words`) used by preprocessing
VOCAB_SIZE = 500  # embedding rows; assumes token ids stay below this — TODO confirm against the tokenizer
EMBED_DIM = 200
N_CLASSES = 3     # address / contact / other

# Named `inputs` rather than `input` so the Python builtin is not shadowed.
inputs = Input(shape=(SEQ_LEN,))
# `input_length` is omitted: it is deprecated, and the sequence length is
# already fixed by the Input layer above.
embed = Embedding(VOCAB_SIZE, EMBED_DIM)(inputs)

# Branch 1: per-token (kernel size 1) features.
conv1 = Conv1D(64, 1, activation='relu')(embed)
drop1 = Dropout(0.2)(conv1)
maxpool1 = MaxPool1D()(drop1)

# Branch 2: 5-token context; "same" padding keeps the length equal to branch 1
# so both branches can be concatenated along the channel axis.
conv2 = Conv1D(64, 5, activation='relu', padding="same")(embed)
drop2 = Dropout(0.2)(conv2)
maxpool2 = MaxPool1D()(drop2)

concat = Concatenate()([maxpool1, maxpool2])

conv3 = Conv1D(64, 3, activation='relu')(concat)
maxpool3 = MaxPool1D()(conv3)
conv4 = Conv1D(32, 3, activation='relu')(maxpool3)
conv5 = Conv1D(8, 3, activation='relu')(conv4)

flatten = Flatten()(conv5)

dense = Dense(N_CLASSES, activation='softmax')(flatten)

model = Model(inputs, dense)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Render the layer graph, annotated with each layer's tensor shapes.
plot_model(model=model, show_shapes=True)

In [None]:
# Stop when validation accuracy has not improved by at least 0.001 for 70
# epochs, and restore the weights of the best epoch.
es = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=70,
    mode='max',
    restore_best_weights=True,
)

# Validation is run on the held-out (X_test, y_test) split.
train = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=700,
    verbose=2,
    callbacks=[es],
    validation_data=(X_test, y_test),
)

In [None]:
# Training vs. validation accuracy for the run above.
plot_scores(train=train)

In [None]:
# Persist the trained model (architecture + weights) as an HDF5 file.
model.save(filepath='weights/0_addr_identification.h5')