# Artificial Chris

The goal is to create a chatbot, given a variable length input sequence to output a variable length output that sounds like Chris is writing it.

In [1]:
import numpy as np
import pandas as pd

import os
import re
import glob

!pip install emoji
import emoji

# keras with tensorflow backend
import tensorflow as tf
import keras
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Flatten # , CuDNNLSTM
from keras.optimizers import RMSprop
from keras.utils import to_categorical, np_utils
from keras.callbacks import EarlyStopping

from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier



Using TensorFlow backend.


## Load Data from Txt

In [2]:
data_folder = "data"
txt_files = glob.glob(os.path.join(data_folder, '*.txt'))

In [3]:
def clean_name(name):
    corr = re.sub('[^A-Za-z0-9öüäéè ]+', '', name)
    if (corr[-1] == ' '):
        corr = corr[:-1]
    return corr

In [4]:
def get_df_from_txt_file(file_name):
    f = open(file_name)
    file_name = os.path.split(file_name)[1]
    
    date = []
    time = []
    sender = []
    reciever = []
    message = []

    partner_name = file_name[18:-4] + ": "
    chris_name = 'Christoph Bernkopf: '

    for line in f:
        if(len(line) < 2 or "Nachrichten in diesem Chat sowie Anrufe" in line):
    #         print("skip line")
            continue
        if(line[15:18] == " - " and line[2] == "." and line[5] == "."):
            date.append(line[:8])
            time.append(line[10:15])
            if (chris_name in line[18:]):
                sender.append(clean_name(chris_name[:-2]))
                reciever.append(clean_name(partner_name[:-2]))
            else:
                reciever.append(clean_name(chris_name[:-2]))
                sender.append(clean_name(partner_name[:-2]))
            message.append(line[18:].replace(partner_name, '').replace(chris_name, ''))

    df = pd.DataFrame(date, columns=['date'])
    df['time'] = time
    df['sender'] = sender
    df['reciever'] = reciever
    df['message'] = message

    return df

In [5]:
try:
    del df
except:
    None

for file_name in txt_files:
    print(file_name)
    curr_df = get_df_from_txt_file(file_name)
    if ('df' in locals()):
        df = pd.concat([df, curr_df])
    else:
        df = curr_df
    df = df.reset_index(drop=True)
    print(curr_df.shape, df.shape)

data/WhatsApp Chat mit Lukas ⛵.txt
(1958, 5) (1958, 5)
data/WhatsApp Chat mit Max Frèremano 😏💚🍁.txt
(11562, 5) (13520, 5)
data/WhatsApp Chat mit Megan ).txt
(8546, 5) (22066, 5)
data/WhatsApp Chat mit Daniel TPH Rugby.txt
(2659, 5) (24725, 5)
data/WhatsApp Chat mit Mama 😷👀.txt
(880, 5) (25605, 5)
data/WhatsApp Chat mit Daniel TPH Rideclub Marik.txt
(328, 5) (25933, 5)
data/WhatsApp Chat mit Milica Modeschule.txt
(1941, 5) (27874, 5)
data/WhatsApp Chat mit Max Maxi Frühschütz Fabian.txt
(2807, 5) (30681, 5)
data/WhatsApp Chat mit Christoph Gerhardus ).txt
(3823, 5) (34504, 5)
data/WhatsApp Chat mit Alexander Götz.txt
(1186, 5) (35690, 5)
data/WhatsApp Chat mit Fabian Fürst.txt
(25829, 5) (61519, 5)
data/WhatsApp Chat mit Martin TPH CERN 3.txt
(5326, 5) (66845, 5)


## Aggregate Messages into Inputs and Targets

In [6]:
remove_list = ['<Medien ausgeschlossen>', '\n', 'Martin TPH CERN <3: ', 'Christoph Gerhardus :): ', 'Megan :): ', 'Standort: ']

for item in remove_list:
    df['message'] = df['message'].apply(lambda x: x.replace(item, ''))

# df['message'] = df['message'][df['message'].astype('str').str[:10] != "Standort: "]

# get rid of links
df['message'] = df['message'].apply(lambda x: re.sub(r'http\S+', '', x))

# get rid of nan messages
df['message'] = df['message'].apply(lambda x : x if type(x) == str else "")

df = df[df['message'] != ""]

In [7]:
def func(x):
    if (type(x) != str):
        print(x)
        
df['message'].apply(lambda x: func(x));

In [8]:
def char_is_emoji(character):
    return character in emoji.UNICODE_EMOJI

def char_is_spe_char(character):
    return character in "?!."

def text_has_emoji(text):
    for character in text:
        if character in emoji.UNICODE_EMOJI:
            return True
    return False

def text_has_spe_char(text):
    for character in text:
        if character in "?!.":
            return True
    return False

In [9]:
def space_emoijis(x):
    if (text_has_emoji(x) or text_has_spe_char(x)):
        result = ''
        if (len(x) == 1):
            return result
        for ch in x:
            if (char_is_emoji(ch)):
                result = result + ' ' + ch
            elif (char_is_spe_char(ch)):
                result = result + ' ' + ch
            else:
                result = result + ch
        return result
    else:
        return x

In [10]:
df['message'] = df['message'].apply(lambda x: space_emoijis(x))

In [11]:
df['sender'].unique()

array(['Lukas', 'Christoph Bernkopf', 'Max Frèremano', 'Megan',
       'Daniel TPH Rugby', 'Mama', 'Daniel TPH Rideclub Marik',
       'Milica Modeschule', 'Max Maxi Frühschütz Fabian',
       'Christoph Gerhardus', 'Alexander Götz', 'Fabian Fürst',
       'Martin TPH CERN 3'], dtype=object)

In [12]:
def get_message(i,j):
    msg = df.iloc[i:j,df.columns.get_loc('message')].values
    assert msg.shape[0] > 0

    if (msg.shape[0] > 1):
        inpu = msg[0]
        for m in msg[1:]:
            inpu = "%s \n %s" % (inpu, m)
    else:
        inpu = msg[0]
        
    return inpu

In [13]:
def get_agg_message(i, name):
    col = df.columns.get_loc('reciever')
    
    for j in range(1000):
        j = i + j
        if (name == df.iloc[j,col]):
            break    
    inpu = get_message(i,j)
        
    for k in range(1000):
        k = j + k
        if (name != df.iloc[k,col]):
            break
    target = get_message(j,k)
    
    return inpu, target, k-1

In [14]:
inp = []
tar = []
sen = []
rec = []

i = 0
while i<(len(df)-10):
# while i<15:
    # get input & target
    col = df.columns.get_loc('sender')
    name = df.iloc[i,col]
    next_names = df.iloc[i+1:i+10,col].values
    if (name != "Christoph Bernkopf" and name in next_names):
        inpu, target, i = get_agg_message(i, name)    
        inp.append(inpu)
        tar.append(target)
        sen.append(name)
        rec.append("Christoph Bernkopf")
    i += 1

In [15]:
dfn = pd.DataFrame(inp, columns=['input'])
dfn['target'] = tar
dfn['sender'] = sen
dfn['reciever'] = rec

## A little bit of filtering

In [16]:
dfn = dfn[dfn['input'].astype('str').str.len()>20]

In [17]:
dfn.head()

Unnamed: 0,input,target,sender,reciever
0,Was gibts ? Kann heut leider nicht,Was machst heute ?,Lukas,Christoph Bernkopf
1,Lernen \n Und bin bei einer maturafeier,Leinwand \n Leiwand \n Wann hast die letzte Pr...,Lukas,Christoph Bernkopf
3,Dreier ! \n Und 10ects ^^,Leiwand ! \n Wurde das so schnell kontrolliert ?,Lukas,Christoph Bernkopf
5,Kann ich mir vorstellen \n Alles gut gelaufen ?,Ja bis jetzt schon einigermaßen \n Aber ich ha...,Lukas,Christoph Bernkopf
9,"Dr . Roberts \n ^^ \n , Gfr \n Lol","Dr . Broberts \n , Gfr 😂 😂 😂 😂 😂",Lukas,Christoph Bernkopf


In [18]:
dfn.to_pickle(os.path.join(data_folder, "preprocessed_data.pkl"))

# Test: classify input by sender

In [44]:
X = dfn['input'].values
y = dfn['sender'].values
le = LabelEncoder()
y = le.fit_transform(y)
# y = y.reshape(-1,1)
le.classes_

array(['Alexander Götz', 'Christoph Gerhardus',
       'Daniel TPH Rideclub Marik', 'Daniel TPH Rugby', 'Fabian Fürst',
       'Lukas', 'Mama', 'Martin TPH CERN 3', 'Max Frèremano',
       'Max Maxi Frühschütz Fabian', 'Megan', 'Milica Modeschule'],
      dtype=object)

In [45]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1)
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size=0.1)

In [46]:
X_train[:5]

array(['Würd jz auch ur viel Zeit kosten',
       'Doch \n Hatte Handy auf Flugmodus',
       'Alright \n Das war ernst gemeint, du bist auf einem Keks gesessen und hat’s Krümel am arsch !',
       'Kein Problem ! ! \n Sie sagen sie sind dir nicht böse weil sie happy sind dass du so brav auf mich aufpasst  😘 \n Btw meine Großmutter aus Frankreich hat dich im Juli ein Wochenende nach Frankreich eingeladen, you should come wenn du zeit hast:) \n (Ihr Fleischhauer ist skydiver und wir wurden schon angekündigt haha)',
       'Verpasster Sprachanruf'], dtype=object)

In [47]:
max_words = 20000
max_len = 200
tok = Tokenizer(num_words=max_words, filters='"#$%&()*+,-./;<=>@[\]^_`{|}~ ')
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
train_sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

sequences = tok.texts_to_sequences(X_val)
val_sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

## Baselines

In [48]:
dummy_classifier = DummyClassifier(strategy="most_frequent")
dummy_classifier.fit(train_sequences_matrix, y_train)
acc_dummy_classifier = dummy_classifier.score(val_sequences_matrix, y_val)
acc_dummy_classifier

0.3615071283095723

In [49]:
u, indices = np.unique(y_val, return_inverse=True)
most_freq_y_val = u[np.argmax(np.bincount(indices))]
le.inverse_transform(most_freq_y_val)

  if diff:


'Fabian Fürst'

In [50]:
tree = DecisionTreeClassifier(random_state=0)
tree.fit(train_sequences_matrix, y_train)
tree_score = tree.score(val_sequences_matrix, y_val)
tree_score

0.2556008146639511

## Classification RNN

https://www.kaggle.com/kredy10/simple-lstm-for-text-classification

In [51]:
y_train = np_utils.to_categorical(y_train)
y_val = np_utils.to_categorical(y_val)
y_test = np_utils.to_categorical(y_test)

In [52]:
class Attention(keras.layers.Layer):

    def __init__(self, key_dim=None, **kwargs):
        self.key_dim = key_dim
        super(Attention, self).__init__(**kwargs)

        
    def build(self, input_shape):
         # Weights initializer function
        w_initializer = keras.initializers.glorot_uniform()

        # Biases initializer function
        b_initializer = keras.initializers.Zeros()
        
        #Matrix to extract the keys
        self.key_extract = self.add_weight(name='feature_extract', 
                                      shape=(int(input_shape[2]),int(self.key_dim)),
                                      initializer=w_initializer,
                                      trainable=True)
        #Key Bias
        self.key_bias = self.add_weight(name='feaure_bias', 
                                      shape=(int(1),int(self.key_dim)),
                                      initializer=b_initializer,
                                      trainable=True)
        
        #The Query representing the class
        self.Query = self.add_weight(name='Query', 
                                      shape=(int(self.key_dim),int(1)),
                                      initializer=w_initializer,
                                      trainable=True)

        super(Attention, self).build(input_shape) 


    def call(self, x):
        #Extract the Keys
        keys=tf.tensordot(x,self.key_extract,axes=[2,0])+self.key_bias
        
        #Calculate the similarity between keys and the Query
        similar_logits=tf.tensordot(keys,self.Query,axes=[2,0])
        
        #Normalize it to be between 0 and 1 and sum to 1
        attention_weights = tf.nn.sigmoid(similar_logits)
        
        #Use these Weights to aggregate
        weighted_input = tf.matmul(x, attention_weights, transpose_a=True)

        return [weighted_input, attention_weights]


    def compute_output_shape(self, input_shape):
        return [(input_shape[0],input_shape[2],int(1)), (input_shape[0],input_shape[1],1)]
    
    
    def get_config(self):
        base_config = super(Attention, self).get_config()
        base_config['key_dim'] = self.key_dim
        return base_config

In [53]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
#     layer = CuDNNLSTM(64, return_sequences=True)(layer)
    layer = LSTM(64, return_sequences=True)(layer)
    layer, attention_weights = Attention(256)(layer)
    layer = Flatten()(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(len(le.classes_),name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [54]:
model = RNN()
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer='adam', # RMSprop(),
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 200)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 200, 50)           1000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 200, 64)           29440     
_________________________________________________________________
attention_3 (Attention)      [(None, 64, 1), (None, 20 16896     
_________________________________________________________________
flatten_3 (Flatten)          (None, 64)                0         
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_5 (Activation)    (None, 256)               0         
__________

In [55]:
model.fit(train_sequences_matrix,
          y_train,
          batch_size=128,
          epochs=10,
          validation_data=(val_sequences_matrix, y_val),
          callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)]
         )

Train on 8831 samples, validate on 982 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.History at 0x1a2c5402e8>

In [56]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [57]:
accr = model.evaluate(test_sequences_matrix,y_test)



In [58]:
print("test accuracy %f" % accr[1])
# not bad

test accuracy 0.611366
