In [1]:
import time
from datetime import datetime
import numpy as np
import pandas as pd
from pickle import dump, load
import joblib
import string
from nltk.corpus import stopwords
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer

#### Load Data

In [2]:
# Load the tab-separated prompt/reply pairs into a DataFrame.
# The path uses a forward slash: the previous "dataset\processed_df_V2.csv" only resolved
# on Windows and depended on "\p" being an unrecognised escape sequence, which newer
# Python versions warn about. Forward slashes work on Windows, Linux and macOS alike.
df = pd.read_csv("dataset/processed_df_V2.csv", sep='\t')
display(df)

Unnamed: 0,sentence_1,sentence_2
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
103726,I saw a huge cockroach outside my house today....,I live in Texas to so i know those feels
103727,"I have a big test on Monday. I am so nervous, ...","I have a big test on Monday, I am so nervous."
103728,"I have a big test on Monday. I am so nervous, ...",What is the test on?
103729,"I have a big test on Monday. I am so nervous, ...",It's for my Chemistry class. I haven't slept m...


#### Create the pipeline

In [3]:
def cleaner(x):
    """Tokenise a sentence for the bag-of-words vectoriser.

    Every character that appears in ``string.punctuation`` is dropped, the
    remaining text is lower-cased, and the result is split on whitespace.

    Args:
        x: The raw sentence (iterated character by character).

    Returns:
        A list of lower-case tokens; empty if nothing but punctuation or
        whitespace remains.
    """
    kept_chars = []
    for ch in x:
        if ch not in string.punctuation:
            kept_chars.append(ch)
    stripped = ''.join(kept_chars)
    return stripped.lower().split()

In [4]:
# Display the DataFrame again (the same frame that was loaded from the CSV above).
df

Unnamed: 0,sentence_1,sentence_2
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
103726,I saw a huge cockroach outside my house today....,I live in Texas to so i know those feels
103727,"I have a big test on Monday. I am so nervous, ...","I have a big test on Monday, I am so nervous."
103728,"I have a big test on Monday. I am so nervous, ...",What is the test on?
103729,"I have a big test on Monday. I am so nervous, ...",It's for my Chemistry class. I haven't slept m...


DecisionTree - Chatbot

In [5]:
# Decision-tree chatbot: token counts produced by `cleaner` are re-weighted with TF-IDF
# and passed to a decision tree that picks a reply string as the class label.
pipeline = Pipeline(
    steps=[
        ('bow', CountVectorizer(analyzer=cleaner)),
        ('tfidf', TfidfTransformer()),
        ('classifier', DecisionTreeClassifier()),
    ]
)

In [None]:
# Train the decision-tree pipeline and report how long the fit took.
# `time.perf_counter` is *called* here: the earlier `start = time.time` stored the
# function object itself, so the subtraction after the (long) fit raised a TypeError.
start = time.perf_counter()
pipeline.fit(df['sentence_1'], df['sentence_2'])
end = time.perf_counter() - start  # elapsed wall-clock seconds (name kept from the original cell)
# Render the elapsed seconds as HH:MM:SS; like the original formatting this wraps past 24 h.
print("Training time: {:s}".format(time.strftime('%H:%M:%S', time.gmtime(end))))

In [7]:
# Probe the fitted decision-tree pipeline with a greeting; the input is a one-element
# list, so [0] extracts the single predicted reply.
pipeline.predict(['hi'])[0]

'Hi human, please tell me your AVTI user'

In [9]:
# Probe the decision tree with a question to see which training reply it selects.
pipeline.predict(['Are you human?'])[0]

"i don't think so."

Save chatbot-model

In [11]:
# dump(pipeline, open('models/DT_chatbot.pkl', 'wb'))

Load chatbot-model

In [4]:
# Reload the pickled decision-tree chatbot. The file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes (the bare `open(...)` leaked it).
# The decision-tree pipeline in this notebook uses `cleaner` as its analyzer and pickle
# stores functions by reference, so `cleaner` must be defined in this session first.
# NOTE(review): pickle.load can execute arbitrary code - only load model files you trust.
with open('models/DT_chatbot.pkl', 'rb') as model_file:
    model = load(model_file)

In [11]:
# Sanity-check the reloaded model with a sample greeting.
model.predict(['how are you?'])[0]

'Hello, I am great, how are you? Please tell me your AVTI user'

#### Try another algorithm such as Multinomial Naive Bayes

In [31]:
# Multinomial Naive Bayes variant: same `cleaner`-based bag-of-words and TF-IDF front end
# as the decision-tree pipeline, with only the final classifier swapped.
pipeline_MNB = Pipeline(
    steps=[
        ('bow', CountVectorizer(analyzer=cleaner)),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
)

In [32]:
# Fit on sentence_1 -> sentence_2: the reply strings in sentence_2 are the class labels.
pipeline_MNB.fit(df['sentence_1'], df['sentence_2'])

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function cleaner at 0x0000022F9C94A940>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [33]:
# Probe the Naive Bayes pipeline with a question.
# NOTE(review): all four probes in this section return the same reply, so this model
# appears to favour a single class regardless of the input.
pipeline_MNB.predict(['Are you human?'])[0]

'what do you mean?'

In [34]:
# Single-word probe.
pipeline_MNB.predict(['die'])[0]

'what do you mean?'

In [35]:
# Greeting probe, close to the first training prompt shown in the DataFrame preview.
pipeline_MNB.predict(['hi, how are you?'])[0]

'what do you mean?'

In [36]:
# Hostile-input probe.
pipeline_MNB.predict(['fuck you'])[0]

'what do you mean?'

#### Try another algorithm such as RandomForest

In [16]:
# Random-forest variant: same `cleaner`-based bag-of-words and TF-IDF front end,
# followed by a forest of 150 trees.
pipeline_RM = Pipeline(
    steps=[
        ('bow', CountVectorizer(analyzer=cleaner)),
        ('tfidf', TfidfTransformer()),
        ('classifier', RandomForestClassifier(n_estimators=150)),
    ]
)

In [17]:
# Train the random forest on the same prompt/reply columns used for the other two models.
pipeline_RM.fit(df['sentence_1'], df['sentence_2'])

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function cleaner at 0x000001DFF1C8B8B0>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', RandomForestClassifier(n_estimators=150))])

In [18]:
# Same question that was put to the decision-tree and Naive Bayes pipelines, for comparison.
pipeline_RM.predict(['Are you human?'])[0]

'somebody should check his birth record.'

In [19]:
# Free-form statement probe.
pipeline_RM.predict(["i'm scared about machines"])[0]

'about finding a job for me.'

In [20]:
# Hostile-input probe (same text that was sent to the Naive Bayes pipeline).
pipeline_RM.predict(["fuck you"])[0]

'How rude'

In [21]:
# Direct request for a joke.
pipeline_RM.predict(["tell me a joke"])[0]

"'Doc, I can't stop singing 'The Green, Green Grass of Home.'' 'That sounds like Tom Jones Syndrome.' 'Is it common?' 'It's Not Unusual.'"

In [22]:
# Rephrased joke request, to see how sensitive the reply is to wording.
pipeline_RM.predict(["do you know any joke?"])[0]

"i can't remember jokes."

In [23]:
# NOTE(review): "laught" looks like a typo for "laugh"; it is left unchanged because it is
# the exact input that produced the recorded output below.
pipeline_RM.predict(["Make me laught"])[0]

'what time does it start?'

In [24]:
# Weather question probe.
pipeline_RM.predict(["What's the weather?"])[0]

'it was hot and sunny every day.'

In [25]:
# Small-talk question probe.
pipeline_RM.predict(["Do you like watching films?"])[0]

"i'm an open book. watch me all you want."

Save model

In [26]:
# dump(pipeline_RM, open('models/RF_chatbot.pkl', 'wb'))

The trained model is about 34 GB, so it needs to be saved with the joblib library instead of plain pickle.

In [27]:
# Persist the fitted random-forest pipeline with joblib; the call returns the list of
# file names it wrote.
# NOTE(review): given the model size noted above, consider joblib.dump(..., compress=...)
# to shrink the file - confirm the save/load time trade-off first.
joblib.dump(pipeline_RM, 'models/RF_chatbot.pkl')

['models/RF_chatbot.pkl']