In [None]:
import nltk
import json
import os
import os.path
import shutil
import pandas
from joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [None]:
# Remember the directory the notebook was started from so that later cells
# can os.chdir() back to it.
owd = os.getcwd()

# Ingest JSON Data

In [None]:
# Load every two-participant conversation export and combine them into one
# DataFrame, tagging each message with the other participant's name.
#
# Files are addressed by full path instead of os.chdir(), so this cell does
# not change the process working directory.
JSON_DIR = "/data/live/json"
CSV_DIR = "/data/live/csv"

# Store filenames of all regular files in the directory. Sorted because
# os.listdir() returns entries in arbitrary order, which would otherwise make
# the row order of the combined table differ between runs.
files = sorted(
    entry for entry in os.listdir(JSON_DIR)
    if os.path.isfile(os.path.join(JSON_DIR, entry))
)

conversations = []
for file_name in files:
    # Explicit encoding: JSON is UTF-8, and the platform default may not be.
    with open(os.path.join(JSON_DIR, file_name), "r", encoding="utf-8") as read_file:
        data = json.load(read_file)

    participants = data['participants']
    if len(participants) != 2:
        # Only one-to-one conversations are analysed.
        continue

    # The contact is whichever participant is not the notebook owner.
    # `or ''` keeps the substring test from raising when a name is missing.
    person_order = 0
    if "Ethan L" in (participants[person_order].get('name') or ''):
        person_order = 1
    contact = participants[person_order].get('name')

    conversation = pandas.DataFrame(data['messages'])
    conversation['Contact'] = contact
    conversations.append(conversation)

# pandas.concat raises ValueError if no two-participant conversation was found.
messages = pandas.concat(conversations, ignore_index=True)
messages["datetime"] = pandas.to_datetime(messages.timestamp_ms, unit='ms')

messages.to_csv(os.path.join(CSV_DIR, "messages.csv"))

# Train the model

In [None]:
# Train a post-class classifier on the NLTK NPS Chat corpus: TF-IDF n-gram
# features fed into a gradient-boosted tree ensemble.
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()

posts_text = [post.text for post in posts]
y = [post.get('class') for post in posts]

# Divide train and test 80/20 using ONE split point, so the two sets are
# disjoint. Slicing the test set from the 20% mark would make it overlap the
# training data and inflate every score in the report below.
split_index = int(len(posts_text) * 0.8)
train_text = posts_text[:split_index]
test_text = posts_text[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

# Get TF-IDF features. The vocabulary is fitted on the training text only;
# the test text is merely transformed with it.
vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             min_df=0.001,
                             max_df=0.7,
                             analyzer='word')

X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# Fitting Gradient Boosting classifier to the training set.
# Can be improved with cross-validation.
gb = GradientBoostingClassifier(n_estimators=400, random_state=0)
gb.fit(X_train, y_train)

predictions_rf = gb.predict(X_test)
print(classification_report(y_test, predictions_rf))

# Return to the notebook's starting directory so later relative paths resolve
# there. Saving the model is currently disabled; uncomment to persist it.
os.chdir(owd)
# dump(gb, 'gradient_boosted_post_class.joblib')

In [None]:
# Predict the message type of every ingested message with the trained model.
# Any entry whose content is a float (presumably NaN for messages that carry
# no text -- confirm against the export) is replaced by a fixed placeholder
# string before being passed to the vectorizer.
raw_contents = messages['content'].to_list()
clean_messages = [
    content if type(content) is not float else 'a statement'
    for content in raw_contents
]
vectorized_messages = vectorizer.transform(clean_messages)
gb_text_predictions = gb.predict(vectorized_messages)

In [None]:
# Attach the predicted type to each message, order the table chronologically
# with a fresh 0..n-1 index, and write it to messages.csv in the current
# working directory.
messages = (
    messages
    .assign(type_prediction=gb_text_predictions)
    .sort_values(by=['timestamp_ms'], ignore_index=True)
)
messages.to_csv('messages.csv')

In [None]:
# For every message predicted to be a question that was NOT sent by the
# notebook owner, measure how long it took the owner to send their next
# message in the same conversation. Times are differences of timestamp_ms
# values, i.e. milliseconds. Entries stay None for non-questions, for the
# owner's own messages, and for questions that never received a reply.

# Row-wise "sent by the owner" mask, computed once. Testing
# `'Ethan L' in messages['sender_name']` would check the Series *index labels*
# rather than the sender of the current row, so each row's own value is used.
# na=False treats a missing sender as "not the owner".
sent_by_owner = messages['sender_name'].str.contains('Ethan L', regex=False, na=False)

response_time_list = [None] * len(messages)
# Positions (not index labels) address response_time_list, so the result
# lines up with the rows whatever the DataFrame index looks like.
for position, (_, row) in enumerate(messages.iterrows()):
    if 'Question' not in row['type_prediction'] or sent_by_owner.iloc[position]:
        continue

    question_time = row['timestamp_ms']
    convo_contact = row['Contact']
    reply_times = messages.loc[
        sent_by_owner
        & (messages['timestamp_ms'] > question_time)
        & (messages['Contact'] == convo_contact),
        'timestamp_ms',
    ]
    if not reply_times.empty:
        # .min() selects the earliest reply without relying on row order.
        response_time_list[position] = reply_times.min() - question_time
response_time_list

In [None]:
%matplotlib inline
cleaned_response_time_list = [i for i in response_time_list if (i is not None and i < 200000)]
plt.hist(cleaned_response_time_list,bins=40)