In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

import torch

from transformers import pipeline
from transformers import BartTokenizer, BartForConditionalGeneration

import scipy.stats as stats

from scipy.spatial.distance import pdist, squareform
from sklearn import linear_model
from sklearn.metrics import r2_score

import unicodedata

# Text analysis
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SpanishStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('tokenizers/punkt/spanish.pickle')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading tokenizers/punkt/spanish.pickle: Package
[nltk_data]     'tokenizers/punkt/spanish.pickle' not found in index
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jose\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
tokenize_spanish = nltk.data.load('tokenizers/punkt/spanish.pickle')
spanish_stemmer = SpanishStemmer()
spanish_stopwords_th = stopwords.words('spanish')

In [3]:
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def clean_text(text, subject_name, 
               tokenize_spanish=tokenize_spanish, 
               spanish_stopwords_th=spanish_stopwords_th, 
               spanish_stemmer=spanish_stemmer, 
               use_stemmer=True):
    
    text = text.lower()
    text = text.replace('?', "").replace('¿', "").replace('!', "").replace('¡', "")
    if subject_name:
        text = text.replace(subject_name, "")
    text = strip_accents(text)
    
    text_token_list = []
    if use_stemmer:
        for word in text.split():
            token_text = tokenize_spanish.tokenize(word)
            if len(token_text) > 0 and word not in spanish_stopwords_th:
                text_token_list.append(spanish_stemmer.stem(token_text[0]))
    else:
        text_token_list.append(text)
            
    text = " ".join(text_token_list)
    
    return text

In [4]:
all_df_list = []
for conv in os.listdir("../Conversations/"):
    excel_name = [file for file in os.listdir("../Conversations/" + conv) if ".xlsx" in file][0]
    df_x = pd.read_excel("../Conversations/" + conv + "/" + excel_name)
    if df_x.shape[0] > 2:
        all_df_list.append(df_x)
            
print("Number of conversations:", len(all_df_list))

Number of conversations: 13


In [5]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [27]:
keep_last_mssg = 3
df = all_df_list[0]
df_cut = df.iloc[:(df.shape[0]-keep_last_mssg)]
df_small = df.iloc[(df.shape[0]-keep_last_mssg):]

all_text_paired = list(zip(df_cut["Source"].values, df_cut["EnglishMessage"].values))
text_list = [": ".join(text) for text in all_text_paired]
whole_text = " ".join(text_list)
num_words_conv = len(whole_text.split())
print("Num words", num_words_conv)

if len(whole_text.split()) > 50:
    answer = summarizer(whole_text, max_length=100, min_length=10)
    whole_answer = answer[0]["summary_text"]

all_text_paired = list(zip(df_small["Source"].values, df_small["EnglishMessage"].values))
text_list = [": ".join(text) for text in all_text_paired]
whole_text_small = " ".join(text_list)

print(whole_answer + " " + whole_text_small)

Num words 244
Susan is a college graduate and works at a grocery store as a cashier. She tells her friend that she is a physicist and that she likes her job. Bot:  Oh, that's cool. What kind of data do you work with? How do you like it? Person: Well, right now I'm working with the conversation you're giving me Bot:  Oh really? What is it about? What is going on in your life??
