In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import json
import random
import seaborn as sns
import re

import spacy

# Text analysis
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SpanishStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('tokenizers/punkt/spanish.pickle')
nltk.download('stopwords')

import unicodedata

from wordcloud import WordCloud, STOPWORDS

# ### Scikit-Learn ###
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics as skmetrics

import analysis_utils as aute

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading tokenizers/punkt/spanish.pickle: Package
[nltk_data]     'tokenizers/punkt/spanish.pickle' not found in index
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jose\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
path_to_convs = "D:/GuardarDatosLableniBOT"

all_df_list = []
for subj in os.listdir(path_to_convs):
    if "User" in subj:
        path_to_conv_2 = path_to_convs + "/" + subj + "/" + subj + "_Conversations/"
        for file in os.listdir(path_to_conv_2):
            if "User" in file:
                path_to_subj_conv = path_to_conv_2 + file
                try:
                    csv_name = [file for file in os.listdir(path_to_subj_conv) if ".csv" in file][0]
                    df_x = pd.read_csv(path_to_subj_conv + "/" + csv_name, sep=";")
                    if df_x.shape[0] > 2:
                        all_df_list.append(df_x)
                except:
                    print("Bad subject", path_to_subj_conv + "/" + csv_name)

Bad subject D:/GuardarDatosLableniBOT/User_02_CB/User_02_CB_Conversations/User_02_CB_20221116_123151/Conv_20221116_122202.csv
Bad subject D:/GuardarDatosLableniBOT/User_10_CB/User_10_CB_Conversations/GuideOfTimes_User_10_CB.csv/Conv_20221118_144128.csv
Bad subject D:/GuardarDatosLableniBOT/User_10_CB/User_10_CB_Conversations/GuideOfTimes_User_10_CB.pkl/Conv_20221118_144128.csv
Bad subject D:/GuardarDatosLableniBOT/User_45_CB/User_45_CB_Conversations/User_45_CB_20221215_141244/Conv_20221215_140335.csv
Bad subject D:/GuardarDatosLableniBOT/User_45_CB/User_45_CB_Conversations/User_45_CB_20221215_143002/Conv_20221215_142512.csv
Bad subject D:/GuardarDatosLableniBOT/User_50_CB/User_50_CB_Conversations/User_50_CB_20221219_144734/Conv_20221219_144031.csv


In [5]:
print("Number of conversations", len(all_df_list))

Number of conversations 312


In [37]:
result_process_data_list = []
for data in all_df_list:
    
    personality, gender, _ = data["ConfigName"].iloc[0].split("_")
    
    gender = "Female" if gender == "Woman" else gender
    gender = "Male" if gender == "Man" else gender
    
    human_time_talk = (data["PersonTalkEndUnix"] - data["PersonTalkStartUnix"]).dropna().values
    bot_time_talk = (data["BotTalkEndUnix"] - data["BotTalkStartUnix"]).dropna().values
    conv_time = (data["UnixTimeSave"].iloc[-1] - data["UnixTimestampLoopInit"].iloc[0])/60
    max_num_pair_sentence = np.max([
        label for label in data["ConversationSentenceId"].unique() if np.sum(data["ConversationSentenceId"] == label) == 2
    ])
    result_process_data_list.append({
        "SubjectId": data["SubjectId"].iloc[0],
        "ChatbotName": data["ChatbotName"].iloc[0],
        "Gender": gender,
        "Personality": personality,
        "UnixLoopStart": data["UnixTimestampLoopInit"].iloc[0],
        "MeanHumanTime_s": np.mean(human_time_talk),
        "StdHumanTime_s": np.std(human_time_talk),
        "MaxHumanTime_s": np.max(human_time_talk),
        "MeanBotTime_s": np.mean(bot_time_talk),
        "StdBotTime_s": np.std(bot_time_talk),
        "MaxBotTime_s": np.max(bot_time_talk),
        "ConvTime_min": conv_time,
        "NumberOfSentences": data.shape[0],
        "MaxNumQA": max_num_pair_sentence
    })
    
df_res_process = pd.DataFrame(result_process_data_list)

In [39]:
print(df_res_process.columns)
print("Shape", df_res_process.shape)
df_res_process.head(10)

Index(['SubjectId', 'ChatbotName', 'Gender', 'Personality', 'UnixLoopStart',
       'MeanHumanTime_s', 'StdHumanTime_s', 'MaxHumanTime_s', 'MeanBotTime_s',
       'StdBotTime_s', 'MaxBotTime_s', 'ConvTime_min', 'NumberOfSentences',
       'MaxNumQA'],
      dtype='object')
Shape (312, 14)


Unnamed: 0,SubjectId,ChatbotName,Gender,Personality,UnixLoopStart,MeanHumanTime_s,StdHumanTime_s,MaxHumanTime_s,MeanBotTime_s,StdBotTime_s,MaxBotTime_s,ConvTime_min,NumberOfSentences,MaxNumQA
0,User_01_CB,Laura,Female,Neutral,1668588000.0,10.987555,5.116723,18.949747,9.134379,4.181297,17.346042,2.721243,13,5
1,User_01_CB,David,Male,Neutral,1668589000.0,7.830935,1.58407,9.427005,7.3074,2.099484,10.81857,1.462437,9,3
2,User_01_CB,María,Female,Happy,1668589000.0,9.411009,3.463359,15.750861,10.265537,4.970026,17.750785,2.05595,10,4
3,User_01_CB,Pablo,Male,Relax,1668589000.0,9.390276,3.389688,15.737232,9.74434,3.805364,18.376184,3.045179,15,6
4,User_01_CB,Sara,Female,Sad,1668590000.0,7.026681,1.38041,9.435585,5.3611,1.693569,7.381239,2.312869,17,7
5,User_01_CB,Jorge,Male,Angry,1668590000.0,8.156119,2.54003,12.621134,8.949473,5.886838,17.846622,2.034079,11,4
6,User_02_CB,Alejandro,Male,Neutral,1668596000.0,10.650504,7.062532,30.71179,5.655437,1.36632,7.744314,4.15695,25,11
7,User_02_CB,Laura,Female,Neutral,1668597000.0,6.619163,1.038502,9.366427,6.100837,1.254352,8.661257,2.305496,17,7
8,User_02_CB,Javier,Male,Happy,1668597000.0,6.967394,1.330694,9.403758,4.812545,1.607716,7.006487,3.456228,27,12
9,User_02_CB,Lucía,Female,Relax,1668598000.0,7.526045,1.899582,12.557345,3.922697,1.132431,7.405552,4.302142,35,16


In [44]:
df_res_process["SubjectId"] = df_res_process["SubjectId"].replace("Alejandro", "User_10_CB")

In [46]:
df_res_process["SubjectId"].unique()

array(['User_01_CB', 'User_02_CB', 'User_03_CB', 'User_04_CB',
       'User_05_CB', 'User_06_CB', 'User_07_CB', 'User_08_CB',
       'User_09_CB', 'User_10_CB', 'User_11_CB', 'User_12_CB',
       'User_13_CB', 'User_14_CB', 'User_15_CB', 'User_16_CB',
       'User_17_CB', 'User_18_CB', 'User_19_CB', 'User_20_CB',
       'User_21_CB', 'User_22_CB', 'User_23_CB', 'User_24_CB',
       'User_25_CB', 'User_26_CB', 'User_27_CB', 'User_28_CB',
       'User_29_CB', 'User_30_CB', 'User_31_CB', 'User_32_CB',
       'User_33_CB', 'User_34_CB', 'User_35_CB', 'User_36_CB',
       'User_37_CB', 'User_38_CB', 'User_39_CB', 'User_40_CB',
       'User_41_CB', 'User_42_CB', 'User_43_CB', 'User_44_CB',
       'User_45_CB', 'User_46_CB', 'User_47_CB', 'User_48_CB',
       'User_49_CB', 'User_50_CB', 'User_51_CB', 'User_52_CB'],
      dtype=object)

In [47]:
df_res_process.to_csv("Results/DataConvProcessed.csv", sep=";", index=False)