In [9]:
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

import re
import unicodedata
import string
import emoji
import pickle

import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import  TfidfVectorizer 
from sklearn.decomposition import NMF

from collections import Counter

In [10]:
with open('history_clean.pkl', 'rb') as picklefile:
     history_clean = pickle.load(picklefile)

In [12]:
# Group messages by name and date from the same conversation.

def groupby_messages(data,n):
    new_data = data.sort_values(by=['tipo','category','conv_participants','total_conv_msg','total_own_msg','participation','conv_name','name','date1'])
    
    new_data['group'] = new_data.groupby(['tipo','category','conv_participants','total_conv_msg','total_own_msg','participation','conv_name','name','date1']).cumcount()
    new_data['group'] = new_data['group'].apply(lambda x: np.floor(x/float(n)))
    
    new_data = new_data.groupby(['tipo','category','conv_participants','total_conv_msg','total_own_msg','participation','conv_name','name','date1','group'])['msg'].sum().reset_index()                                
    return new_data
    
                              

In [13]:
history_clean = groupby_messages(history_clean,5)

In [14]:
# Delete " ' '" in words 
def elimina_tildes(cadena):
    s = ''.join((c for c in unicodedata.normalize('NFD',cadena) if unicodedata.category(c) != 'Mn'))
    return s


def custom_tokenizer(text):

    # remove punctuation
    remove_punct = str.maketrans('', '', string.punctuation)
    text = text.translate(remove_punct)

    # remove digits and convert to lower case
    remove_digits = str.maketrans('', '', string.digits)
    text = text.lower().translate(remove_digits)
    
    # remove 'tildes'
    text = elimina_tildes(text)
    
    # remove duplicated letters
    text = re.sub(r'([a-z])\1+', r'\1', text)

    # jaja
    text = re.sub(r'(ja)[ja]*', 'ja', text)

    # tokenize
    tokens = word_tokenize(text)

    # remove stop words
    stop_words = stopwords.words('spanish')
    tokens_stop = [y for y in tokens if y not in stop_words]
    
    return tokens_stop

In [15]:
def run_NMF_model(data,max_df,n_components):
    tfidf = TfidfVectorizer(tokenizer=custom_tokenizer,max_df=max_df,min_df = 100) 
    X = tfidf.fit_transform(data)
       
    nmf = NMF(n_components=n_components,random_state=0)
    doc_topics = nmf.fit_transform(X)
    
    # Assigning component number to each document
    t = np.argmax(doc_topics,axis=1)
    
    # Counting number of document in each component
    counts = pd.Series(t).value_counts()
    
    # Getting the top 10 words for each component
    d = nmf.components_
    w = tfidf.get_feature_names()
    words = []
    for r in range(len(d)):
        a = sorted([(v,i) for i,v in enumerate(d[r])],reverse=True)[0:10]
        words.append([w[e[1]] for e in a])
    
    return doc_topics, t, words

In [16]:
# Final model after iterating through different possibilities
doc_topics, t, words = run_NMF_model(history_clean['msg'],0.50,40)

In [17]:
# Manual grouping of topics into 13 categories according to key words.
clas = ['surprise', 'making plans', 'small talk', 'confirmation',
       'bad news', 'thanks', 'good wishes', 'small talk', 'posponed plan',
       'bad news', 'making plans', 'good news', 'asking', 'making plans',
       'making plans', 'posponed plan', 'work', 'making plans', 'work',
       'bad news', 'asking', 'love', 'making plans', 'work',
       'confirmation', 'making plans', 'making plans', 'thinking',
       'good wishes', 'bad news', 'making plans', 'making plans',
       'bad news', 'making plans', 'love', 'bad news', 'making plans',
       'making plans', 'love', 'good news']

In [18]:
classification = pd.DataFrame({"topic":range(40),"classification":clas, 'words':words})

In [20]:
# Final Classification, topic number and top 10 words for each one.
classification

Unnamed: 0,classification,topic,words
0,surprise,0,"[ja, 😂, 😜, posta, sos, viste, tremendo, foto, ..."
1,making plans,1,"[si, queres, obvio, podes, tenes, digo, pasa, ..."
2,small talk,2,"[q, x, dijo, pq, onda, dice, xq, dije, tmb, pe..."
3,confirmation,3,"[dale, beso, queres, salgo, avisame, 😁, hora, ..."
4,bad news,4,"[igual, da, solo, siempre, mejor, paja, entien..."
5,thanks,5,"[gracias, mil, muchas, 😊, ❤❤❤, 😁, beso, genia,..."
6,good wishes,6,"[feliz, cumple, ano, tio, tia, lindo, dani, pa..."
7,small talk,7,"[bien, parece, hola, serio, viene, uy, cae, su..."
8,posponed plan,8,"[bueno, entonces, vemos, mejor, dejo, uh, meno..."
9,bad news,9,"[ah, entendi, entonces, cierto, pense, entiend..."


In [22]:
# Merging results with the original data frame
model1 = history_clean
model1['topic'] = t
model1['sent'] = model1['name'].apply(lambda x: 'sent' if x=='Lai' else 'received')
model1 = model1.merge(classification,on='topic')


In [25]:
# For the visualization, I dropped the messages
model1.drop('msg',axis=1).to_csv('model1_viz.csv')