# Unsupervised clustering of friends

In [2]:
%matplotlib inline

In [168]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import glob
import re
from functools import partial

import transliterate
from transliterate.contrib.languages.bg.translit_language_pack import BulgarianLanguagePack
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bulstem import stem
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

In [4]:
ME = "–ü–∞–≤–µ–ª –ë–æ–≥–¥–∞–Ω–æ–≤"

### Reading data

In [5]:
def json_to_df(filepath, friend_name=True):
    """
    Reads facebook messenger's JSON file and returns a pandas Dataframe.
    
    Doesn't return a dataframe if the participants are more
    than two people(no group chats).
    
    Works only on the "Messages" json files downloaded through
    Facebook's "Download Your Information" section.
    
    Parameters
    ----------
    filepath : string
        Filepath to the JSON file.
    friend_name : boolean, default True
        If True, adds an aditional column "friend_name" to the df.
    Returns
    -------
    result : Dataframe        
    """
    # Fixes bad encoding
    fix_mojibake_escapes = partial(re.compile(rb'\\u00([\da-f]{2})').sub, lambda m: bytes.fromhex(m.group(1).decode()))
    
    # Need to read as binary to decode correctly
    with open(filepath, 'rb') as file:    
        repaired = fix_mojibake_escapes(file.read())
        data = json.loads(repaired.decode('utf8'), strict=False)
        
        # No group chats!
        if len(data['participants']) == 2:
            result = pd.DataFrame.from_dict(data['messages'])
            
            # Additional column
            if friend_name:
                participants = pd.Series(data['participants']).apply(pd.Series)
                for name in participants.name:
                    if not name == ME:
                        result['friend_name'] = name
            return result

In [6]:
all_files = glob.glob("messages/inbox/*/message_1.json")
data = pd.concat((json_to_df(filename) for filename in all_files), ignore_index=True, sort=False)

### Processing data

In [7]:
data.timestamp_ms = pd.to_datetime(data.timestamp_ms, unit='ms')
data = data.sort_values('timestamp_ms')
data = data.drop_duplicates('timestamp_ms')
data = data.set_index('timestamp_ms', verify_integrity=True)
data.index.names = ['timestamp']
data.tail()

Unnamed: 0_level_0,content,sender_name,share,type,friend_name,photos,sticker,audio_files,gifs,reactions,videos,files,call_duration,missed,users,plan
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-07-22 15:09:15.928,"–¥–æ–±—Ä–µ, —á–∞–∫–∞–π —Å–µ–≥–∞ –Ω–µ–¥–µ–π –¥–∞ –ø–∏—à–µ—à –∑–∞ –∏–∑–≤–µ—Å—Ç–Ω–æ –≤...",–ü–∞–≤–µ–ª –ë–æ–≥–¥–∞–Ω–æ–≤,,Generic,Daniel Petrov,,,,,,,,,,,
2019-07-22 15:09:31.911,some English text 123 wohoo yeaaah !!! xD,–ü–∞–≤–µ–ª –ë–æ–≥–¥–∞–Ω–æ–≤,,Generic,Daniel Petrov,,,,,,,,,,,
2019-07-22 15:09:36.887,,Daniel Petrov,,Generic,Daniel Petrov,,{'uri': 'messages/stickers_used/39178562_15051...,,,,,,,,,
2019-07-22 15:09:55.550,sh1okavica bum bam nqkvi dumi,–ü–∞–≤–µ–ª –ë–æ–≥–¥–∞–Ω–æ–≤,,Generic,Daniel Petrov,,,,,,,,,,,
2019-07-22 15:10:12.635,–î–∞–Ω—á–æ –µ —Å—É–ø–µ—Ä –ø–∏—á üòé,–ü–∞–≤–µ–ª –ë–æ–≥–¥–∞–Ω–æ–≤,,Generic,Daniel Petrov,,,,,,,,,,,


In [8]:
mess = data[(data.type=='Generic') | (data.type=='Share')][['sender_name', 'content', 'friend_name', 'type', 'share', 'reactions']]
mess.content.dropna(inplace=True)
mess.sender_name = mess.sender_name.astype('category')
mess.friend_name = mess.friend_name.astype('category')
mess.type = mess.type.astype('category')
mess.content = mess.content.str.lower()
mess.content = mess.content.replace(np.nan, '0')
mess = mess[mess.content!='0']

In [13]:
mess.tail()

Unnamed: 0_level_0,sender_name,content,friend_name,type,share,reactions
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-07-22 15:08:48.294,Daniel Petrov,–∞–∫–æ –º–Ω –≥–æ –∑–∞–∫—ä—Å–∞—à —Å —Å—ä–±–∏—Ä–∞–Ω–µ—Ç–æ –Ω–∞ —Å—ä–æ–±—â–µ–Ω–∏—è—Ç–∞ ...,Daniel Petrov,Generic,,
2019-07-22 15:09:15.928,–ü–∞–≤–µ–ª –ë–æ–≥–¥–∞–Ω–æ–≤,"–¥–æ–±—Ä–µ, —á–∞–∫–∞–π —Å–µ–≥–∞ –Ω–µ–¥–µ–π –¥–∞ –ø–∏—à–µ—à –∑–∞ –∏–∑–≤–µ—Å—Ç–Ω–æ –≤...",Daniel Petrov,Generic,,
2019-07-22 15:09:31.911,–ü–∞–≤–µ–ª –ë–æ–≥–¥–∞–Ω–æ–≤,some english text 123 wohoo yeaaah !!! xd,Daniel Petrov,Generic,,
2019-07-22 15:09:55.550,–ü–∞–≤–µ–ª –ë–æ–≥–¥–∞–Ω–æ–≤,sh1okavica bum bam nqkvi dumi,Daniel Petrov,Generic,,
2019-07-22 15:10:12.635,–ü–∞–≤–µ–ª –ë–æ–≥–¥–∞–Ω–æ–≤,–¥–∞–Ω—á–æ –µ —Å—É–ø–µ—Ä –ø–∏—á üòé,Daniel Petrov,Generic,,


### Top friends

In [161]:
n = 15
space_mess = mess.copy()
space_mess.content = space_mess.content.str.split()
space_mess.content.dropna(inplace=True)
space_mess['wordcount_space'] = space_mess.content.apply(len)
best_friends = space_mess.groupby('sender_name').wordcount_space.sum().nlargest(len(list(space_mess.sender_name.unique())))
best_space_mess = space_mess[space_mess.friend_name.isin(list(best_friends[1:n+1].index))]
best_space_mess.reset_index(inplace=True)

### Transliteration 

In [149]:
BulgarianLanguagePack.mapping = ('abwvgdejziyklmnoprstufhc461q', '–∞–±–≤–≤–≥–¥–µ–∂–∑–∏–π–∫–ª–º–Ω–æ–ø—Ä—Å—Ç—É—Ñ—Ö—Ü—á—à—ä—è')
"""
my additions:
    "1": "—ä",
    "6": "—à",
    "4": "—á",
    "q": "—è",
    "w": "–≤",
    "j": "–∂",
    "c": "—Ü",
    
removed all the uppercase characters for improved performance
ABVWGDEZIYKLMNOPRSTUFHCQ:–ê–ë–í–í–ì–î–ï–ó–ò–ô–ö–õ–ú–ù–û–ü–†–°–¢–£–§–•–¶–Ø    
"""


BulgarianLanguagePack.reversed_specific_mapping = ('—å—ä', 'y1')


BulgarianLanguagePack.pre_processor_mapping = {
    'zh': '–∂',
    'ts': '—Ü',
    'ch': '—á',
    'sh': '—à',
    '—àt': '—â', # fixed
    'yu': '—é',
    'ya': '—è',
    
    # my additions:
    "6t": "—â",
    "1o": "—å–æ",    
    "1i": "—ä–π",
    "ai": "–∞–π",
    # "–π–æ" only after vowel
    # and beginning of word
    "yo": "–π–æ",
    "b–π": "–±—å",
    "v–π": "–≤—å",
    "w–π": "–≤—å",
    "g–π": "–≥—å",
    "d–π": "–¥—å",
    "j–π": "–∂—å",
    "–∂–π": "–∂—å",
    "z–π": "–∑—å",
    "k–π": "–∫—å",
    "l–π": "–ª—å",
    "m–π": "–º—å",
    "n–π": "–Ω—å",
    "p–π": "–ø—å",
    "r–π": "—Ä—å",
    "s–π": "—Å—å",
    "t–π": "—Ç—å",
    "f–π": "—Ñ—å",
    "c–π": "—Ü—å",
    "—á–π": "—á—å",
    "—à–π": "—à—å",
    "—â–π": "—â—å"}
"""
    removed:
    'Zh': '–ñ',
    'Ts': '–¶',
    'Ch': '–ß',
    'Sh': '–®',
    '–®t': '–©', # fixed
    'Yu': '–Æ',
    'Ya': '–Ø',
"""

# Instead of callng the language code each time
translit_bg = transliterate.get_translit_function('bg')

### Wordlists

In [152]:
en_wordlist = pd.read_csv('en_words.csv', header=None, names=['word'])
bg_wordlist = pd.read_csv('bg_full.txt', header=None, delim_whitespace=True, names=['word', 'frequency'])
bg_wordlist = bg_wordlist.dropna()
bg_wordlist = bg_wordlist[bg_wordlist.frequency > 2]
bg_wordlist = bg_wordlist[bg_wordlist.word.str.contains(pat=r'[a-zA-Z]+')==False].reset_index(drop=True)

### Stop words

In [150]:
bg_stopwords = pd.read_csv('https://raw.githubusercontent.com/Alir3z4/stop-words/master/bulgarian.txt', header=None, names=['word']).word
en_stopwords = pd.Series(stopwords.words('english'))
combined_stopwords = pd.concat([bg_stopwords, en_stopwords])

### Word split

In [162]:
def word_split(dataframe):
    rows = list()
    for row in dataframe[['sender_name', 'content', 'timestamp']].iterrows():
        r = row[1]
        for word in r.content:
            rows.append((r.sender_name, word, r.timestamp))

    return pd.DataFrame(rows, columns=['sender_name', 'word', 'timestamp'])

In [178]:
words_by_space = word_split(best_space_mess)

### Extracting valid words

In [179]:
words_by_space = words_by_space[words_by_space.word.apply(len) >= 3]
words_by_space.sender_name = words_by_space.sender_name.astype('category')
words_by_space.word = words_by_space.word.astype('category')

In [180]:
bg_words = words_by_space[(words_by_space.word.isin(bg_wordlist.word))
                              & (words_by_space.word.isin(en_wordlist.word)==False)]

en_words = words_by_space[(words_by_space.word.isin(en_wordlist.word)) 
                              & (words_by_space.word.isin(bg_wordlist.word)==False)]

recycled_words = words_by_space[(words_by_space.word.isin(bg_wordlist.word)==False) 
                                    & (words_by_space.word.isin(en_wordlist.word)==False)]

In [181]:
recycled_words = recycled_words.copy()
recycled_words.word = recycled_words.word.apply(translit_bg)
recycled_words = recycled_words[recycled_words.word.isin(bg_wordlist.word)]
bg_words = bg_words[bg_words.word.isin(bg_stopwords)==False]
en_words = en_words[en_words.word.isin(en_stopwords)==False]
recycled_words = recycled_words[recycled_words.word.isin(bg_stopwords)==False]
bg_words.word = bg_words.word.apply(stem)
en_words.word = en_words.word.apply(PorterStemmer().stem)
recycled_words.word = recycled_words.word.apply(stem)

In [182]:
processed_words = pd.concat(objs=[bg_words, en_words, recycled_words])

In [192]:
processed_mess = processed_words.groupby(["sender_name"]).word.agg(lambda x: ' '.join(x))

In [193]:
processed_mess = pd.DataFrame(processed_mess)
processed_mess = processed_mess.reset_index(level=0).set_index('sender_name')
processed_mess.columns = ['content']

### Model

In [213]:
vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.4, stop_words=combined_stopwords.tolist())
X = vectorizer.fit_transform(raw_documents=processed_mess.content)
X_norm = normalize(X)

In [214]:
true_k = n + 1
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000, n_init=10, n_jobs=8)
model.fit(X_norm)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1000,
    n_clusters=16, n_init=10, n_jobs=8, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [215]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

In [220]:
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :20]:
        print(' %s' % terms[ind])

Cluster 0:
 –¥–æ–∫—Ç–æ—Ä
 –Ω–µ–∑–Ω–∞
 —â–æ—Ç
 –∞–¥–∏
 –º–∞–ª–µ
 –¥–µ—Ç
 –ø—Ä–∏–Ω—Ü–∏–ø
 –¥–∞—Å–∫–∞–ª
 —Ç–æ–µ—Å—Ç
 –π–µ—Å
 –¥–æ–æ–±
 —Å–º—è—Ç–∞
 –º–¥–∞–∞
 —Å–∏–≥
 –µ–µ–µ
 –º–∞–º–∫
 —Ç–∞–º—ä–Ω
 –∫–∏—Ç–∞—Ä
 –µ–ø–∏–∑–æ–¥
 –≤—ä–æ–±—â–µ
Cluster 1:
 like
 know
 –ø—Ä–∏–Ω—Ü–∏–ø
 —Ç–æ–∑
 hate
 –Ω—è–∫–∞–∫–≤
 –æ–±–∏—á–∞–º
 fuck
 —Ç–∏—è
 love
 shit
 one
 kind
 want
 say
 –±–∞—Å–∏
 –±—É—Ä–≥–∞—Å
 –¥–æ–±
 –±—Ç–≤
 –º—Ä—ä–Ω–∫–∞
Cluster 2:
 –¥–∞–µ
 —Ç—Ä—è
 —Ç–µ–π
 —Ç–∏—è
 –¥–∞–Ω–Ω
 –Ω—è–∫—ä–≤
 ama
 –∞–º–∏–∏
 dae
 –ø–æ–ª–∑–≤–∞
 —Å—Ö–µ–º
 —Å–∫–∏–≤
 —á–µ–∫
 sum
 –∫–≤–∏
 —É—Å–ª–æ–≤–∏
 —Ü–∏–∫—ä–ª
 –∫–æ–¥–∞
 che
 –ª–∞–π–Ω
Cluster 3:
 ama
 sum
 ako
 –ø—É–∫
 ima
 —Ç—Ä—è–∞
 che
 –Ω–µ–∑–Ω–∞
 taka
 aha
 —â–æ
 ami
 tam
 imam
 —Ç–µ–±–µ
 sled
 –¥–µ—Å–µ
 —â–æ—Ç
 aid
 –∫—É–¥–µ
Cluster 4:
 well
 like
 –¥–∞–µ
 –ª–æ–ª
 —â–æ—Ç
 –∏–¥–∫
 know
 —Ç—Ä—è–∞
 yeah
 —Ç–µ–±–µ
 –ø—Ä–∏–Ω—Ü–∏–ø
 get
 –Ω–∏–π
 –º–µ–Ω–µ
 —Ç–µ—è
 one
 –±–∞—Å–∏
 –Ω—è–∞
 –≤—ä–æ–±—â–µ
 –∑–Ω–∞–π—à
Cluster 5:
 –∞—Ö–∞–º
 –Ω–µ–∑–Ω–∞
 –≤–∏–∫—Ç–æ—Ä
 —Å–º—è—Ç–∞
 —â–æ