In [None]:
# imports

import os
import pprint
import pickle
import glob
import json
import re
import string
import datetime
from datetime import timezone
import scipy
from scipy import stats

from collections import defaultdict

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

In [None]:
# utility objects

# length of a 30-day "month" in milliseconds (timestamps in this notebook are in ms)
MILLISEC_PER_MONTH = 30 * 24 * 60 * 60 * 1000

pp = pprint.PrettyPrinter(indent=4)


def print_term_to_senders():
    """Pretty-print the global ``term_to_senders`` dictionary."""
    pp.pprint(term_to_senders)
    
# common English contractions, all lower-case
standard_contractions = {
    "aren't", "can't", "could've", "couldn't", "didn't",
    "doesn't", "don't", "hadn't", "hasn't", "haven't",
    "he'd", "he'll", "he's",
    "how'd", "how'll", "how's",
    "i'd", "i'll", "i'm", "i've",
    "isn't", "it'd", "it'll", "it's", "let's",
    "might've", "mightn't", "must've", "mustn't", "needn't", "o'clock",
    "she'd", "she'll", "she's",
    "should've", "shouldn't",
    "that'd", "that's", "there'd", "there's",
    "they'd", "they'll", "they're", "they've",
    "wasn't", "we'd", "we'll", "we're", "we've", "weren't",
    "what'll", "what're", "what's", "what've",
    "when's", "when've",
    "where'd", "where's", "where've",
    "who'll", "who's", "who've", "why's",
    "won't", "would've", "wouldn't",
    "you'd", "you'll", "you're", "you've",
}

# the same contractions typed without their apostrophes ("dont", "youre", ...)
new_contractions = {entry.replace("'", "") for entry in standard_contractions}

# final lookup set holds both spellings
standard_contractions = standard_contractions | new_contractions

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb.  Any other tag falls back to noun.
    """
    _token, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns
    return wordnet.NOUN


# shared lemmatizer instance used by the analysis cells
lemmatizer = WordNetLemmatizer()

In [None]:
# helper functions

def preprocess(words):
    """Lower-case ``words`` and reduce it to plain ASCII.

    The three-character sequence that shows up in place of an apostrophe is
    mapped to ``'`` first; every other non-ASCII character is then dropped.
    """
    lowered = words.lower()
    # weird thing with apostrophe showing up as this unicode string
    with_apostrophes = lowered.replace("\u00e2\u0080\u0099", "'")
    # filter out all remaining Unicode, TODO: maybe want to translate these into emojis?
    ascii_only = with_apostrophes.encode("ascii", "ignore")
    return ascii_only.decode("UTF-8")

def print_sorted_dict(d, first_x=None, actually_print=True):
    """Rank the keys of ``d`` by value, highest first.

    When ``actually_print`` is true, a ``Rank N: key value`` line is printed
    for each of the first ``first_x`` keys (all keys when ``first_x`` is
    None).  The complete ranked key list is returned regardless of ``first_x``.
    """
    ranked_keys = sorted(d, key=d.get, reverse=True)

    if actually_print:
        limit = len(ranked_keys) if first_x is None else first_x
        for position, key in enumerate(ranked_keys):
            if position >= limit:
                break
            print(f"Rank {position+1}:", key, d[key])

    return ranked_keys

def is_contraction(word):
    """Return True when ``word`` has an apostrophe and the text before the
    first apostrophe is found in ``english_words``."""
    stem, apostrophe, _suffix = word.partition("'")
    return bool(apostrophe) and stem in english_words

# hand-picked terms that are always kept, whatever the filters below say
terms_to_include = {"kys"}


def exclude_word(word):
    """Return True when ``word`` should be dropped from the analysis.

    A word is dropped when it is empty, found in ``english_words``, purely
    digits, a contraction, or in ``standard_contractions`` -- unless it is one
    of the hand-picked ``terms_to_include``.
    """
    is_ordinary = (
        len(word) == 0
        or word in english_words
        or word.isdigit()
        or is_contraction(word)
        or word in standard_contractions
    )
    return is_ordinary and word not in terms_to_include

In [None]:
# load dictionary from disk
# NOTE: pickle.load can execute arbitrary code -- only load files this project wrote itself.

pickled_dict_fname = "term_to_senders_dict"
# context manager so the file handle is closed as soon as the pickle is read
with open(pickled_dict_fname, "rb") as term_to_senders_file:
    term_to_senders = pickle.load(term_to_senders_file)

# {
#     "<term>": {
#         "<friend_name>": 
#             {
#                  "<friend_name>": [<list of timestamps in ms>],
#                  "<account_user_name>": [<list of timestamps in ms>]
#             }
#         ....
#     },
#     ....
# }

print(len(term_to_senders)) # should be 9998

# per-sender term counts: sender_to_term_count[<sender>][<term>] is read by later cells
friend_name = "Dan J Chong"
sender_to_term_count_fname = "sender_to_term_count"

# default factories for nested defaultdicts.
# NOTE(review): presumably the pickled sender_to_term_count refers to these by
# name, which is why they are defined before it is loaded -- confirm against
# the notebook that wrote the pickle.
def return_zero():
    return 0
def return_dd():    
    return defaultdict(return_zero)

with open(sender_to_term_count_fname, "rb") as sender_to_term_count_file:
    sender_to_term_count = pickle.load(sender_to_term_count_file)

In [None]:
# define terms for analysis
# Inbox directory of the Facebook export (term_freq_avg globs for the friend's
# conversation folder inside it). Set the GRAPEVINE_MSG_DIR environment
# variable to point at a different export without editing this cell; the
# original path remains the default.
msg_dir = os.environ.get(
    "GRAPEVINE_MSG_DIR",
    "/home/lhuang21/Documents/SideProjects/Grapevine/facebook-lawrenceh1850/messages/inbox",
)
account_user_name = "Lawrence Huang"
# terms where I influenced friend
# TERM = "wtf"
# TERM = "okay"

# 0 correlation
# TERM = "lol"
# TERM = "ok"

# terms where friend influenced me
# (parentheses without a trailing comma are plain strings, not 1-tuples)
# TERM = ("holy", "shit")
# TERM = ("shit")
# TERM = ("lmao")
# TERM = ("xd")
FRIEND_NAME = "Dan J Chong"

In [None]:
# calculate moving average

def term_freq_avg(start_time_milli, end_time_milli, term, friend_name, msg_dir):
    """
    Calculates average term frequency per word in a given time frame for all users involved in conversation.

    Parameters:
        start_time_milli, end_time_milli: window bounds in milliseconds. Both
            ends are inclusive, so a message stamped exactly on a boundary is
            counted in two back-to-back windows.
        term: a single word (str) or a tuple of words (n-gram). A 1-tuple is
            treated like the bare word. Single words are compared after
            lemmatization (hand-picked ``terms_to_include`` are compared as
            typed); n-grams are compared on punctuation-stripped tokens only.
        friend_name: used, lower-cased and with spaces removed, as the prefix
            of the conversation directory inside ``msg_dir``.
        msg_dir: directory containing the conversation directories.

    Returns:
        dict mapping each participant (plus any sender that appears in the
        window without being listed as a participant) to
        ``term occurrences / whitespace-separated words sent`` in the window,
        or 0 when that sender sent no words.

    Raises:
        ValueError: if the window is reversed, if the directory prefix does not
            match exactly one directory, if ``message_1.json`` is missing, or
            if ``term`` is an empty tuple.

    Notes:
        Only ``message_1.json`` is read, and the file is re-parsed on every
        call. Messages are walked in reverse file order and the scan stops at
        the first one past ``end_time_milli``, so the file is expected to list
        messages newest-first.
    """
    if start_time_milli > end_time_milli:
        raise ValueError("End time must be after start time.")

    dir_prefix = friend_name.strip().replace(" ", "").lower()
    candidate_dirs = glob.glob(os.path.join(msg_dir, dir_prefix + "*"))
    if len(candidate_dirs) != 1:
        raise ValueError("Invalid path for friend name and message directory specified")

    msg_file_path = os.path.join(candidate_dirs[0], "message_1.json")
    # make sure file exists
    if not os.path.exists(msg_file_path):
        raise ValueError(f"{msg_file_path} doesn't exist")

    # this is for determining whether the term is a single word or an n-gram
    term_dimension = 1
    if isinstance(term, tuple):
        term_dimension = len(term)
        if term_dimension == 0:
            raise ValueError("Term has invalid input dimension")
        if term_dimension == 1:
            # a 1-tuple can never equal a str token; compare its only word instead
            term = term[0]

    # context manager so the handle is not leaked on every call
    with open(msg_file_path, 'r') as msg_file:
        json_dict = json.load(msg_file)

    word_count_in_period = {}
    term_count_in_period = {}
    for name_dict in json_dict["participants"]:
        participant_name = name_dict["name"]
        term_count_in_period[participant_name] = 0
        word_count_in_period[participant_name] = 0

    for msg in reversed(json_dict['messages']):
        msg_timestamp_ms = msg["timestamp_ms"]
        if msg_timestamp_ms < start_time_milli:
            continue
        if msg_timestamp_ms > end_time_milli:
            break
        if "content" not in msg:
            # in case there isn't a content key
            continue

        sender = msg["sender_name"]
        # senders missing from the participants list would otherwise raise KeyError
        word_count_in_period.setdefault(sender, 0)
        term_count_in_period.setdefault(sender, 0)

        tok_content = preprocess(msg["content"]).split()
        word_count_in_period[sender] += len(tok_content)

        if term_dimension == 1:
            # monograms
            for word in tok_content:
                word = word.strip(string.punctuation)
                if not word:
                    continue
                # only lemmatize if this was not a hand-picked word
                if word not in terms_to_include:
                    word = lemmatizer.lemmatize(word, get_wordnet_pos(word))
                if word == term:
                    term_count_in_period[sender] += 1
        else:
            # bigrams, trigrams and longer n-grams share one code path
            for gram in ngrams(tok_content, term_dimension):
                stripped_gram = tuple(tok.strip(string.punctuation) for tok in gram)
                if stripped_gram == term:
                    term_count_in_period[sender] += 1

    # turn raw counts into per-word rates; 0 when the sender had no words in this period
    return {
        sender: (count / float(word_count_in_period[sender])
                 if word_count_in_period[sender] != 0 else 0)
        for sender, count in term_count_in_period.items()
    }

In [None]:
# function that calculates term_to_usage_rates for friend

# TODO: implement geometric weighting or some sort of moving average for earlier difference terms
# maybe even just some sort of cutoff to account for noise
# also handle the case where initial usage was roughly the same and then later one person stopped
# while the other continued; may have to take the absolute value of the usage rate into account,
# e.g. if it was 0 for both people at some point, that's kind of an edge case (see "kid")
def get_term_to_usage_rates_for_friend(FRIEND_NAME, top_x_terms=20):
    """Compute monthly usage rates of the friend's most frequent terms.

    The terms in ``sender_to_term_count[FRIEND_NAME]`` are ranked by count and
    the first ``top_x_terms`` are analyzed. For each one, the span between the
    earliest and latest mention by either party (from ``term_to_senders``) is
    cut into consecutive ``MILLISEC_PER_MONTH`` windows and ``term_freq_avg``
    is evaluated on every window.

    Returns:
        {
          "FRIEND_NAME": FRIEND_NAME,
          term_1: {
            "friend_usage_rates": [ ... ],   # one entry per window
            "user_usage_rates": [ ... ],
            "x_axis": [ ... ]                # UTC datetimes, the END of each window
          },
          term_2: { ... },
          ...
        }

    Raises:
        ValueError: if a ranked term has no recorded mention by either party.
    """
    term_to_usage_rates = {"FRIEND_NAME": FRIEND_NAME}

    freq_per_friend = print_sorted_dict(sender_to_term_count[FRIEND_NAME], actually_print=False)

    # only count the top x terms (slicing first, so nothing is announced and then skipped)
    for TERM in freq_per_friend[:max(top_x_terms, 0)]:
        print(f"analyzing {TERM}")

        friend_timestamps = term_to_senders[TERM][FRIEND_NAME][FRIEND_NAME]
        user_timestamps = term_to_senders[TERM][FRIEND_NAME][account_user_name]

        all_timestamps = list(friend_timestamps) + list(user_timestamps)
        if not all_timestamps:
            raise ValueError(f"Should not reach here: No mentions of {TERM} found for {FRIEND_NAME}")

        # min/max over both lists: covers the full span even when only one
        # party ever used the term, and does not rely on the lists being sorted
        first_mention = min(all_timestamps)
        last_mention = max(all_timestamps)

        # calculate usage rates across time period
        x_axis = []
        friend_usage_rates = []
        user_usage_rates = []

        START_MS = first_mention
        while START_MS <= last_mention:
            END_MS = START_MS + MILLISEC_PER_MONTH
            rate_dict = term_freq_avg(START_MS, END_MS, TERM, FRIEND_NAME, msg_dir)
            # each point is labelled with the end of its window
            x_axis.append(datetime.datetime.fromtimestamp(END_MS / 1000, tz=datetime.timezone.utc))
            friend_usage_rates.append(rate_dict[FRIEND_NAME])
            user_usage_rates.append(rate_dict[account_user_name])
            START_MS = END_MS

        term_to_usage_rates[TERM] = {
            "friend_usage_rates": friend_usage_rates,
            "user_usage_rates": user_usage_rates,
            "x_axis": x_axis
        }

    return term_to_usage_rates

In [None]:
# write term_to_usage_rates to disk for every friend, don't redo if it already exists
# TODO: make a nice input box for that

friend_to_term_usage_rates_fname = "friend_to_term_usage_rates"
friend_to_term_usage_rates = {}
# { 
#   <FRIEND_NAME_1>: <term_to_usage_rates>,
#   <FRIEND_NAME_2>: <term_to_usage_rates>,
#   ...
# }

# NOTE: pickle.load can execute arbitrary code -- only load files this project wrote itself.
with open("friend_to_words", "rb") as friend_to_words_file:
    friend_to_words = pickle.load(friend_to_words_file)

num_friends = 20
print(f"Top {num_friends} friends:")
sorted_friends = print_sorted_dict(friend_to_words, first_x=num_friends)

# recompute by default; only ask when a cached result is already on disk
redo_analysis = True
if os.path.exists(friend_to_term_usage_rates_fname):
    print(f"\n{friend_to_term_usage_rates_fname} dictionary exists already.")
    user_input = None
    while user_input not in ("y", "n"):
        user_input = input("Redo analysis (y/n)? ").strip().lower()
    redo_analysis = user_input == "y"

if redo_analysis:
    for FRIEND_NAME in sorted_friends[:num_friends]:
        print(f"Calculating usage rates for: {FRIEND_NAME}")
        term_to_usage_rates = get_term_to_usage_rates_for_friend(FRIEND_NAME)
        friend_to_term_usage_rates[FRIEND_NAME] = term_to_usage_rates

        # the file is rewritten after every friend, so the results gathered so
        # far are on disk even if a later friend fails
        with open(friend_to_term_usage_rates_fname, "wb") as usage_rates_file:
            pickle.dump(friend_to_term_usage_rates, usage_rates_file)
        print(f"{friend_to_term_usage_rates_fname} dumped to disk")
else:
    with open(friend_to_term_usage_rates_fname, "rb") as usage_rates_file:
        friend_to_term_usage_rates = pickle.load(usage_rates_file)
    print(f"{friend_to_term_usage_rates_fname} loaded from disk")

In [None]:
# loads term_to_usage_rates from disk and calculate influence scores
#
# Sign convention (same wording as the message printed when `graph` is True):
#   positive influence score = you influenced the other person
#   negative influence score = the other person influenced you

friend_to_influence_scores = {}
friend_to_influence_scores_fname = "friend_to_influence_scores"

# flip to True to print per-term diagnostics and plot the usage rates
graph = False


def _first_tenth_mean(values):
    """Mean of the first tenth of `values`, always using at least one element."""
    # len(values) // 10, not len(values // 10): the latter floor-divides the
    # elements and therefore always equals len(values)
    return np.mean(values[:max(len(values) // 10, 1)])


redo_analysis = True
# only offer to reuse the cache when the influence-score file itself exists;
# checking the usage-rate file here made answering "n" crash on a missing file
if os.path.exists(friend_to_influence_scores_fname):
    print(f"\n{friend_to_influence_scores_fname} dictionary exists already.")
    user_input = None
    while user_input not in ("y", "n"):
        user_input = input("Redo analysis (y/n)? ").strip().lower()
    redo_analysis = user_input == "y"

if redo_analysis:
    for FRIEND_NAME in friend_to_term_usage_rates:
        term_to_usage_rates = friend_to_term_usage_rates[FRIEND_NAME]
        # { 
        #   "FRIEND_NAME": "",
        #   term_1: {
        #     "friend_usage_rates": [ ... ],
        #     "user_usage_rates": [ ... ],
        #     "x_axis": x_axis
        #   },
        #   term_2 : {
        #     ...
        #   }
        # ...
        # }
        term_to_influence_score = {}

        for TERM in term_to_usage_rates:
            if TERM == "FRIEND_NAME":
                print(f"Friend name: {FRIEND_NAME}")
                continue

            friend_usage_rates = np.asarray(term_to_usage_rates[TERM]["friend_usage_rates"])
            user_usage_rates = np.asarray(term_to_usage_rates[TERM]["user_usage_rates"])
            x_axis = np.asarray(term_to_usage_rates[TERM]["x_axis"])

            # a regression needs at least two windows; such series have zero
            # spread and would be skipped by the IQR check below anyway
            if len(x_axis) < 2:
                continue

            # difference: negative means friend influenced user
            difference = user_usage_rates - friend_usage_rates

            # Create the base line
            start = x_axis[0]
            stop = x_axis[-1]

            # timestamps in seconds
            x_timestamps = np.array([dt.replace(tzinfo=timezone.utc).timestamp() for dt in x_axis])

            # time range covered by data
            total_time = np.ptp(x_timestamps)

            slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x=x_timestamps, y=difference)
            user_slope, user_intercept, _, _, _ = scipy.stats.linregress(x=x_timestamps, y=user_usage_rates)
            friend_slope, friend_intercept, _, _, _ = scipy.stats.linregress(x=x_timestamps, y=friend_usage_rates)

            # spread of the difference series; terms with no spread are skipped
            # ("not ... > 0" so that a NaN spread is skipped as well)
            difference_iqr = scipy.stats.iqr(difference)
            if not difference_iqr > 0:
                continue

            # the intercept is the value all the way at unix time 0, not when starting,
            # so the base difference rate is measured over the first tenth of the series
            base_difference_rate = _first_tenth_mean(difference)

            # scale the slope over the covered time range, normalize by the mean
            # height of the difference above its minimum, and weight by the
            # absolute base difference rate to account for whether there is an
            # initial difference
            influence_score = -slope * total_time / abs(np.mean(difference - min(difference))) * abs(base_difference_rate)

            # tried doing base rates from intercept, not accurate, switching to average over first 10th
            # user_base_rate = user_intercept + x_timestamps[0] * user_slope
            # friend_base_rate = friend_intercept + x_timestamps[0] * friend_slope
            user_base_rate = _first_tenth_mean(user_usage_rates)
            friend_base_rate = _first_tenth_mean(friend_usage_rates)

            term_to_influence_score[TERM] = {"score": influence_score, 
                                             "user_base_rate": user_base_rate, 
                                             "friend_base_rate": friend_base_rate}

            if graph:
                # 0 slope means there was little influence
                print(f"TERM: {TERM}")
                print(f"Influence score: {influence_score}")
                print("user_base_rate:", user_base_rate)
                print("friend_base_rate:", friend_base_rate)
                print("base_difference_rate:", base_difference_rate)
                # print(f"Your usage rate slope: {user_slope}")
                # print(f"Their usage rate slope: {friend_slope}")
                # print(f"Diff slope: {slope}")
                # print(f"Diff IQR: {difference_iqr}")

                print("""positive influence score = you influenced the other person\nnegative influence score = the other person influenced you.""")
                # positive: your usage rate was initially higher than the other person's
                # negative: your usage rate was initially lower than the other person's

                fig, ax = plt.subplots(nrows=2, figsize=(25, 5))

                # TODO: set the y_lim dynamically, works well enough for now
                # for i in range(len(ax)):
                #     ax[i].set_ylim(bottom=-0.03, top=0.03)

                ax[0].plot((start, stop), (0, 0), 'k', alpha=.5)
                ax[1].plot((start, stop), (0, 0), 'k', alpha=.5)

                ax[0].set(title=f"Usage rates of {TERM} with {FRIEND_NAME}")
                ax[0].plot(x_axis, friend_usage_rates, label=f"{FRIEND_NAME}", c='r')
                ax[0].plot(x_axis, friend_intercept + friend_slope*x_timestamps, c='orange')
                ax[0].plot(x_axis, user_usage_rates, label=f"{account_user_name}", c='b')
                ax[0].plot(x_axis, user_intercept + user_slope*x_timestamps, c='green')

                # plot user base rate as sanity check
                # ax[0].plot((x_axis[0]), (user_base_rate), 'ko', alpha=1, label="int")

                ax[1].plot(x_axis, difference, label="Difference", c='green')
                ax[1].plot(x_axis, intercept + slope*x_timestamps, c='red')

                for i in range(len(ax)):
                    ax[i].get_xaxis().set_major_locator(mdates.MonthLocator(interval=1))
                    ax[i].get_xaxis().set_major_formatter(mdates.DateFormatter("%b %Y"))
                    fig.autofmt_xdate()

                    plt.setp((ax[i].get_yticklabels() + ax[i].get_yticklines() +
                                  list(ax[i].spines.values())), visible=False)

                    ax[i].legend(loc='upper left')
                plt.show()

        # TODO: graph the z-scores
        # (computed but not used for the printout below: normalizing the numbers
        # causes them to lose their sign, and the sign tells you the direction of
        # influence, so the raw influence scores are printed for now)
        normalized_scores = np.asarray([term_to_influence_score[x]["score"] for x in term_to_influence_score])
        normalized_scores = scipy.stats.zscore(normalized_scores)

        # total count of the scored terms for THIS friend (the loop variable
        # FRIEND_NAME, not the unrelated global friend_name)
        total_term_count = 0
        for TERM in term_to_influence_score:
            total_term_count += sender_to_term_count[FRIEND_NAME][TERM]

        # print results
        for TERM in term_to_influence_score:
            # normalize by what the affected person's initial usage rate was,
            # if the affected person's initial usage rate was high, the influence score should be low
            # if the affected person's initial usage rate was low, the influence score should be high

            # weight by term count? this doesn't seem to be effective either when multiplying or dividing it
            # varies on a case by case
            # term_to_influence_score[TERM]["score"] *= sender_to_term_count[FRIEND_NAME][TERM] * total_term_count

            print(f'{TERM}: {term_to_influence_score[TERM]["score"]}')

        friend_to_influence_scores[FRIEND_NAME] = term_to_influence_score

    with open(friend_to_influence_scores_fname, "wb") as influence_scores_file:
        pickle.dump(friend_to_influence_scores, influence_scores_file)
else:
    # NOTE: pickle.load can execute arbitrary code -- only load files this project wrote itself.
    with open(friend_to_influence_scores_fname, "rb") as influence_scores_file:
        friend_to_influence_scores = pickle.load(influence_scores_file)

In [None]:
# show the raw {friend: {term: {"score", "user_base_rate", "friend_base_rate"}}} mapping
print(friend_to_influence_scores)