In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from mlxtend.frequent_patterns import apriori

In [None]:
# create Jaccard similarity function
def jaccard_similarity(set_a, set_b):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Parameters
    ----------
    set_a, set_b : iterable of hashable items
        Both arguments are converted to sets first, so duplicates in a list
        or array argument do not inflate the size of the union.

    Returns
    -------
    float
        A value between 0.0 and 1.0. Two empty collections have no defined
        ratio; 0.0 is returned for that case instead of raising
        ZeroDivisionError.
    """
    items_a = set(set_a)
    items_b = set(set_b)
    union_size = len(items_a | items_b)
    if union_size == 0:
        # Both inputs are empty: avoid dividing by zero.
        return 0.0
    return len(items_a & items_b) / union_size

# create frequent itemset function
def emoji_frequent_itemsets(emoji_matrix, min_support=0.005, k=3):
    """Return the frequent itemsets that contain exactly ``k`` items.

    Parameters
    ----------
    emoji_matrix : pandas.DataFrame
        Matrix handed unchanged to mlxtend's ``apriori``
        (presumably one row per tweet and one 0/1 column per emoji --
        confirm against the caller).
    min_support : float, default 0.005
        Minimum support threshold forwarded to ``apriori``.
    k : int, default 3
        Number of items an itemset must contain to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of the ``apriori`` result whose ``itemsets`` entry has
        length ``k``; itemsets are expressed with column names.
    """
    # Itemsets larger than k are discarded by the filter below, so cap the
    # search at k instead of enumerating them first.
    frequent = apriori(
        emoji_matrix,
        min_support=min_support,
        use_colnames=True,
        max_len=k,
    )
    return frequent[frequent['itemsets'].apply(len) == k]

# Reference tweet that every sample tweet is compared against.
main_tweet = '💸🤑💰💵🥊👊🔥💰'
# Distinct characters of the reference tweet (the repeated 💰 collapses to one entry).
main_tweet_set = {symbol for symbol in main_tweet}

# Sample tweets, read from a CSV file in the working directory.
tweets = pd.read_csv('tweets.csv')

In [None]:
# Per tweet, keep the characters it shares with the main tweet
# (np.unique returns them de-duplicated and sorted).
def _main_tweet_emojis(text):
    shared = [symbol for symbol in text if symbol in main_tweet_set]
    return np.unique(shared)

tweets['emojis'] = tweets['text'].apply(_main_tweet_emojis)

# Score each tweet by the Jaccard similarity between the main tweet's
# emoji set and the emojis found in that tweet.
def _similarity_to_main_tweet(found):
    return jaccard_similarity(main_tweet_set, set(found))

tweets['jaccard'] = tweets['emojis'].apply(_similarity_to_main_tweet)

# Display the ten tweets with the highest similarity.
tweets.sort_values(by='jaccard', ascending=False).head(10)

In [None]:
# apriori frequent 3-itemsets
# '🇮🇪' and '💰' are listed twice; building a set collapses the duplicates.
emojis = ['☘️','🇮🇪','🍀','💸','🤑','💰','💵','😴','😂','🤣','🥊','👊','👏','🇮🇪','💪','🔥','😭','💰']
emoji_set = set(emojis)

# Collect, per tweet, the distinct vocabulary emojis that occur in its text.
# Matching is done by substring rather than character by character: the flag
# '🇮🇪' is a sequence of two code points, so a single character of the text can
# never be equal to it and a per-character membership test silently drops it.
# A trailing U+FE0F variation selector (which '☘️' may carry) is stripped before
# matching so the bare symbol in a tweet is found as well; the label stored in
# the column is still the vocabulary entry exactly as listed above.
# NOTE(review): substring matching can report '🇮🇪' when two other flags sit
# next to each other and their halves happen to spell it -- confirm this is
# acceptable for the data.
tweets['emojis'] = tweets.text.apply(
    lambda text: np.unique(
        [emoji for emoji in emoji_set if emoji.rstrip('\ufe0f') in text]
    )
)

# create emoji matrix: one row per tweet, one 0/1 column per emoji seen in the data
mlb = MultiLabelBinarizer()
emoji_matrix = pd.DataFrame(data=mlb.fit_transform(tweets.emojis), index=tweets.index, columns=mlb.classes_)
emoji_frequent_itemsets(emoji_matrix, min_support=0.005, k=3)