# In [4]:
import re
from sklearn.preprocessing import MultiLabelBinarizer
from mlxtend.frequent_patterns import apriori
import pandas as pd
import altair as alt
import numpy as np

def listToString(s):
    """Return the items of *s* joined into one string, separated by single spaces.

    Used to flatten the lists produced by the regex text parsing into a plain
    string per tweet. An empty iterable yields an empty string.
    """
    return " ".join(s)

def enter_the_matrix(df):
    """Build the one-hot tweet-by-character matrix that is fed to apriori.

    The vocabulary is the set of distinct values currently stored in
    ``df.emojis``. Every tweet in ``df.text`` is then scanned one character at
    a time and the distinct characters that belong to the vocabulary are kept.

    Side effect: ``df['emojis']`` is overwritten in place with those per-tweet
    arrays of distinct characters.

    Returns a DataFrame indexed like ``df`` with one 0/1 column per character
    found (columns come from ``MultiLabelBinarizer.classes_``).
    """
    # NOTE(review): because the text is scanned character by character, only
    # vocabulary entries that are exactly one character long can ever match;
    # longer values in df.emojis (several symbols joined together) are inert.
    vocabulary = set(df.emojis.unique())

    def distinct_hits(text):
        return np.unique([symbol for symbol in text if symbol in vocabulary])

    df['emojis'] = df.text.apply(distinct_hits)

    binarizer = MultiLabelBinarizer()
    encoded = binarizer.fit_transform(df.emojis)
    return pd.DataFrame(data=encoded, index=df.index, columns=binarizer.classes_)

def emoji_frequent_itemsets(emoji_matrix, min_support=0.005, k=3):
    """Run apriori on *emoji_matrix* and keep only the itemsets of size *k*.

    Parameters:
        emoji_matrix: one-hot DataFrame passed straight to ``apriori``.
        min_support: minimum support threshold forwarded to ``apriori``.
        k: exact number of items an itemset must contain to be returned.

    Returns the rows of the apriori result (column names used as items) whose
    ``itemsets`` entry has length ``k``.
    """
    frequent = apriori(emoji_matrix, min_support=min_support, use_colnames=True)
    has_k_items = frequent['itemsets'].map(len) == k
    return frequent[has_k_items]

# Load the raw tweets and derive simple per-tweet emoji counts.
tweets = pd.read_csv('assets/tweets.csv')

# Anything that is not a word character, whitespace, one of the listed
# punctuation marks or a regional-indicator symbol is counted as an emoji.
emoji_like_pattern = r'[^\w\s.,"@\'?/#!$%\^&\*;:{}=\-_`~()\U0001F1E6-\U0001F1FF]'
tweets['emojis'] = tweets['text'].str.findall(emoji_like_pattern).str.len()

# Hand-picked "boxer" emojis; one count column is added per emoji.
boxer_emojis = ['☘️','🇮🇪','🍀','💸','🤑','💰','💵','😴','😂','🤣','🥊','👊','👏','🇮🇪','💪','🔥','😭','💰']
for emoji in boxer_emojis:
    tweets[emoji] = tweets.text.str.count(emoji)

# Aggregate counts for the two themed emoji groups.
tweets['irish_pride'] = tweets['☘️'].add(tweets['🇮🇪']).add(tweets['🍀'])
tweets['money_team'] = (
    tweets['💸'].add(tweets['🤑']).add(tweets['💰']).add(tweets['💵'])
)

# --- Prepare per-tweet emoji strings and mine frequent itemsets -------------

# Index the tweets by their creation timestamp.
tweets['datetime'] = pd.to_datetime(tweets['created_at'])
tweets = tweets.set_index('datetime')

# Independent working copies: one for "all emojis", one for "boxer emojis".
# tweets_df / boxer_df are kept as aliases of those copies.
tweet_copy = tweets.copy()
tweet_copy2 = tweets.copy()
tweets_df = tweet_copy
boxer_df = tweet_copy2

# All emoji-like characters of each tweet, flattened to a space-joined string.
tweet_copy['emojis'] = tweets['text'].str.findall(r'[^\w\s.,"@\'?/#!$%\^&\*;:{}=\-_`~()\U0001F1E6-\U0001F1FF]')
tweet_copy['emojis'] = tweet_copy['emojis'].apply(listToString)

# Character class made of exactly the code points of the boxer emojis.
# The previous pattern, str(boxer_emojis), used the list's repr as the regex;
# that repr is itself a character class which also contains the quote, comma
# and space characters, so punctuation and blanks were extracted as "emojis".
boxer_code_points = sorted(set(''.join(boxer_emojis)))
boxer_pattern = '[' + re.escape(''.join(boxer_code_points)) + ']'
tweet_copy2['emojis'] = tweets['text'].str.findall(boxer_pattern)
tweet_copy2['emojis'] = tweet_copy2['emojis'].apply(listToString)

# One-hot matrices for apriori; the datetime index is not needed any more.
tweet_all = enter_the_matrix(tweet_copy).reset_index(drop=True)
boxer = enter_the_matrix(tweet_copy2).reset_index(drop=True)

# Supports differ between the two groups because the "boxer" emojis have less
# data overall. emoji_frequent_itemsets already returns only itemsets of size
# k, so no positional slicing of the result is needed (the former hard-coded
# .loc[60:] / .loc[19:] offsets depended on the exact number of itemsets).
tweet_all_frequent_3itemsets = emoji_frequent_itemsets(tweet_all, min_support=0.0005, k=3)
boxer_frequent_3itemsets = emoji_frequent_itemsets(boxer, min_support=0.00001, k=3)
tweet_all_frequent_2itemsets = emoji_frequent_itemsets(tweet_all, min_support=0.0025, k=2)
boxer_frequent_2itemsets = emoji_frequent_itemsets(boxer, min_support=0.0005, k=2)


def _itemsets_as_text(frame):
    """Return a new frame whose 'itemsets' column holds str(list(itemset))."""
    # assign() builds a new DataFrame instead of writing into a filtered
    # slice, which avoids pandas' SettingWithCopyWarning.
    return frame.assign(itemsets=[str(list(items)) for items in frame['itemsets']])


# Altair needs plain strings (not frozensets) for the nominal axis and labels.
tweet_all_frequent_3itemsets = _itemsets_as_text(tweet_all_frequent_3itemsets)
boxer_frequent_3itemsets = _itemsets_as_text(boxer_frequent_3itemsets)
tweet_all_frequent_2itemsets = _itemsets_as_text(tweet_all_frequent_2itemsets)
boxer_frequent_2itemsets = _itemsets_as_text(boxer_frequent_2itemsets)

# --- Bar charts and text labels for each itemset table -----------------------

def _by_support():
    """Sort specification shared by every y axis: ascending support."""
    return alt.EncodingSortField(field="support", order="ascending")


def _itemset_bars(itemsets, colour, x_title=None):
    """Horizontal support bars for one itemset table; x title only if given."""
    if x_title is None:
        x_axis = alt.X('support:Q')
    else:
        x_axis = alt.X('support:Q', title=x_title)
    bars = alt.Chart(itemsets).mark_bar(size=10, color=colour)
    return bars.encode(x=x_axis, y=alt.Y('itemsets:N', sort=_by_support()))


def _itemset_labels(itemsets):
    """Text marks that print each itemset at the end of its bar."""
    labels = alt.Chart(itemsets).mark_text(
        align='left', baseline='middle', lineBreak='\n', fontSize=14,
    )
    return labels.encode(
        x='support:Q',
        y=alt.Y('itemsets:N', axis=None, sort=_by_support()),
        text='itemsets',
    )


# Blue = all emojis, grey = boxer emojis. Only one chart per panel carries the
# x-axis title.
chart_all_3 = _itemset_bars(tweet_all_frequent_3itemsets, '#195190FF', 'Support: 3 Itemsets')
annotation_all_3 = _itemset_labels(tweet_all_frequent_3itemsets)

chart_box_3 = _itemset_bars(boxer_frequent_3itemsets, '#A9A9A9')
annotation_box_3 = _itemset_labels(boxer_frequent_3itemsets)

chart_all_2 = _itemset_bars(tweet_all_frequent_2itemsets, '#195190FF')
annotation_all_2 = _itemset_labels(tweet_all_frequent_2itemsets)

chart_box_2 = _itemset_bars(boxer_frequent_2itemsets, '#A9A9A9', 'Support: 2 Itemsets')
annotation_box_2 = _itemset_labels(boxer_frequent_2itemsets)
# --- Title, legend and explanatory text rendered as text-only charts --------

def _text_row(lines, **mark_options):
    """Text-only chart showing *lines* (one array element per rendered line)."""
    # The text is passed as an array; array-valued text is rendered one
    # element per line, and the lineBreak mark property is ignored for it, so
    # no lineBreak is set here.
    inline_data = {"values": [{"text": list(lines)}]}
    return alt.Chart(inline_data).mark_text(**mark_options).encode(text="text:N")


Title = _text_row(
    ['The most common tweet itemsets for all vs "boxer" emojis'],
    size=24, color='black', align='left', dx=-50, fontStyle='bold',
)
subtitle = _text_row(['Legend: '], size=16, color='black', dx=-50, align='left')
subtitle2 = _text_row(
    ['▉ Itemsets - All Emojis'], size=16, color='#195190FF', dx=-50, align='left',
)
subtitle3 = _text_row(
    ['▉ Itemsets - Boxer Emojis'], size=16, color='#A9A9A9', dx=-50, align='left',
)
# Two array elements give the intended two-line subtitle; previously a literal
# "/n" marker sat inside a single element and was shown verbatim.
subtitle4 = _text_row(
    [
        'Tweets were divided into "itemsets" (3-set and 2-set) to evaluate if certain combinations were more prevalent in the',
        '"Boxer" defined group vs all.',
    ],
    size=16, color='grey', dx=-50, align='left',
)

# Horizontal rule drawn with underscores, shifted up towards the text above.
line = _text_row(
    ['_____________________________________________________________________________________'],
    size=16, color='black', fontStyle='bold', dx=-50, dy=-50, align='left',
)
# --- Assemble the final figure ------------------------------------------------
# Each panel layers the "all emojis" bars+labels with the "boxer" bars+labels.
all_3_layer = chart_all_3 + annotation_all_3
box_3_layer = chart_box_3 + annotation_box_3
chart1 = all_3_layer + box_3_layer

all_2_layer = chart_all_2 + annotation_all_2
box_2_layer = chart_box_2 + annotation_box_2
chart2 = all_2_layer + box_2_layer

# 3-itemset panel on the left, 2-itemset panel on the right.
charts = (chart1 | chart2)

legend_row = (subtitle | subtitle2 | subtitle3)

# Kept as the final expression so the notebook displays the figure.
alt.vconcat(
    Title,
    legend_row,
    subtitle4,
    line,
    charts,
    background='#F0F0F0',
).configure_axis(
    grid=False,
).configure_view(
    strokeWidth=0,
    strokeOpacity=0,
)