# Project

Welcome to the group project! The project is based on the [ACM RecSys 2021 Challenge](https://recsys-twitter.com/).

- Detailed information about the task, submission and grading can be found on a [dedicated page on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1217340).
- Information about the dataset structure can be found [on this page on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1218810).

In [49]:
# Team identification used for the submission.
team_name = "team_15"  # your team name e.g. 'team_1'

# One (name, student id) pair per member,
# e.g. [("Jane Doe","012345678"), ("John Doe","012345678")]
team_members = [
    ("", ""),
    ("", ""),
]


In [46]:
print(team_name)
print(team_members)

team_15
[('', ''), ('', '')]


In [50]:
# Directory that holds the training files (a trailing slash is expected,
# because the file name is appended to it by plain string concatenation).
path_to_data = "data/project/training/"

# Which slice of the training data to use:
# all_sorted, one_day, one_hour, one_week
dataset_type = "one_hour"

In [51]:
try:
    import pandas as pd
except:
    if hasattr(sys, 'real_prefix'):
        #we are in a virtual env.
        !pip3 install pandas
    else:
        !pip3 install --user pandas

In [25]:
import os
import re
import csv
import datetime

from model import reply_pred_model, retweet_pred_model, quote_pred_model, fav_pred_model 

# Column names of one record, in the order in which they appear in the data files.
# NOTE: the "enaging_*" spelling is used consistently throughout the notebook and
# must be kept as is, since the names serve as DataFrame column labels.
all_features = [
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engaging_user_id",
    "enaging_user_follower_count",
    "enaging_user_following_count",
    "enaging_user_is_verified",
    "enaging_user_account_creation",
    "engagee_follows_engager",
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
]

# Lookup table: column name -> position of that column inside a record.
all_features_to_idx = {name: position for position, name in enumerate(all_features)}

def parse_input_line(line):
    """Pick the fields needed for prediction out of one already-split record.

    Parameters
    ----------
    line : sequence
        The fields of one record, indexable by the positions stored in
        ``all_features_to_idx`` (the caller passes a row produced by ``csv.reader``).

    Returns
    -------
    tuple
        ``(tweet_id, user_id, input_feats, tweet_timestamp)`` where ``user_id`` is the
        ``engaging_user_id`` field and ``input_feats`` is the ``text_tokens`` field.
    """
    wanted_columns = ('tweet_id', 'engaging_user_id', 'text_tokens', 'tweet_timestamp')
    tweet_id, user_id, input_feats, tweet_timestamp = (
        line[all_features_to_idx[column]] for column in wanted_columns
    )
    return tweet_id, user_id, input_feats, tweet_timestamp



def evaluate_test_set():
    """Run the four engagement models over the selected data files and write ``results.csv``.

    Every file in ``path_to_data`` whose name contains ``dataset_type`` is read as a
    ``\\x01``-delimited file. For each record one line
    ``tweet_id,user_id,reply,retweet,quote,fav`` is written to ``results.csv`` in the
    current working directory (an existing file is overwritten).
    """
    expanded_path = os.path.expanduser(path_to_data)
    # Files are processed in the order given by the last five characters of their path.
    part_files = sorted(
        (os.path.join(expanded_path, name)
         for name in os.listdir(expanded_path)
         if dataset_type in name),
        key=lambda path: path[-5:],
    )

    with open('results.csv', 'w') as output:
        for part_file in part_files:
            # newline='' is the mode the csv module documents for files handed to csv.reader.
            with open(part_file, 'r', newline='') as part:
                for row in csv.reader(part, delimiter='\x01'):
                    if not row:
                        # csv.reader yields [] for blank lines; indexing it would raise IndexError.
                        continue
                    tweet_id, user_id, features, _tweet_timestamp = parse_input_line(row)
                    reply_pred = reply_pred_model(features)      # reply_model
                    retweet_pred = retweet_pred_model(features)  # retweet_model
                    quote_pred = quote_pred_model(features)      # pred_model
                    fav_pred = fav_pred_model(features)          # fav_model

                    output.write(f'{tweet_id},{user_id},{reply_pred},{retweet_pred},{quote_pred},{fav_pred}\n')


In [52]:
def load_data(filename):
    """Read one ``\\x01``-separated data file into a DataFrame.

    The file is expected to have no header row; the columns are named after
    ``all_features`` and no column is used as the index.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(filename, names=all_features, sep='\x01', index_col=False)
    

In [53]:
# evaluate_test_set()

In [54]:
# Load the selected training file and keep only its first 5k rows,
# so that the rest of the notebook works faster with the data.
data = load_data(path_to_data + dataset_type).head(5000)

In [56]:
data.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,101\t100\t100\t100\t100\t100\t100\t100\t100\t1...,,395A05A1E8A0A4CEB2E623281C7A41EE,,,,TopLevel,313ECD3A1E5BB07406E4249475C2D6D6,1614207600,55F619B7474C1BA0C8EE03C2A31C534C,6567,4006,False,1456262411,736278C2FEC488516CDA4ED6952A2154,1139,1126,False,1601425426,False,,,,1614208000.0
1,101\t100\t100\t100\t100\t100\t216\t216\t10243\...,,81E8247F4E74A0FCDBA911E1A3CB5412,Photo,758E6F75A253992C7070F6B8A8A891A6,6B7D92057ACA0F97EFB5B724D3C963E4,TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,1614207600,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,19D5367D835484236CAF9DBEF475FF7A,82,76,False,1495813718,False,,,,
2,101\t100\t100\t100\t100\t100\t216\t216\t10243\...,,81E8247F4E74A0FCDBA911E1A3CB5412,Photo,758E6F75A253992C7070F6B8A8A891A6,6B7D92057ACA0F97EFB5B724D3C963E4,TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,1614207600,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,40BEB04CF8D3CB02449879668656FFDB,108,351,False,1506038593,False,,,,
3,101\t100\t100\t100\t100\t100\t216\t216\t10243\...,,81E8247F4E74A0FCDBA911E1A3CB5412,Photo,758E6F75A253992C7070F6B8A8A891A6,6B7D92057ACA0F97EFB5B724D3C963E4,TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,1614207600,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,6415C94D3C27BA84C069DE049EBB3EDE,69,106,False,1562533868,False,,,,
4,101\t100\t100\t100\t100\t100\t216\t216\t10243\...,,81E8247F4E74A0FCDBA911E1A3CB5412,Photo,758E6F75A253992C7070F6B8A8A891A6,6B7D92057ACA0F97EFB5B724D3C963E4,TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,1614207600,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,7E614D5881BC18768880CC374C4BE821,24,180,False,1302057914,False,,,,


In [58]:
def columns_to_list(data, columns):
    """Turn tab-separated string columns into columns of lists.

    The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to convert.
    columns : iterable of str
        Names of the columns whose values are split on the tab character.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    for name in columns:
        tab_separated = data[name]
        data[name] = tab_separated.str.split('\t')

    return data


def columns_to_timestamps(data, columns):
    """Convert columns of Unix epoch seconds into pandas timestamps.

    The conversion is done column-wise with ``pd.to_datetime`` instead of building one
    ``pd.Timestamp`` per element. Missing values become ``NaT``. The frame is
    modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to convert.
    columns : iterable of str
        Names of the columns that contain seconds since the Unix epoch.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    for col in columns:
        data[col] = pd.to_datetime(data[col], unit='s')

    return data
    
# Columns that hold tab-separated values and are turned into lists.
cols_to_list = ['text_tokens', 'hashtags', 'present_media', 'present_links', 'present_domains']

# Columns that hold Unix epoch seconds and are turned into timestamps.
# NOTE(review): 'engaged_with_user_account_creation' is not in this list although
# 'enaging_user_account_creation' is - confirm whether that is intended.
cols_to_timestamps = ['tweet_timestamp', 'enaging_user_account_creation', 'reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

# NOTE(review): `data` is converted in place and rebound, so running this cell a second
# time applies both conversions to already converted columns - presumably not what is
# wanted; re-run the loading cell first.
data = columns_to_timestamps(columns_to_list(data, cols_to_list), cols_to_timestamps)

In [23]:
pd.set_option('display.max_columns', None)
print(data.shape)
display(data.head(50))

(5000, 24)


Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,"[101, 100, 100, 100, 100, 100, 100, 100, 100, ...",,395A05A1E8A0A4CEB2E623281C7A41EE,,,,TopLevel,313ECD3A1E5BB07406E4249475C2D6D6,2021-02-24 23:00:00,55F619B7474C1BA0C8EE03C2A31C534C,6567,4006,False,1456262411,736278C2FEC488516CDA4ED6952A2154,1139,1126,False,2020-09-30 00:23:46,False,NaT,NaT,NaT,2021-02-24 23:05:04
1,"[101, 100, 100, 100, 100, 100, 216, 216, 10243...",,81E8247F4E74A0FCDBA911E1A3CB5412,[Photo],[758E6F75A253992C7070F6B8A8A891A6],[6B7D92057ACA0F97EFB5B724D3C963E4],TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,2021-02-24 23:00:00,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,19D5367D835484236CAF9DBEF475FF7A,82,76,False,2017-05-26 15:48:38,False,NaT,NaT,NaT,NaT
2,"[101, 100, 100, 100, 100, 100, 216, 216, 10243...",,81E8247F4E74A0FCDBA911E1A3CB5412,[Photo],[758E6F75A253992C7070F6B8A8A891A6],[6B7D92057ACA0F97EFB5B724D3C963E4],TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,2021-02-24 23:00:00,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,40BEB04CF8D3CB02449879668656FFDB,108,351,False,2017-09-22 00:03:13,False,NaT,NaT,NaT,NaT
3,"[101, 100, 100, 100, 100, 100, 216, 216, 10243...",,81E8247F4E74A0FCDBA911E1A3CB5412,[Photo],[758E6F75A253992C7070F6B8A8A891A6],[6B7D92057ACA0F97EFB5B724D3C963E4],TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,2021-02-24 23:00:00,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,6415C94D3C27BA84C069DE049EBB3EDE,69,106,False,2019-07-07 21:11:08,False,NaT,NaT,NaT,NaT
4,"[101, 100, 100, 100, 100, 100, 216, 216, 10243...",,81E8247F4E74A0FCDBA911E1A3CB5412,[Photo],[758E6F75A253992C7070F6B8A8A891A6],[6B7D92057ACA0F97EFB5B724D3C963E4],TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,2021-02-24 23:00:00,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,7E614D5881BC18768880CC374C4BE821,24,180,False,2011-04-06 02:45:14,False,NaT,NaT,NaT,NaT
5,"[101, 100, 100, 100, 100, 100, 216, 216, 10243...",,81E8247F4E74A0FCDBA911E1A3CB5412,[Photo],[758E6F75A253992C7070F6B8A8A891A6],[6B7D92057ACA0F97EFB5B724D3C963E4],TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,2021-02-24 23:00:00,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,9CFE74AF494F669C35496B000C5843AB,269,494,False,2011-11-11 23:21:18,False,NaT,NaT,NaT,NaT
6,"[101, 100, 100, 100, 100, 100, 216, 216, 10243...",,81E8247F4E74A0FCDBA911E1A3CB5412,[Photo],[758E6F75A253992C7070F6B8A8A891A6],[6B7D92057ACA0F97EFB5B724D3C963E4],TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,2021-02-24 23:00:00,9B49D384D56A65E00A12D8349EB46CE5,47951,681,True,1305833605,B5FAB4F5EC1B12BE7E7FE0C27662EB80,770,1323,False,2010-06-18 17:25:31,True,NaT,NaT,NaT,2021-02-24 23:42:41
7,"[101, 100, 100, 11045, 12589, 10545, 27920, 71...",[2C8377CDAFF61B5431417E4615B5C079],40729594A0EC3DB7D690C6789EC11992,[Video],,,TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,2021-02-24 23:00:00,13F0E6E89E19B4E2F6C6F881E6B5B35E,4428,376,False,1454502209,BDAFE5407EF4AA3E63E12B8074CA6951,201,1322,False,2010-08-03 04:23:42,False,NaT,NaT,NaT,NaT
8,"[101, 100, 100, 216, 216, 137, 10183, 10115, 1...",,E5DE595BFF4F548CE9CD32B599FAB516,[Video],,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-24 23:00:00,193A666BBC290075BB8E78672D0153A6,175968,161,False,1569911252,11612C81933A4F93C5F985AAFE3E04CB,47,356,False,2019-08-21 07:17:56,False,NaT,NaT,NaT,2021-02-24 23:07:39
9,"[101, 100, 100, 216, 216, 137, 10183, 10115, 1...",,E5DE595BFF4F548CE9CD32B599FAB516,[Video],,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-24 23:00:00,193A666BBC290075BB8E78672D0153A6,175968,161,False,1569911252,218A62BB3B0650FBFC37DC7E4B4A4634,379,4867,False,2016-11-07 14:21:02,False,NaT,2021-02-24 23:33:22,NaT,2021-02-24 23:33:22


# Splitting dataset into train and test
### The one-hour training set is split into train and test data. The train split is used for model training and the test split for evaluating the trained model.

In [60]:
from sklearn.model_selection import train_test_split

# 80/20 random split with a fixed seed so that the split is reproducible.
# NOTE(review): the ratings matrices further down are built from the full `data`
# frame rather than from `train_data` - confirm that this is intended.
train_data, test_data = train_test_split(
    data,
    test_size=0.20,
    random_state=42,
)

In [61]:
train_data.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
4227,"[101, 164, 100, 166, 108, 9920, 22695, 118639,...","[6AFBC2EA7EA420238782D7171B8DB196, 8770FD4809F...",8F42712998B2B3478CECE7F563745217,,[AA360FF141A5AD9C94E7D11F32ADD5CF],[BAED74433A3AC529FBEDC13AEA9ABB9F],TopLevel,8729EBF694C3DAF61208A209C2A542C8,2021-02-24 23:00:00,8E4072EA3A231189B04F9533B703E427,1080773,5,True,1455331389,B4168346F4A5F98F16172CAEA3F72F26,1,27,False,2020-10-14 04:00:01,False,NaT,NaT,NaT,2021-02-24 23:46:29
4676,"[101, 19561, 78582, 10173, 59908, 10129, 10427...",[08C8AC8E3182B4449459D07C1E0CB10F],B9F0FC7AB8CAE5309E98AA25F77EEE50,"[Photo, Photo]",[DA728503CD26B06E8ADD6689EEF16A84],[9EFF000CDB18B710CDDB43EE1D8C300B],TopLevel,B8B04128918BBF54E2E178BFF1ABA833,2021-02-24 23:00:00,5BA4C626FF96609F2F90D0DA2ACF0CE1,493300,49,True,1235937074,82572C96C521E52C08779796EA626AA1,868,1647,False,2012-12-21 17:34:28,False,NaT,NaT,NaT,NaT
800,"[101, 10067, 216, 4476, 4348, 1904, 51143, 203...","[D1139501BFB8AFE20E2FEFFDB04E92BD, 978B0C262F5...",0397E6D2EB9382DC320FDA29E84F1B99,[Photo],[09DA8D06C2B901EEFA4BAAB3F9314615],[9EFF000CDB18B710CDDB43EE1D8C300B],TopLevel,E7F038DE3EAD397AEC9193686C911677,2021-02-24 23:00:00,8410955CFC9F8F5FE3E64178F95C6A52,5556387,186813,True,1261912560,67C688AC9C5E2FCFCEA1C73D1C6B83C6,14,296,False,2020-05-25 03:52:44,False,NaT,NaT,NaT,NaT
3671,"[101, 14120, 131, 120, 120, 188, 119, 11170, 1...",,6A1C4EE4108877F6BCD87D5BD2D8AF26,[Photo],,,TopLevel,313ECD3A1E5BB07406E4249475C2D6D6,2021-02-24 23:00:00,DE7E95F71B692820B41582854C6D26D9,132396,85404,False,1287557702,C50501518A319B6374C8562FC1805DBE,560,913,False,2011-07-25 18:21:31,True,NaT,NaT,NaT,NaT
4193,"[101, 164, 100, 166, 108, 9920, 22695, 118639,...","[6AFBC2EA7EA420238782D7171B8DB196, 8770FD4809F...",8F42712998B2B3478CECE7F563745217,,[AA360FF141A5AD9C94E7D11F32ADD5CF],[BAED74433A3AC529FBEDC13AEA9ABB9F],TopLevel,8729EBF694C3DAF61208A209C2A542C8,2021-02-24 23:00:00,8E4072EA3A231189B04F9533B703E427,1080773,5,True,1455331389,6DA04CF150C456CFA2AC1B5AB9E36206,43,837,False,2017-06-16 14:59:24,False,NaT,NaT,NaT,NaT


In [62]:
test_data.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
1501,"[101, 100, 67267, 10954, 9487, 70122, 10954, 9...","[E8BF28DB9E35E11F583385EDEB987D32, E8BF28DB9E3...",DCEF6C06DDE77C2DBE7F0BE99B95120A,[Video],,,TopLevel,8729EBF694C3DAF61208A209C2A542C8,2021-02-24 23:00:00,803A7C3606AD920145B31C31171B7715,644319,0,True,1467980494,9A594A6456C8BBD7AAB050760D699FFB,38,358,False,2019-05-20 11:16:42,False,NaT,NaT,NaT,NaT
2586,"[101, 108, 3173, 2419, 1940, 3192, 1904, 89052...","[3B46DE7489E20CDA664914EF9433BA34, ED25F9630C9...",E4154938E6B3A6568E36D016C131F134,,"[020C3CB279E7F5D0C893ECC9703A6994, 3B2A6141900...","[486DE623F8A0527422E6A69147D20C48, 486DE623F8A...",TopLevel,E7F038DE3EAD397AEC9193686C911677,2021-02-24 23:00:00,D0C822B578634E48F9B810B24B717B8D,161493,48,True,1581130623,D09D84B41D8B9F882F211BD879C0AC51,0,39,False,2020-06-23 09:14:22,False,NaT,NaT,NaT,NaT
2653,"[101, 108, 60844, 18749, 15346, 37408, 40218, ...",[21490D601842DDD6FF638F6FCD0525F8],58D87EB41D1B57BDBE6D7B1B4B71A116,,[0D9E445DDECB3705F2DC6E46BB9DE79A],[403B1EE3CF4C38844B00DEA0B610E274],TopLevel,9A78FC330083E72BE0DD1EA92656F3B5,2021-02-24 23:00:00,0012CE19CDFC95AD77327BD0987A6AD0,385046,410,True,1332423674,C84F53B674ED742113B8734EBEEB2A78,48,262,False,2020-05-17 19:01:11,False,NaT,NaT,NaT,NaT
1055,"[101, 10067, 216, 4476, 4348, 1904, 51143, 203...","[D1139501BFB8AFE20E2FEFFDB04E92BD, 978B0C262F5...",0397E6D2EB9382DC320FDA29E84F1B99,[Photo],[09DA8D06C2B901EEFA4BAAB3F9314615],[9EFF000CDB18B710CDDB43EE1D8C300B],TopLevel,E7F038DE3EAD397AEC9193686C911677,2021-02-24 23:00:00,8410955CFC9F8F5FE3E64178F95C6A52,5556387,186813,True,1261912560,B5F3449E189E24D9CA6F9BA1B412B3C6,22,145,False,2018-08-03 07:07:45,False,NaT,NaT,NaT,NaT
705,"[101, 10067, 216, 4476, 4348, 1904, 51143, 203...","[D1139501BFB8AFE20E2FEFFDB04E92BD, 978B0C262F5...",0397E6D2EB9382DC320FDA29E84F1B99,[Photo],[09DA8D06C2B901EEFA4BAAB3F9314615],[9EFF000CDB18B710CDDB43EE1D8C300B],TopLevel,E7F038DE3EAD397AEC9193686C911677,2021-02-24 23:00:00,8410955CFC9F8F5FE3E64178F95C6A52,5556387,186813,True,1261912560,497968B2C32F7C99C1ED7EE7865B6DDA,281,342,False,2014-03-15 04:54:22,False,NaT,NaT,NaT,NaT


# Evaluation

In [66]:

def true_timestamp(t):
    """Return 1 if ``t`` is a present (non-null) scalar value, otherwise 0."""
    is_missing = pd.isnull(t)
    return 0 if is_missing else 1

def labels(j):
    """Build the ground-truth frame of ``test_data`` for one engagement type.

    The label is computed column-wise (no per-row ``apply``), which also works when
    ``test_data`` is empty, and only the needed columns are copied.

    Parameters
    ----------
    j : str
        Name of the engagement timestamp column in ``test_data``,
        e.g. ``'reply_timestamp'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet_id``, ``engaging_user_id`` and ``labed``, where ``labed`` is 1
        if the timestamp in column ``j`` is present and 0 if it is missing.
    """
    ground_truth = test_data[['tweet_id', 'engaging_user_id']].copy()
    # 'labed' (sic) is the column name read_predictions() expects, so it is kept.
    ground_truth['labed'] = test_data[j].notnull().astype('int64')
    return ground_truth

def read_predictions(file):
    """Read a ground-truth or prediction CSV file and return it sorted.

    The kind of file is derived from its base name: names starting with ``gt`` are
    ground-truth files (value column ``labed``), names starting with ``pred`` are
    prediction files (value column ``prediction``). The first line of the file is
    treated as a header and replaced by the expected column names.

    Parameters
    ----------
    file : str
        Path of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by ``tweet_id``, ``engaging_user_id`` and the value column.

    Raises
    ------
    ValueError
        If the base name starts with neither ``gt`` nor ``pred``.
    """
    filename = os.path.basename(file)
    if filename.startswith('gt'):
        value_column = 'labed'
    elif filename.startswith('pred'):
        value_column = 'prediction'
    else:
        # Previously this case fell through and failed with an UnboundLocalError.
        raise ValueError(
            f"cannot tell whether {filename!r} holds ground truth or predictions: "
            "the file name must start with 'gt' or 'pred'"
        )

    columns = ['tweet_id', 'engaging_user_id', value_column]
    to_sort = pd.read_csv(file, names=columns, header=0)
    return to_sort.sort_values(columns)


# Ground truth for retweet, reply, like and retweet with comment (in this order).
gt_retweet, gt_reply, gt_like, gt_rc = (
    labels(timestamp_column)
    for timestamp_column in (
        'retweet_timestamp',
        'reply_timestamp',
        'like_timestamp',
        'retweet_with_comment_timestamp',
    )
)

# Write every ground truth to its own CSV file, then read the file back
# and print the first ten (sorted) rows as a sanity check.
for gt_file, gt_frame in (
    ('gt_retweet.csv', gt_retweet),
    ('gt_reply.csv', gt_reply),
    ('gt_like.csv', gt_like),
    ('gt_rc.csv', gt_rc),
):
    gt_frame.to_csv(gt_file)
    print(read_predictions(gt_file)[:10])



                              tweet_id                  engaging_user_id  \
2164  00F23FACF2C4F78E32E86C0E60971078  CC9AAACEEC69EAC26ED1FE87409C4440   
415   0150620EF457986E1DEE422F927800FE  05CDF85F45D64F780E85EDB2ADE92D10   
416   0150620EF457986E1DEE422F927800FE  1048E3C6F4F100D78534BF4256EC6B7D   
422   0150620EF457986E1DEE422F927800FE  56850B35358D2D80FCE86007F10B90AE   
426   0150620EF457986E1DEE422F927800FE  74482542603FB06A170521DEBAB0EB0D   
429   0150620EF457986E1DEE422F927800FE  A8BB2C43DECC27082A19AB2D1D3F25E6   
438   0150620EF457986E1DEE422F927800FE  F7DF6F89281A2B6F3BACF5233A8679FF   
33    0195E193FB3865120F97DE561EACFD44  009BB83DD910B2936E9751555B77DCC5   
3725  0313A0E551683BBAEF003AA4B19E354E  28003C54CE16CE96079855EEA77B0DB3   
4797  037243024DB46623320369D3381FBB93  A2B44695A0E895AB97AF94BED0150C2D   

      labed  
2164      0  
415       0  
416       0  
422       0  
426       0  
429       0  
438       0  
33        0  
3725      0  
4797      0  
         

# Create a Ratings Matrix
### One ratings matrix for each engagement type 

In [67]:
# Sorted array of the unique tweet ids.
uTID = data['tweet_id'].unique()
uTID.sort()

# Sorted array of the unique user ids, taken from both sides of an engagement.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
uUID = pd.concat([data['engaging_user_id'], data['engaged_with_user_id']]).unique()
uUID.sort()

m = len(uUID)  # number of users
n = len(uTID)  # number of tweets

# Internal (0-based, consecutive) ids for the users and the tweets, in both directions.
userId_to_userIDX = {user_id: user_idx for user_idx, user_id in enumerate(uUID)}
userIDX_to_userId = dict(enumerate(uUID))

tweetId_to_tweetIDX = {tweet_id: tweet_idx for tweet_idx, tweet_id in enumerate(uTID)}
tweetIDX_to_tweetId = dict(enumerate(uTID))

In [40]:
# Long-format engagement table that the ratings matrices are built from.
# NOTE(review): `j` is not used in this cell - kept in case another cell reads it.
j = ['tweet_id', 'engaging_user_id', 'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp']

# One row per record of `data`: internal user id, internal tweet id and one boolean
# flag per engagement type (True when the corresponding timestamp is present).
ratings = pd.DataFrame({
    'user': data['engaging_user_id'].map(userId_to_userIDX),
    'tweet': data['tweet_id'].map(tweetId_to_tweetIDX),
    'reply': data['reply_timestamp'].notnull(),
    'retweet': data['retweet_timestamp'].notnull(),
    'retweet_with_comment': data['retweet_with_comment_timestamp'].notnull(),
    'like': data['like_timestamp'].notnull(),
}).sort_values(['user', 'tweet'])

ratings.head(20)

Unnamed: 0,user,tweet,reply,retweet,retweet_with_comment,like
3271,1,339,False,False,False,False
441,2,12,False,False,False,False
1980,3,530,False,False,False,True
3555,5,413,False,False,False,False
105,6,621,False,False,False,False
1857,7,86,False,False,False,False
4660,10,508,False,False,False,False
33,12,7,False,False,False,False
442,13,12,False,False,False,False
443,14,12,False,False,False,False


In [68]:
from scipy import sparse as sp

def _engagement_matrix(engaged):
    """Build the (m, n) user x tweet CSR matrix for one boolean column of `ratings`.

    Only the rows where `engaged` is True are stored; the stored values are the
    boolean flags themselves.
    """
    return sp.csr_matrix(
        (engaged[engaged], (ratings.user[engaged], ratings.tweet[engaged])),
        shape=(m, n),
    )


# One ratings matrix per engagement type.
RM_reply = _engagement_matrix(ratings.reply)
RM_retweet = _engagement_matrix(ratings.retweet)
RM_retweet_wc = _engagement_matrix(ratings.retweet_with_comment)
RM_like = _engagement_matrix(ratings.like)

# Shape and number of stored engagements of every matrix.
for rating_matrix in (RM_reply, RM_retweet, RM_retweet_wc, RM_like):
    display(rating_matrix.shape, rating_matrix.count_nonzero())

(5680, 707)

41

(5680, 707)

186

(5680, 707)

29

(5680, 707)

830

# User-User Similarity

In [69]:
from scipy.sparse.linalg import norm

def compute_pairwise_user_similarity(u_id, v_id, RM_type):
    """Cosine similarity between the rating rows of two users.

    Parameters
    ----------
    u_id, v_id : int
        Internal (row) ids of the two users.
    RM_type : scipy.sparse matrix
        User x tweet ratings matrix of one engagement type.

    Returns
    -------
    float
        ``u . v / (|u| * |v|)``, or ``0.`` if at least one of the rows is all zero.
    """
    # The ratings matrices are built from boolean data. A boolean dot product only says
    # whether the users share *any* item (True/False), so the rows are cast to float to
    # make the numerator count the shared items.
    u = RM_type[u_id, :].astype(float)
    v = RM_type[v_id, :].astype(float)

    # cosine similarity formula from the slides
    # (.toarray() instead of the `.A` shorthand, which newer SciPy releases deprecate)
    numerator = u.dot(v.T).toarray().item()
    denominator = norm(u) * norm(v)

    if denominator == 0:
        return 0.

    return numerator / denominator

In [145]:
# Smoke test: one hand-picked user pair per engagement type.
for first_user, second_user, rating_matrix in (
    (15, 5256, RM_reply),
    (5256, 1642, RM_retweet),
    (1642, 5422, RM_retweet_wc),
    (5422, 15, RM_like),
):
    display(compute_pairwise_user_similarity(first_user, second_user, rating_matrix))

0.0

0.0

0.0

0.0

# User to all Users Similarity

In [74]:
import numpy as np

def compute_user_similarities(u_id, RM_type):
    """Similarity of one user to every user of a ratings matrix.

    Parameters
    ----------
    u_id : int
        Internal (row) id of the user.
    RM_type : scipy.sparse matrix
        User x tweet ratings matrix of one engagement type.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per row of ``RM_type``; entry ``v`` is the
        result of ``compute_pairwise_user_similarity(u_id, v, RM_type)``.
    """
    # Take the number of users from the matrix itself instead of the global `m`,
    # so the function works for any ratings matrix it is given.
    n_users = RM_type.shape[0]
    uU = np.empty((n_users,))

    # computing similarities of user u_id with all of the users (including u_id itself)
    for v_id in range(n_users):
        uU[v_id] = compute_pairwise_user_similarity(u_id, v_id, RM_type)

    return uU

In [148]:
# Test: for one user per engagement type, show the similarity to one other user.
for probe_user, rating_matrix, other_user in (
    (15, RM_reply, 1),
    (5256, RM_retweet, 50),
    (1642, RM_retweet_wc, 10),
    (5422, RM_like, 10),
):
    uU = compute_user_similarities(probe_user, rating_matrix)
    display(uU[other_user])

0.0

0.0

0.0

0.0

#  User Neighbourhood

In [123]:
# Dictionary-of-keys views of the ratings matrices, used for the
# "(user, tweet) in matrix" membership checks.
RM_reply_dok, RM_retweet_dok, RM_retweet_wc_dok, RM_like_dok = (
    rating_matrix.todok()
    for rating_matrix in (RM_reply, RM_retweet, RM_retweet_wc, RM_like)
)

# Maximum size of a user neighbourhood.
k = 10

def create_user_neighborhood(u_id, i_id, RM_type, RM_type_dok):
    """Collect the users most similar to ``u_id`` that have an entry for item ``i_id``.

    Parameters
    ----------
    u_id : int
        Internal id of the user the neighbourhood is built for.
    i_id : int
        Internal id of the item (tweet).
    RM_type : scipy.sparse matrix
        Ratings matrix the similarities are computed on.
    RM_type_dok : mapping-like
        The same ratings matrix in a form that supports ``(user, item) in ...``.

    Returns
    -------
    dict
        ``{user id: similarity}``; never contains ``u_id``. Candidates are visited
        by descending similarity and the scan stops once the global ``k`` is reached.
    """
    similarities = compute_user_similarities(u_id, RM_type)

    # user ids ordered from the most to the least similar
    ranked_users = np.argsort(similarities)[::-1]

    neighbours = {}
    for candidate in ranked_users:
        # keep the candidate only if it is another user that has an entry for i_id
        is_other_user = candidate != u_id
        if is_other_user and (candidate, i_id) in RM_type_dok:
            neighbours[candidate] = similarities[candidate]

        # checked after every candidate, whether it was kept or not
        if len(neighbours) == k:
            break

    return neighbours

In [142]:
# Test neighborhood: one hand-picked (user, tweet) pair per engagement type.
for probe_user, probe_tweet, rating_matrix, rating_matrix_dok in (
    (15, 595, RM_reply, RM_reply_dok),
    (5256, 437, RM_retweet, RM_retweet_dok),
    (1642, 27, RM_retweet_wc, RM_retweet_wc_dok),
    (5422, 609, RM_like, RM_like_dok),
):
    nh = create_user_neighborhood(probe_user, probe_tweet, rating_matrix, rating_matrix_dok)
    display(nh)

{}

{3197: 1.0}

{}

{}

# Predict Ratings

In [138]:
def predict_internal_ids(u_id, i_id, RM_type, RM_type_dok):
    """Predict the engagement of a user with a tweet from the user's neighbourhood.

    The prediction is the similarity-weighted average of the neighbours' entries for
    ``i_id``. A short status message about the (user, item) pair is printed first.

    Parameters
    ----------
    u_id : int
        Internal id of the user.
    i_id : int
        Internal id of the item (tweet).
    RM_type : scipy.sparse matrix
        Ratings matrix of the engagement type to predict.
    RM_type_dok : mapping-like
        The same ratings matrix in a form that supports ``(user, item) in ...``.

    Returns
    -------
    float
        The weighted average, or ``0.`` if the neighbourhood is empty or all of its
        similarities are zero.
    """
    if (u_id, i_id) in RM_type_dok:
        print("user", u_id, "has engaged with item", i_id, "with", RM_type[u_id, i_id])
    else:
        print("user", u_id, "has not engaged with item", i_id)
        print("k:", k)

    nh = create_user_neighborhood(u_id, i_id, RM_type, RM_type_dok)

    numerator = 0.
    denominator = 0.

    # Iterate over (neighbour id, similarity) pairs. Looping over nh.items() with a
    # single variable yields tuples, and using such a tuple as key raised a KeyError
    # for every non-empty neighbourhood.
    for v, similarity in nh.items():
        numerator += similarity * RM_type[v, i_id]
        denominator += np.absolute(similarity)

    if denominator == 0:
        return 0.

    return numerator / denominator

In [139]:
#test
predict_internal_ids(15, 595, RM_reply, RM_reply_dok)

user 15 has interacted on item 595 with True


0.0

In [140]:
def predict_external_ids(tweet_id, engaging_user_id, RM_type, RM_type_dok):
    """Predict an engagement for a tweet and a user given by their external ids.

    Both ids are translated to internal ids (which are printed) and the call is
    forwarded to ``predict_internal_ids``. An id that is missing from the lookup
    tables raises ``KeyError``.

    Parameters
    ----------
    tweet_id : str
        External id of the tweet.
    engaging_user_id : str
        External id of the user.
    RM_type, RM_type_dok
        Passed through unchanged to ``predict_internal_ids``.

    Returns
    -------
    The value returned by ``predict_internal_ids``.
    """
    user_idx = userId_to_userIDX[engaging_user_id]
    print("user", engaging_user_id, "has internal id ", user_idx)

    tweet_idx = tweetId_to_tweetIDX[tweet_id]
    print("tweet", tweet_id, "has internal id ", tweet_idx)

    return predict_internal_ids(user_idx, tweet_idx, RM_type, RM_type_dok)

In [141]:
# Testing the prediction with different external ids: one hand-picked
# (tweet id, user id) pair per engagement type.
# Only the value of the last expression (the "Like" prediction) is echoed as the cell
# output; the return values of the first three calls are discarded.

print("Reply")
predict_external_ids("DCEF6C06DDE77C2DBE7F0BE99B95120A", "2284A3F835F7156B2F432B82D8963D27", RM_reply, RM_reply_dok)

print("")
print("Retweet")
predict_external_ids("A3B8BEF795136AAA9E25B5173E80A73D", "EBBE15EB3C30A275BF87E7B9A676D12F", RM_retweet, RM_retweet_dok)

print("")
print("Retweet with Comment")
predict_external_ids("089FE87D98654DA3323FE87552B86965", "48918F9BDF36C80185112EF228F1429F", RM_retweet_wc, RM_retweet_wc_dok)

print("")
print("Like")
predict_external_ids("DE1604F4816F6B8BD85A9478AE9D32E9", "F343F23E25FF1D7041E31E0CF4D026AD", RM_like, RM_like_dok)


Reply
user 2284A3F835F7156B2F432B82D8963D27 has internal id  768
tweet DCEF6C06DDE77C2DBE7F0BE99B95120A has internal id  603
user 768 has interacted on item 603 with True

Retweet
user EBBE15EB3C30A275BF87E7B9A676D12F has internal id  5256
tweet A3B8BEF795136AAA9E25B5173E80A73D has internal id  437
user 5256 has interacted on item 437 with True

Retweet with Comment
user 48918F9BDF36C80185112EF228F1429F has internal id  1642
tweet 089FE87D98654DA3323FE87552B86965 has internal id  27
user 1642 has interacted on item 27 with True

Like
user F343F23E25FF1D7041E31E0CF4D026AD has internal id  5422
tweet DE1604F4816F6B8BD85A9478AE9D32E9 has internal id  609
user 5422 has interacted on item 609 with True


0.0

In [228]:
# hidden


In [229]:
# feel free to edit

In [230]:
# feel free to edit

In [231]:
# feel free to edit

In [232]:
# feel free to edit

In [233]:
# feel free to edit

In [234]:
# feel free to edit

In [235]:
# feel free to edit

In [236]:
# feel free to edit

In [237]:
# feel free to edit

In [238]:
# feel free to edit