# Project

Welcome to the group project! The project is based on the [ACM RecSys 2021 Challenge](https://recsys-twitter.com/).

- Detailed information about the task, submission and grading can be found in a [dedicates site on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1217340).
- Information about the dataset structure [on this site on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1218810).

In [8]:
team_name = "team_15" # your team name e.g. 'team_1'
team_members = [("Markus Böck","01634838"),
                ("Luiza Corpaci","12037284"),
                ("Iulia Cristina Hatiegan", "01302969"),
                ("Adriana-Maria Railean", "01304039"),
                ("", "")] # [("Jane Doe","012345678"), ("John Doe","012345678")]


In [9]:
print(team_name)
print(team_members)

team_15
[('Markus Böck', '01634838'), ('Luiza Corpaci', '12037284'), ('Iulia Cristina Hatiegan', '01302969'), ('Adriana-Maria Railean', '01304039'), ('', '')]


In [10]:
path_to_data = './data/project/training/'
val_path_to_data = './data/project/validation/'
dataset_type = 'one_hour' # all_sorted, one_day, one_hour, one_week
val_dataset_type = "one_hour"
expanded_path = os.path.expanduser(path_to_data)
part_files = [os.path.join(expanded_path, f) for f in os.listdir(expanded_path) if dataset_type in f]
part_files = sorted(part_files, key = lambda x:x[-5:])

In [11]:
try:
    import pandas as pd
except:
    if hasattr(sys, 'real_prefix'):
        #we are in a virtual env.
        !pip3 install pandas
    else:
        !pip3 install --user pandas

In [18]:
import os
import re
import csv
import datetime

from model import reply_pred_model, retweet_pred_model, quote_pred_model, fav_pred_model 

all_features =  ["text_tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains",\
                        "tweet_type","language", "tweet_timestamp", "engaged_with_user_id", "engaged_with_user_follower_count",\
                       "engaged_with_user_following_count", "engaged_with_user_is_verified", "engaged_with_user_account_creation",\
                       "engaging_user_id", "enaging_user_follower_count", "enaging_user_following_count", "enaging_user_is_verified",\
                       "enaging_user_account_creation", "engagee_follows_engager", "reply", "retweet", "quote", "like"]
all_features_to_idx = dict(zip(all_features, range(len(all_features))))

def parse_input_line(line):
    features = line #.split("\x01")
    tweet_id = features[all_features_to_idx['tweet_id']]
    user_id = features[all_features_to_idx['engaging_user_id']]
    input_feats = features[all_features_to_idx['text_tokens']]
    tweet_timestamp = features[all_features_to_idx['tweet_timestamp']]
    
    return tweet_id, user_id, input_feats, tweet_timestamp



def evaluate_test_set():
    expanded_path = os.path.expanduser(path_to_data)
    part_files = [os.path.join(expanded_path, f) for f in os.listdir(expanded_path) if dataset_type in f]
    part_files = sorted(part_files, key = lambda x:x[-5:]) 
        
    with open('results.csv', 'w') as output:
        for file in part_files:
            with open(file, 'r') as f:
                linereader = csv.reader(f, delimiter='\x01')
                last_timestamp = None
                for row in linereader:
                    tweet_id, user_id, features, tweet_timestamp = parse_input_line(row)                                       
                    reply_pred = reply_pred_model(features) # reply_model
                    retweet_pred = retweet_pred_model(features) # retweet_model
                    quote_pred = quote_pred_model(features) # pred_model
                    fav_pred = fav_pred_model(features) # fav_model
                    
                    # print(str(tweet_timestamp))
                    # print(str(reply_pred)+" "+str(retweet_pred)+" "+str(quote_pred)+" "+str(fav_pred))
                    
                    output.write(f'{tweet_id},{user_id},{reply_pred},{retweet_pred},{quote_pred},{fav_pred}\n')


In [19]:
def load_data(filename):
        data = pd.read_csv(filename, sep='\x01', names=all_features, index_col=False)
        return data
    

In [20]:
# evaluate_test_set()

## Data Analysis & Baselines
#### Author: Markus Böck

In [21]:
import pandas as pd
import numpy as np
import time

import matplotlib.pyplot as plt

Collecting statistics:
- Number of rows
- Numer of unique users
- Number of unique tweets
- Percentage of engagements (reply, retweet, quote, like)
- Percentage of users of validation set appearing in training data

In [23]:
%%time

res = {}
for dataset_type in ["one_hour", "one_day", "one_week"]:
    expanded_path = os.path.expanduser(path_to_data)
    part_files = [os.path.join(expanded_path, f) for f in os.listdir(expanded_path) if dataset_type in f]
    part_files = sorted(part_files, key = lambda x:x[-5:])
    part_files
    print(part_files)

    tweet_counts = dict()
    user_counts = dict()
    
    nreply = 0
    nretweet = 0
    nquote = 0
    nlike = 0
    nengagement = 0

    nrows = 0
    with open(part_files[0], 'r') as f:
        linereader = csv.reader(f, delimiter='\x01')
        last_timestamp = None
        i = 0
        tik = time.time()
        for row in linereader:
            tweet_id = row[all_features_to_idx['tweet_id']]
            user_id = row[all_features_to_idx['engaging_user_id']]
            
            reply = row[all_features_to_idx['reply']] != ""
            retweet = row[all_features_to_idx['retweet']] != ""
            quote = row[all_features_to_idx['quote']] != ""
            like = row[all_features_to_idx['like']] != ""
                
            nreply += reply
            nretweet += retweet
            nquote += quote
            nlike += like
            nengagement += (reply or retweet or quote or like)
                        
            v = 0
            try:
                v = tweet_counts[tweet_id]
            except:
                pass
            tweet_counts[tweet_id] = v + 1

            v = 0
            try:
                v = user_counts[user_id]
            except:
                pass
            user_counts[user_id] = v + 1

            if i % 100000 == 0:
                tok = time.time()
                print(f"{i} {100000/(tok-tik): .2f} iter/s", end="\r")
                tik = time.time()

            i += 1

        nrows = i
        
        stats = {"nrows": nrows}
        
        stats["nreply"] = nreply
        stats["nretweet"] = nretweet
        stats["nquote"] = nquote
        stats["nlike"] = nlike
        stats["nengagement"] = nengagement
        
        stats["unique_tweets"] = len(tweet_counts)
        stats["unique_users"] = len(user_counts)
        
        percs = []
        for i in range(1,11):
            perc = sum([1 for (j,c) in tweet_counts.items() if c >= i])  / len(tweet_counts)
            percs.append(perc)
        stats["tweet_dist"] = percs
        
        percs = []
        for i in range(1,11):
            perc = sum([1 for (j,c) in user_counts.items() if c >= i])  / len(user_counts)
            percs.append(perc)
        stats["user_dist"] = percs
        
        stats["val_user_perc"] = len(set(val_data["engaging_user_id"].unique()).intersection(set(user_counts.keys()))) / val_data.shape[0] * 100
        stats["val_tweet_perc"] = len(set(val_data["tweet_id"].unique()).intersection(set(tweet_counts.keys()))) / val_data.shape[0] * 100
        
        res[dataset_type] = stats

['./data/project/training/one_hour.csv']


ZeroDivisionError: float division by zero

All data needs special treatment. Exploit the fact that this set is ordered according to tweet id.

In [None]:
%%time

dataset_type = "all_sorted"
expanded_path = os.path.expanduser(path_to_data)
part_files = [os.path.join(expanded_path, f) for f in os.listdir(expanded_path) if dataset_type in f]
part_files = sorted(part_files, key = lambda x:x[-5:])
part_files
print(part_files)

current_tweet_id = ""
tweet_count = 1
tweet_dist = [0]*10
n_unique_tweets = 0

user_counts = dict()

nreply = 0
nretweet = 0
nquote = 0
nlike = 0
nengagement = 0

nrows = 0
with open(part_files[0], 'r') as f:
    linereader = csv.reader(f, delimiter='\x01')
    last_timestamp = None
    i = 0
    tik = time.time()
    for row in linereader:
        tweet_id = row[all_features_to_idx['tweet_id']]
        user_id = row[all_features_to_idx['engaging_user_id']]

        #print(tweet_id)
        
        if current_tweet_id == tweet_id:
            tweet_count += 1
        else:
            #print(tweet_id, tweet_count)
            for j in range(1, 11):
                if tweet_count >= j:
                    tweet_dist[j-1] += 1
            tweet_count = 1
            current_tweet_id = tweet_id
            n_unique_tweets += 1
                

        reply = row[all_features_to_idx['reply']] != ""
        retweet = row[all_features_to_idx['retweet']] != ""
        quote = row[all_features_to_idx['quote']] != ""
        like = row[all_features_to_idx['like']] != ""

        nreply += reply
        nretweet += retweet
        nquote += quote
        nlike += like
        nengagement += (reply or retweet or quote or like)


        v = 0
        try:
            v = user_counts[user_id]
        except:
            pass
        user_counts[user_id] = v + 1

        if i % 100000 == 0:
            tok = time.time()
            print(f"{i} {100000/(tok-tik): .2f} iter/s", end="\r")
            tik = time.time()

        i += 1


    nrows = i

    stats = {"nrows": nrows}

    stats["nreply"] = nreply
    stats["nretweet"] = nretweet
    stats["nquote"] = nquote
    stats["nlike"] = nlike
    stats["nengagement"] = nengagement

    stats["unique_tweets"] = n_unique_tweets
    stats["unique_users"] = len(user_counts)

    stats["tweet_dist"] = [d/n_unique_tweets for d in tweet_dist]

    percs = []
    for i in range(1,11):
        perc = sum([1 for (j,c) in user_counts.items() if c >= i])  / len(user_counts)
        percs.append(perc)
    stats["user_dist"] = percs

    stats["val_user_perc"] = len(set(val_data["engaging_user_id"].unique()).intersection(set(user_counts.keys()))) / val_data.shape[0] * 100
    stats["val_tweet_perc"] = 0.

res[dataset_type] = stats

In [None]:
del tweet_counts
del user_counts

In [None]:
# Print statistics
for (k,stats) in res.items():
    print(k)
    print("\tnrows", stats["nrows"])
    
    print("\tnreply", stats["nreply"], "-", round(stats["nreply"]/stats["nrows"]*100,2), "%")
    print("\tnretweet", stats["nretweet"], "-", round(stats["nretweet"]/stats["nrows"]*100,2), "%")
    print("\tnquote", stats["nquote"], "-", round(stats["nquote"]/stats["nrows"]*100,2), "%")
    print("\tnlike", stats["nlike"], "-", round(stats["nlike"]/stats["nrows"]*100,2), "%")
    print("\tnengagement", stats["nengagement"], "-", round(stats["nengagement"]/stats["nrows"]*100,2), "%")
          
    print("\tunique_tweets", stats["unique_tweets"])
    print("\tunique_users", stats["unique_users"])
    print("\tval_user_perc", stats["val_user_perc"])
    try:
        print("\tval_tweet_perc", stats["val_tweet_perc"])
    except KeyError:
        pass

In [None]:
fig = plt.figure()
for (k,stats) in res.items():
    dist = stats["tweet_dist"]
    plt.plot(range(1,len(dist)+1), dist, label=k)
    
plt.legend()
plt.title("% of tweets appearing more than n times")
plt.xlabel("n")
#plt.yscale("log")
plt.savefig("tweets.pdf")

In [None]:
fig = plt.figure()
for (k,stats) in res.items():
    dist = stats["user_dist"]
    plt.plot(range(1,len(dist)+1), dist, label=k)
    
plt.legend()
plt.title("% of users appearing more than n times")
plt.xlabel("n")
#plt.yscale("log")
plt.savefig("users.pdf")

In [None]:
def columns_to_list(data, columns):
    
    for col in columns:
        data[col] = data[col].str.split('\t')
    return data


def columns_to_timestamps(data, columns):
    for col in columns:  
        data[col] = data[col].apply(lambda x: pd.Timestamp(x, unit='s'))
        
    return data
    
cols_to_list = ['text_tokens', 'hashtags', 'present_media', 'present_links', 'present_domains']
data = columns_to_list(data, cols_to_list)    

cols_to_timestamps = ['tweet_timestamp', 'enaging_user_account_creation', 'reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']
data = columns_to_timestamps(data, cols_to_timestamps)  

In [None]:
pd.set_option('display.max_columns', None)
print(data.shape)
display(data.head(50))

### Baselines

In [None]:
for (k,stats) in res.items():
    print(k)
    
    nvalrows = val_data.shape[0]
    reply_mean = stats["nreply"]/stats["nrows"]
    retweet_mean = stats["nretweet"]/stats["nrows"]
    quote_mean = stats["nquote"]/stats["nrows"]
    like_mean = stats["nlike"]/stats["nrows"]
    
    p = np.full(nvalrows, reply_mean)
    gt = val_data.reply
    print("reply:", "rce", compute_rce(p, gt), "avgprec", average_precision_score(gt, p))
    p = np.full(nvalrows, retweet_mean)
    gt = val_data.retweet
    print("retweet:", "rce", compute_rce(p, gt), "avgprec", average_precision_score(gt, p))
    p = np.full(nvalrows, quote_mean)
    gt = val_data.quote
    print("quote:", "rce", compute_rce(p, gt), "avgprec", average_precision_score(gt, p))
    p = np.full(nvalrows, like_mean)
    gt = val_data.like
    print("like:", "rce", compute_rce(p, gt), "avgprec", average_precision_score(gt, p))

In [None]:
#### Predicting no engagement

In [None]:
nvalrows = val_data.shape[0]
reply_pred = 0.
retweet_pred = 0.
quote_pred = 0.
like_pred = 0.

p = np.full(nvalrows, reply_pred)
gt = val_data.reply
print("reply:", "rce", compute_rce(p, gt), "avgprec", average_precision_score(gt, p))
p = np.full(nvalrows, retweet_pred)
gt = val_data.retweet
print("retweet:", "rce", compute_rce(p, gt), "avgprec", average_precision_score(gt, p))
p = np.full(nvalrows, quote_pred)
gt = val_data.quote
print("quote:", "rce", compute_rce(p, gt), "avgprec", average_precision_score(gt, p))
p = np.full(nvalrows, like_pred)
gt = val_data.like
print("like:", "rce", compute_rce(p, gt), "avgprec", average_precision_score(gt, p))

# Splitting dataset into train and test
 Splitting the training set - one hour into train and test data. The training dataset is used for model training and the test dataset for testing the trained model

In [None]:
from sklearn.model_selection import train_test_split

data = load_data(path_to_data + dataset_type)

# We choose first 5k rows in order to work faster with the data
data = data.head(5000)

train_data, test_data = train_test_split(data, test_size= 0.20, random_state=42)

In [None]:
train_data.head()

In [None]:
test_data.head()

# Evaluation

In [None]:

def true_timestamp(t):
    return int(not pd.isnull(t))

def labels(j):
    to_copy = test_data.copy()
    to_copy['labed'] = to_copy.apply(lambda row: true_timestamp(row[j]), axis=1)
    return to_copy[['tweet_id', 'engaging_user_id', 'labed']]

def read_predictions(file):
    filename = os.path.basename(file)
    #print(filename)     
    if (filename.startswith('gt')):
        to_sort = pd.read_csv(file, names=['tweet_id', 'engaging_user_id', 'labed'], header=0)
        sort = to_sort.sort_values(['tweet_id', 'engaging_user_id', 'labed'])
    elif (filename.startswith('pred')):
         to_sort = pd.read_csv(file, names=['tweet_id', 'engaging_user_id', 'prediction'], header=0)
         sort = to_sort.sort_values(['tweet_id', 'engaging_user_id', 'prediction'])
    return sort


#ground truth for retweet
gt_retweet = labels('retweet_timestamp')
gt_retweet.to_csv('gt_retweet.csv')
print(read_predictions('gt_retweet.csv')[:10])

#ground truth for reply
gt_reply = labels('reply_timestamp')
gt_reply.to_csv('gt_reply.csv')
print(read_predictions('gt_reply.csv')[:10])

#ground truth for like
gt_like = labels('like_timestamp')
gt_like.to_csv('gt_like.csv')
print(read_predictions('gt_like.csv')[:10])

#ground truth for retweet with comment
gt_rc = labels('retweet_with_comment_timestamp')
gt_rc.to_csv('gt_rc.csv')
print(read_predictions('gt_rc.csv')[:10])



# Create a Ratings Matrix
### One ratings matrix for each engagement type 

In [None]:
#creating a data frame for the unique tweets and a unique one for the engagement between users
uTID = data['tweet_id'].unique()
uTID.sort()

uUID = data['engaging_user_id'].append(data['engaged_with_user_id']).unique()
uUID.sort()

m = len(uUID)
n = len(uTID)

#creating internal ids for the users and the tweets
userId_to_userIDX = dict(zip(uUID, range(m)))
userIDX_to_userId = dict(zip(range(m), uUID))

tweetId_to_tweetIDX = dict(zip(uTID, range(n)))
tweetIDX_to_tweetId = dict(zip(range(n), uTID))

In [None]:
#creating a dataframe for the upcoming implementation of the ratings matrix 
j = ['tweet_id', 'engaging_user_id', 'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp']

ratings = pd.concat([data['engaging_user_id'].map(userId_to_userIDX),
                  data['tweet_id'].map(tweetId_to_tweetIDX),
                  data['reply_timestamp'].notnull(),
                  data['retweet_timestamp'].notnull(),
                  data['retweet_with_comment_timestamp'].notnull(),
                  data['like_timestamp'].notnull()], axis = 1)

ratings.columns = ['user', 'tweet', 'reply', 'retweet', 'retweet_with_comment', 'like']
ratings.sort_values(['user', 'tweet'], inplace = True)

ratings.head(n = 20)

In [None]:
from scipy import sparse as sp

#creating the ratings matrices

RM_reply = sp.csr_matrix((ratings.reply[ratings.reply], (ratings.user[ratings.reply], ratings.tweet[ratings.reply])), 
            shape=(m, n))

RM_retweet = sp.csr_matrix((ratings.retweet[ratings.retweet], (ratings.user[ratings.retweet], ratings.tweet[ratings.retweet])), 
            shape=(m, n))

RM_retweet_wc = sp.csr_matrix((ratings.retweet_with_comment[ratings.retweet_with_comment], (ratings.user[ratings.retweet_with_comment]             , ratings.tweet[ratings.retweet_with_comment])), shape=(m, n))

RM_like = sp.csr_matrix((ratings.like[ratings.like], (ratings.user[ratings.like], ratings.tweet[ratings.like])), 
            shape=(m, n))

display(RM_reply.shape, RM_reply.count_nonzero())
display(RM_retweet.shape, RM_retweet.count_nonzero())
display(RM_retweet_wc.shape, RM_retweet_wc.count_nonzero())
display(RM_like.shape, RM_like.count_nonzero())

# User-User Similarity

In [None]:
from scipy.sparse.linalg import norm

def compute_pairwise_user_similarity(u_id, v_id, RM_type):
    u = RM_type[u_id,:].copy()
    v = RM_type[v_id,:].copy()
    
    #cosine similarity formula from the slides based on the vector operations defined above
    numerator = u.dot(v.T).A.item()
    denominator = norm(u)*norm(v)
    
    if denominator == 0:
        similarity = 0.;
    else: 
        similarity = numerator/denominator
    
    return similarity

In [None]:
#testing the function above
display(compute_pairwise_user_similarity(15, 5256, RM_reply))
display(compute_pairwise_user_similarity(5256, 1642, RM_retweet))
display(compute_pairwise_user_similarity(1642, 5422, RM_retweet_wc))
display(compute_pairwise_user_similarity(5422, 15, RM_like))

# User to all Users Similarity

In [None]:
import numpy as np

def compute_user_similarities(u_id, RM_type):
    uU = np.empty((m,))

    #computing similarities of user u_id with all of the other users
    for v_id in range(m):
        uU[v_id] = compute_pairwise_user_similarity(u_id, v_id, RM_type)
    
    return uU

In [None]:
# Test 
uU = compute_user_similarities(15, RM_reply)
display(uU[1])

uU = compute_user_similarities(5256, RM_retweet)
display(uU[50])

uU = compute_user_similarities(1642, RM_retweet_wc)
display(uU[10])

uU = compute_user_similarities(5422, RM_like)
display(uU[10])

#  User Neighbourhood

In [None]:
#transforming from sparse matrix to dictionary of keys for easier handling
RM_reply_dok = RM_reply.todok()
RM_retweet_dok = RM_retweet.todok()
RM_retweet_wc_dok = RM_retweet_wc.todok()
RM_like_dok = RM_like.todok()

k = 10

def create_user_neighborhood(u_id, i_id, RM_type, RM_type_dok):
    nh = {} ## the neighborhood dict with (user id: similarity) entries
    ## nh should not contain u_id and only include users that have rated i_id; there should be at most k neighbors
    uU = compute_user_similarities(u_id, RM_type)
    uU_copy = uU.copy() ## so that we can modify it, but also keep the original
    
    sorted_values = np.argsort(uU_copy)[::-1]    
        
   #counter for k neighbours
    ik = 0 
    for i in sorted_values:
        # checking if i gave a rating to item i_id and making sure i is different from itself
        if (i, i_id) in RM_type_dok and i!=u_id:
            nh[i] = uU_copy[i]
            ik+=1
        if ik == k:
            break

    return nh

In [None]:
# Test neighborhood

nh = create_user_neighborhood(15, 595, RM_reply, RM_reply_dok)
display(nh)

nh = create_user_neighborhood(5256, 437, RM_retweet, RM_retweet_dok)
display(nh)

nh = create_user_neighborhood(1642, 27, RM_retweet_wc, RM_retweet_wc_dok)
display(nh)

nh = create_user_neighborhood(5422, 609, RM_like, RM_like_dok)
display(nh)

# Predict Ratings

In [None]:
def predict_internal_ids(u_id, i_id, RM_type, RM_type_dok):

    if (u_id, i_id) in RM_type_dok:
        print("user", u_id, "has engaged with item", i_id, "with", RM_type[u_id, i_id])
    else:
        print("user", u_id, "has not engaged with item", i_id)
        print("k:", k)


    nh = create_user_neighborhood(u_id, i_id, RM_type, RM_type_dok)

    neighborhood_weighted_avg = 0.
    numerator = 0.
    denominator = 0.

    for v in nh.items():
        numerator += nh[v] * RM_type[v,i_id]

        denominator += np.absolute(nh[v])


    if denominator == 0:
        neighborhood_weighted_avg = 0.;
    else:
        neighborhood_weighted_avg = numerator/denominator


    prediction = neighborhood_weighted_avg

    return prediction

In [None]:
#test
predict_internal_ids(15, 595, RM_reply, RM_reply_dok)

In [None]:
def predict_external_ids(tweet_id, engaging_user_id, RM_type, RM_type_dok):
    print("user", engaging_user_id, "has internal id ", userId_to_userIDX[engaging_user_id])
    print("tweet", tweet_id, "has internal id ", tweetId_to_tweetIDX[tweet_id])
    return predict_internal_ids(userId_to_userIDX[engaging_user_id],tweetId_to_tweetIDX[tweet_id], RM_type, RM_type_dok)

In [None]:
#testing different external ids 

print("Reply")
predict_external_ids("DCEF6C06DDE77C2DBE7F0BE99B95120A", "2284A3F835F7156B2F432B82D8963D27", RM_reply, RM_reply_dok)

print("")
print("Retweet")
predict_external_ids("A3B8BEF795136AAA9E25B5173E80A73D", "EBBE15EB3C30A275BF87E7B9A676D12F", RM_retweet, RM_retweet_dok)

print("")
print("Retweet with Comment")
predict_external_ids("089FE87D98654DA3323FE87552B86965", "48918F9BDF36C80185112EF228F1429F", RM_retweet_wc, RM_retweet_wc_dok)

print("")
print("Like")
predict_external_ids("DE1604F4816F6B8BD85A9478AE9D32E9", "F343F23E25FF1D7041E31E0CF4D026AD", RM_like, RM_like_dok)


## Item-Item Collaborative Filtering
#### Author: Markus Böck

In [None]:
from model import *

In [None]:
%%time
iicf = IICF(path_to_data, "one_day")

In [None]:
import os
import re
import csv
import datetime


def evaluate_test_set(path_to_data, dataset_type):
    expanded_path = os.path.expanduser(path_to_data)
    part_files = [os.path.join(expanded_path, f) for f in os.listdir(expanded_path) if dataset_type in f]
    part_files = sorted(part_files, key = lambda x:x[-5:]) 
        
    i = 0
    with open('results.csv', 'w') as output:
        for file in part_files:
            with open(file, 'r') as f:
                linereader = csv.reader(f, delimiter='\x01')
                last_timestamp = None
                for row in linereader:
                    i += 1
                    tweet_id, user_id, features, follow, tweet_timestamp = iicf.parse_input_features(row) 

                    reply_pred, retweet_pred, quote_pred, fav_pred = iicf.predict(tweet_id, user_id, features, follow)
                    
                    # print(str(tweet_timestamp))
                    # print(str(reply_pred)+" "+str(retweet_pred)+" "+str(quote_pred)+" "+str(fav_pred))
                    
                    output.write(f'{tweet_id},{user_id},{reply_pred},{retweet_pred},{quote_pred},{fav_pred}\n')
                    
                    if i % 1000 == 0:
                        print(f"Predicted {i} rows.", end="\r")

    print(f"Predicted {i} rows.")

In [None]:
%%time
evaluate_test_set(val_path_to_data, val_dataset_type)

In [None]:
from sklearn.metrics import average_precision_score, log_loss

def calculate_ctr(gt):
    positive = len([x for x in gt if x == 1])

    ctr = positive/float(len(gt))

    return ctr

def compute_rce(pred, gt):

    cross_entropy = log_loss(gt, pred)

    data_ctr = calculate_ctr(gt)

    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])

    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [None]:
val_expanded_path = os.path.expanduser(val_path_to_data)
val_part_files = [os.path.join(val_expanded_path, f) for f in os.listdir(val_expanded_path) if val_dataset_type in f]
val_part_files = sorted(val_part_files, key = lambda x:x[-5:])
val_part_files
val_data = pd.read_csv(val_part_files[0], delimiter='\x01', header=None, usecols=[2, 14, 20,21,22,23])
val_data.columns = ["tweet_id", "engaging_user_id", 'reply', 'retweet', 'quote', 'like']

val_data.reply = (~val_data.reply.isna()).astype("int")
val_data.retweet = (~val_data.retweet.isna()).astype("int")
val_data.quote = (~val_data.quote.isna()).astype("int")
val_data.like = (~val_data.like.isna()).astype("int")

val_data

In [None]:
results = pd.read_csv("results.csv", header=None)
results.columns = ["tweet_id", "user_id", "reply", "retweet", "quote", "like"]
results

In [None]:
print("Reply scores:")
compute_rce(results.reply, val_data.reply), average_precision_score(val_data.reply, results.reply)

In [None]:
print("Retweet scores:")
compute_rce(results.retweet, val_data.retweet), average_precision_score(val_data.retweet, results.retweet)

In [None]:
print("Quote scores:")
compute_rce(results.quote, val_data.quote), average_precision_score(val_data.quote, results.quote)

In [None]:
print("Like scores:")
compute_rce(results.like, val_data.like), average_precision_score(val_data.like, results.like)

In [None]:
del iicf # free up memory

## Fairness


In [None]:
from sklearn.metrics import average_precision_score, log_loss

def calculate_ctr(gt):
    positive = len([x for x in gt if x == 1])

    ctr = positive/float(len(gt))

    return ctr

def compute_rce(pred, gt):

    cross_entropy = log_loss(gt, pred)

    data_ctr = calculate_ctr(gt)

    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])

    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0


In [None]:
def read_predictions_fairness(path):
    pred = pd.read_csv(path, header=None, names=['tweet_id', 'user_id', 'reply', 'retweet', 'quote', 'like'])
    return pred


def read_predictions(path, col):
    pred_col = {"reply": 2, "retweet": 3, "quote": 4, "like": 5}[col]
    pred = pd.read_csv(path, header=None, usecols=[0, 1, pred_col], names=['tweet_id', 'user_id', 'reply', 'retweet', 'quote', 'like'])
    return pred



In [None]:
def parse_line(row):
    tweet_id = row[all_features_to_idx['tweet_id']]
    user_id = row[all_features_to_idx['engaging_user_id']]
    
#     input_feats = np.zeros((tweet_features.shape[1],),dtype=np.float32)
    
    follower_count= int(row[all_features_to_idx["engaged_with_user_follower_count"]])
    following_count = int(row[all_features_to_idx["engaged_with_user_following_count"]])
    verified = bool(row[all_features_to_idx["engaged_with_user_is_verified"]])
    
    return tweet_id, user_id, follower_count, following_count, verified

In [None]:
def tweets_data(path, dataset_type):
    expanded_path = os.path.expanduser(path)
    part_files = [os.path.join(expanded_path, f) for f in os.listdir(expanded_path) if dataset_type in f]
    part_files = sorted(part_files, key = lambda x:x[-5:]) 
    
    tweet_groups = pd.DataFrame(columns=['tweet_id', 'user_id', 'follower_count', 'following_count', 'verified'])
    
    for file in part_files:
        with open(file, 'r') as f:
            tweet_ids = get_tweet_ids(file)
            user_ids = get_user_ids(file)
            linereader = csv.reader(f, delimiter='\x01')
            last_timestamp = None
            for i, row in enumerate(linereader):
                tweet_id, user_id, follower_count, following_count, verified = parse_line(row) 
                tweet_id_int = tweet_ids[tweet_id]
                user_id_int = user_ids[user_id]
                dic = {'tweet_id':tweet_id_int, 'user_id':user_id_int,\
                       'follower_count':follower_count, 'following_count':following_count, 'verified':verified}
                tweet_groups = tweet_groups.append(dic, ignore_index=True)
    return tweet_ids, user_ids, tweet_groups


tweet_ids, user_ids, tweet_groups = tweets_data(val_path_to_data, 'one_hour')

In [None]:
def group_by_followers(df):
    data = df.copy()
    data = data.sort_values(by='follower_count', ascending=False)
    data['group'] = np.zeros((len(data)), dtype=np.int32)
    
    for i in range(0,round(len(data)/5)):
        data.loc[i, 'group'] = 0
        
    for i in range(round(len(data)/5), 2*round(len(data)/5)):
        data.loc[i, 'group'] = 1
        
    for i in range(2*round(len(data)/5), 3*round(len(data)/5)):
        data.loc[i, 'group'] = 2
        
    for i in range(3*round(len(data)/5), 4*round(len(data)/5)):
        data.loc[i, 'group'] = 3
        
    for i in range(4*round(len(data)/5), len(data)):
        data.loc[i, 'group'] = 4
    
    return data

groups = group_by_followers(tweet_groups)



In [None]:
from sklearn.metrics import average_precision_score, log_loss


ground_truth = read_predictions("gt.csv", 'reply') # will return data in the form (tweet_id, user_id, labed (1 or 0))
predictions = read_predictions("results.csv", 'reply') # will return data in the form (tweet_id, user_id, prediction)

predictions['tweet_id'] = predictions['tweet_id'].map(tweet_ids)
predictions['user_id'] = predictions['user_id'].map(user_ids)
ground_truth['tweet_id'] = ground_truth['tweet_id'].map(tweet_ids)
ground_truth['user_id'] = ground_truth['user_id'].map(user_ids)

predictions = pd.merge(predictions, groups[['user_id', 'group']], how='left', on = 'user_id')
ground_truth = pd.merge(ground_truth, groups[['user_id', 'group']], how='left', on = 'user_id')


In [None]:
# splitting for groups (this is assuming ground_truth is a csv again)
# also assuming that predictions has class membership. We should sort this out in the least painful way
# predictions has schema (tweet_id, user_id, , group, prediction)

col = 'reply'
pred_col = {"reply": 2, "retweet": 3, "quote": 4, "like": 5}[col]

rce = {}
average_precision = {}
accuracy = {}
for i in range(5):
    group_predictions = [p[pred_col] for p in predictions.values if p[-1] == i]
    group_ground_truth = [p[pred_col] for p in ground_truth.values if p[-1] == i]
    rce[i] = compute_rce(group_predictions, group_ground_truth)
#     average_precision[i] = average_precision_score(np.array(group_predictions), np.array(group_ground_truth))
#     accuracy[i] = np.mean(group_predictions == group_ground_truth)


print('The rces for the groups of popularity:')
for i in range(5):
    print("RCE for group {0}:".format(i), rce[i])
#     print("average_precision:", average_precision[i])
#     print("accuracy:", accuracy[i])

#### zeroR
rce: -621.8494688006682  

#### item item
rce: -233.43933830426766

### Group by user verification

In [None]:

groups = tweet_groups[['tweet_id', 'user_id', 'verified']]


In [None]:
ground_truth = read_predictions("gt.csv", 'reply') # will return data in the form (tweet_id, user_id, labed (1 or 0))
predictions = read_predictions("results.csv", 'reply') # will return data in the form (tweet_id, user_id, prediction)

predictions['tweet_id'] = predictions['tweet_id'].map(tweet_ids)
predictions['user_id'] = predictions['user_id'].map(user_ids)
ground_truth['tweet_id'] = ground_truth['tweet_id'].map(tweet_ids)
ground_truth['user_id'] = ground_truth['user_id'].map(user_ids)

predictions = pd.merge(predictions, groups[['user_id', 'verified']], how='left', on = 'user_id')
ground_truth = pd.merge(ground_truth, groups[['user_id', 'verified']], how='left', on = 'user_id')



In [None]:
# splitting for groups (this is assuming ground_truth is a csv again)
# also assuming that predictions has class membership. We should sort this out in the least painful way
# predictions has schema (tweet_id, user_id, , group, prediction)

col = 'reply'
pred_col = {"reply": 2, "retweet": 3, "quote": 4, "like": 5}[col]

group_predictions_true = [p[pred_col] for p in predictions.values if p[-1] == True]
group_ground_truth_true = [p[pred_col] for p in ground_truth.values if p[-1] == True]
rce_true = compute_rce(group_predictions, group_ground_truth)


group_predictions_false = [p[pred_col] for p in predictions.values if p[-1] == False]
group_ground_truth_false = [p[pred_col] for p in ground_truth.values if p[-1] == False]
rce_false = compute_rce(group_predictions, group_ground_truth)
#     average_precision[i] = average_precision_score(np.array(group_predictions), np.array(group_ground_truth))
#     accuracy[i] = np.mean(group_predictions == group_ground_truth)


print('The rces for the groups of users with verified accounts vs not verified ones:')
print("RCE for verified users:", rce_true)
print("RCE for non-verified users:", rce_false)
#     print("average_precision:", average_precision[i])
#     print("accuracy:", accuracy[i])

In [None]:
# hidden


In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit