
# **Create Interactions Data**

In [1]:
import dask.dataframe as dd
import pandas as pd


In [12]:
df_attendees = dd.read_csv("/home/nkama/masters_thesis_project/thesis/data/event_rec_engine_challenge/event_attendees.csv",
                       dtype={
                           'event': 'object'
                       })



## Get User-Event Records


In [13]:
df_attendees.head()

Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
3,2621578336,,,,
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232


In [14]:
df_attendees.isnull().sum().compute()

event         0
yes        1984
maybe      3167
invited    1822
no         6659
dtype: int64

In [15]:

col = ['yes',	'maybe', 'invited',	'no']
df_attendee_new = df_attendees.dropna(subset=col, how='all')

In [16]:
df_attendee_new.head()

Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232
6,488116622,4145960786 2550625355 2577667841 1575121941 28...,1227223575 2789471603 1323321680 3086272918 38...,1413359297 2300232602 1412759254 617751520 286...,1498160155 3708150269 823488244 3595018395 173...


In [17]:
df = df_attendee_new[(df_attendee_new['yes'].isna() & df_attendee_new['maybe'].isna() & df_attendee_new['no'].isna())]

len(df)

272

In [18]:
# Fill remaining NaN values with 0
df_attendee_new = df_attendee_new.fillna('0')

In [19]:
#df_attendee_new.compute().to_csv("attendee_new.csv",index=False)


## Count total reactions on each event

We check if the input is '0' and return 0 if it is.

We convert the input to a string before splitting, to handle potential numeric inputs.

We filter out '0' values when counting users.

We renamed 'total_reactions' to 'total_users' for clarity.

These modifications ensure that:

A single '0' in a column is counted as 0 users.

'0' values mixed with other user IDs are not counted.


In [20]:
import dask.dataframe as dd
import pandas as pd
import numpy as np

# Function to count users in a space-separated string
def count_users(x):
    if pd.isna(x) or x == '0':
        return 0
    users = str(x).split()
    return len([u for u in users if u != '0'])

# Create new columns for user counts directly in the original dataframe
for column in ['yes', 'maybe', 'invited', 'no']:
    df_attendee_new[f'{column}_count'] = df_attendee_new[column].apply(count_users, meta=('x', 'int'))

# Calculate total attendees (sum of all categories)
df_attendee_new['total_users'] = df_attendee_new['yes_count'] + \
                              df_attendee_new['maybe_count'] + \
                              df_attendee_new['invited_count'] + \
                              df_attendee_new['no_count']

# Select only the event and count columns
attendance_counts = df_attendee_new[['event', 'yes_count', 'maybe_count', 'invited_count', 'no_count', 'total_users']]

attendance_counts = attendance_counts.compute()


In [21]:
attendance_counts.head()

Unnamed: 0,event,yes_count,maybe_count,invited_count,no_count,total_users
0,1159822043,7,7,70,2,86
1,686467261,11,8,75,0,94
2,1186208412,0,2,2,2,6
4,855842686,6,6,10,1,23
6,488116622,45,38,166,19,268


In [22]:
len(attendance_counts.event) #print number of unique events
#save to csv
#attendance_counts.to_csv("attendance_counts.csv", index=False)

22710

In [31]:
def extract_user_event_pairs(df_attendees):
    def process_row(row):
        event_id = row.event  
        pairs = []
        for status in ['yes', 'maybe', 'invited', 'no']:
            if pd.notna(getattr(row, status)) and getattr(row, status) != '0':
                users = str(getattr(row, status)).split()
                for user_id in users:
                    if user_id != '0':
                        # Check if the user is invited and has responded
                        if status == 'invited' and (row.yes != '0' or row.maybe != '0' or row.no != '0'):
                            attendance_status = 'invited & yes' if row.yes != '0' else ('invited & maybe' if row.maybe != '0' else 'invited & no')
                        else:
                            attendance_status = status
                        pairs.append({'event_id': event_id, 'user_id': user_id, 'interaction_type': attendance_status})
        return pairs

    # Apply the function to each partition
    pairs_ddf = df_attendees.map_partitions(
        lambda df: pd.DataFrame(
            [pair for row in df.itertuples() for pair in process_row(row)]
        )
    )

    return pairs_ddf

In [32]:

# Extracting user-event pairs
user_event_pairs = extract_user_event_pairs(df_attendee_new)


In [33]:
user_event_pairs = user_event_pairs.compute()

In [34]:
# Convert types to ensure consistency for joining or merging
user_event_pairs['user_id'] = user_event_pairs['user_id'].astype(str)
user_event_pairs['event_id'] = user_event_pairs['event_id'].astype(str)


In [43]:
user_event_pairs.head()

Unnamed: 0,event_id,user_id,interaction_type
0,1159822043,1975964455,yes
1,1159822043,252302513,yes
2,1159822043,4226086795,yes
3,1159822043,3805886383,yes
4,1159822043,1420484491,yes


In [42]:
user_event_pairs[user_event_pairs["interaction_type"]=="invited & yes"].head()

Unnamed: 0,event_id,user_id,interaction_type
14,1159822043,1723091036,invited & yes
15,1159822043,3795873583,invited & yes
16,1159822043,4109144917,invited & yes
17,1159822043,3560622906,invited & yes
18,1159822043,3106484834,invited & yes


In [38]:
len(user_event_pairs[user_event_pairs["interaction_type"]=="invited & yes"])

9398000

In [39]:
len(user_event_pairs[user_event_pairs["interaction_type"]=="invited & no"])

5587

In [40]:
len(user_event_pairs[user_event_pairs["interaction_type"]=="invited & maybe"])

9402

In [44]:
len(user_event_pairs)

11245010

In [55]:
user_event_pairs['interaction_label'] = user_event_pairs['interaction_type'].apply(
    lambda x: 1 if x in ['yes', 'maybe','invited & yes','invited & maybe'] 
    else (0 if pd.notna(x) else 0)
)


In [50]:
user_event_pairs.head()

Unnamed: 0,event_id,user_id,interaction_type,interaction_label
0,1159822043,1975964455,yes,1
1,1159822043,252302513,yes,1
2,1159822043,4226086795,yes,1
3,1159822043,3805886383,yes,1
4,1159822043,1420484491,yes,1


In [56]:
user_event_pairs[user_event_pairs["interaction_type"]=='invited & no']

Unnamed: 0,event_id,user_id,interaction_type,interaction_label
113411,2730531110,587226496,invited & no,0
113412,2730531110,2938205322,invited & no,0
113413,2730531110,1772527144,invited & no,0
131855,2706957558,905238652,invited & no,0
131856,2706957558,1793154035,invited & no,0
...,...,...,...,...
11167828,1807765078,277039486,invited & no,0
11167829,1807765078,2972745086,invited & no,0
11167830,1807765078,766348384,invited & no,0
11167831,1807765078,134853927,invited & no,0


In [57]:
user_event_pairs.isnull().sum()

event_id             0
user_id              0
interaction_type     0
interaction_label    0
dtype: int64

In [58]:
len(user_event_pairs)

11245010

In [59]:
len(user_event_pairs.event_id.unique()), len(user_event_pairs.user_id.unique())

(22710, 3702045)

In [None]:
user_event_pairs.interaction_type.value_counts()


interaction_type
invited & yes      9398000
yes                 831137
maybe               520724
no                  474438
invited & maybe       9402
invited               5722
invited & no          5587
Name: count, dtype: int64

In [61]:
#save as interactions data
user_event_pairs.to_csv("interactions_data.csv")


In [1]:
import pandas as pd
pd.read_csv("interactions_data.csv").head()   

Unnamed: 0.1,Unnamed: 0,event_id,user_id,interaction_type,interaction_label
0,0,1159822043,1975964455,yes,1
1,1,1159822043,252302513,yes,1
2,2,1159822043,4226086795,yes,1
3,3,1159822043,3805886383,yes,1
4,4,1159822043,1420484491,yes,1
