
# **Create Interactions Data**

In [1]:
import dask.dataframe as dd
import pandas as pd


In [6]:
df_attendees = dd.read_csv("/home/nkama/masters_thesis_project/thesis/data/event_rec_engine_challenge/event_attendees.csv",
                       dtype={
                           'event': 'object'
                       })



## Get User-Event Records


In [7]:
df_attendees.head()

Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
3,2621578336,,,,
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232


In [8]:
df_attendees.isnull().sum().compute()

event         0
yes        1984
maybe      3167
invited    1822
no         6659
dtype: int64

In [9]:

col = ['yes',	'maybe', 'invited',	'no']
df_attendee_new = df_attendees.dropna(subset=col, how='all')

In [10]:
df_attendee_new.head()

Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232
6,488116622,4145960786 2550625355 2577667841 1575121941 28...,1227223575 2789471603 1323321680 3086272918 38...,1413359297 2300232602 1412759254 617751520 286...,1498160155 3708150269 823488244 3595018395 173...


In [36]:
df = df_attendee_new[(df_attendee_new['yes'].isna() & df_attendee_new['maybe'].isna() & df_attendee_new['no'].isna())]

len(df)

0

In [11]:
# Fill remaining NaN values with 0
df_attendee_new = df_attendee_new.fillna('0')

In [10]:
df_attendee_new.compute().to_csv("attendee_new.csv",index=False)


## Count total reactions on each event

We check if the input is '0' and return 0 if it is.

We convert the input to a string before splitting, to handle potential numeric inputs.

We filter out '0' values when counting users.

We renamed 'total_reactions' to 'total_users' for clarity.

These modifications ensure that:

A single '0' in a column is counted as 0 users.

'0' values mixed with other user IDs are not counted.


In [12]:
import dask.dataframe as dd
import pandas as pd
import numpy as np

# Function to count users in a space-separated string
def count_users(x):
    if pd.isna(x) or x == '0':
        return 0
    users = str(x).split()
    return len([u for u in users if u != '0'])

# Create new columns for user counts directly in the original dataframe
for column in ['yes', 'maybe', 'invited', 'no']:
    df_attendee_new[f'{column}_count'] = df_attendee_new[column].apply(count_users, meta=('x', 'int'))

# Calculate total attendees (sum of all categories)
df_attendee_new['total_users'] = df_attendee_new['yes_count'] + \
                              df_attendee_new['maybe_count'] + \
                              df_attendee_new['invited_count'] + \
                              df_attendee_new['no_count']

# Select only the event and count columns
attendance_counts = df_attendee_new[['event', 'yes_count', 'maybe_count', 'invited_count', 'no_count', 'total_users']]

attendance_counts = attendance_counts.compute()


In [13]:
attendance_counts.head()

Unnamed: 0,event,yes_count,maybe_count,invited_count,no_count,total_users
0,1159822043,7,7,70,2,86
1,686467261,11,8,75,0,94
2,1186208412,0,2,2,2,6
4,855842686,6,6,10,1,23
6,488116622,45,38,166,19,268


In [14]:
len(attendance_counts.event) #print number of unique events
#save to csv
#attendance_counts.to_csv("attendance_counts.csv", index=False)

22710

In [15]:
def extract_user_event_pairs(df_attendees):
    def process_row(row):
        event_id = row.event  # Changed from row['event']
        pairs = []
        for status in ['yes', 'maybe', 'invited', 'no']:
            if pd.notna(getattr(row, status)) and getattr(row, status) != '0':
                users = str(getattr(row, status)).split()
                for user_id in users:
                    if user_id != '0':
                        # Check if the user is invited and has responded
                        if status == 'invited' and (row.yes != '0' or row.maybe != '0' or row.no != '0'):
                            attendance_status = 'yes' if row.yes != '0' else ('maybe' if row.maybe != '0' else 'no')
                        else:
                            attendance_status = status
                        pairs.append({'event_id': event_id, 'user_id': user_id, 'attendance_status': attendance_status})
        return pairs

    # Apply the function to each partition
    pairs_ddf = df_attendees.map_partitions(
        lambda df: pd.DataFrame(
            [pair for row in df.itertuples() for pair in process_row(row)]
        )
    )

    return pairs_ddf

In [9]:
import dask.dataframe as dd
import pandas as pd

def extract_user_event_pairs(df_attendees):
    def process_row(row):
        event_id = row.event  # Changed from row['event']
        pairs = []
        for status in ['yes', 'maybe', 'invited', 'no']:
            if pd.notna(getattr(row, status)) and getattr(row, status) != '0':
                users = str(getattr(row, status)).split()
                pairs.extend([
                    {'event_id': event_id, 'user_id': user_id, 'attendance_status': status}
                    for user_id in users if user_id != '0'
                ])
        return pairs

    # Apply the function to each partition
    pairs_ddf = df_attendees.map_partitions(
        lambda df: pd.DataFrame(
            [pair for row in df.itertuples() for pair in process_row(row)]
        )
    )

    return pairs_ddf


In [16]:

# Extract user-event pairs
user_event_pairs = extract_user_event_pairs(df_attendee_new)


In [17]:
user_event_pairs = user_event_pairs.compute()

In [18]:
# Convert types to ensure consistency for joins
user_event_pairs['user_id'] = user_event_pairs['user_id'].astype(str)
user_event_pairs['event_id'] = user_event_pairs['event_id'].astype(str)


In [19]:
user_event_pairs.head()

Unnamed: 0,event_id,user_id,attendance_status
0,1159822043,1975964455,yes
1,1159822043,252302513,yes
2,1159822043,4226086795,yes
3,1159822043,3805886383,yes
4,1159822043,1420484491,yes


In [21]:
len(user_event_pairs[user_event_pairs["attendance_status"]=="invited"])

5722

In [2]:
user_event_pairs = pd.read_csv("/home/nkama/masters_thesis_project/thesis/interactions.csv")
#df_attendee_new = pd.read_csv("/home/nkama/masters_thesis_project/thesis/attendee_new.csv")
user_event_pairs.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status
0,0,1159822043,1975964455,yes
1,1,1159822043,252302513,yes
2,2,1159822043,4226086795,yes
3,3,1159822043,3805886383,yes
4,4,1159822043,1420484491,yes


In [3]:
len(user_event_pairs)

11245010

In [4]:
len(user_event_pairs)

11245010

In [4]:
user_event_pairs['interaction'] = user_event_pairs['attendance_status'].apply(
    lambda x: 1 if x in ['yes', 'maybe'] else (0 if pd.notna(x) else 0)
)

In [5]:
user_event_pairs.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status,interaction
0,0,1159822043,1975964455,yes,1
1,1,1159822043,252302513,yes,1
2,2,1159822043,4226086795,yes,1
3,3,1159822043,3805886383,yes,1
4,4,1159822043,1420484491,yes,1


In [7]:
user_event_pairs[user_event_pairs["interaction"]==0]

Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status,interaction
14,14,1159822043,1723091036,invited,0
15,15,1159822043,3795873583,invited,0
16,16,1159822043,4109144917,invited,0
17,17,1159822043,3560622906,invited,0
18,18,1159822043,3106484834,invited,0
...,...,...,...,...,...
11244993,11244993,2252745757,3789665120,invited,0
11245006,11245006,1187086028,3964194766,invited,0
11245007,11245007,1187086028,1499983746,invited,0
11245008,11245008,1187086028,2432309724,invited,0


In [5]:
user_event_pairs.isnull().sum()

Unnamed: 0           0
event_id             0
user_id              0
attendance_status    0
interaction          0
dtype: int64

In [8]:
len(user_event_pairs)

11245010

In [16]:
# ... existing code ...
# Create attendance_value column based on attendance_status
user_event_pairs['attendance_value'] = user_event_pairs['attendance_status'].apply(lambda x: 1 if x in ['yes', 'maybe'] else 0)
# ... existing code ...

ValueError: Metadata inference failed in `apply`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
TypeError('boolean value of NA is ambiguous')

Traceback:
---------
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/dask/dataframe/utils.py", line 133, in raise_on_meta_error
    yield
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/dask/dataframe/dask_expr/_expr.py", line 4057, in emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/dask/utils.py", line 1226, in __call__
    return getattr(__obj, self.method)(*args, **kwargs)
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/pandas/core/series.py", line 4924, in apply
    ).apply()
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/pandas/core/apply.py", line 1427, in apply
    return self.apply_standard()
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/pandas/core/apply.py", line 1507, in apply_standard
    mapped = obj._map_values(
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/pandas/core/base.py", line 919, in _map_values
    return arr.map(mapper, na_action=na_action)
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/pandas/core/arrays/arrow/array.py", line 1421, in map
    return super().map(mapper, na_action)
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/pandas/core/arrays/base.py", line 2322, in map
    return map_array(self, mapper, na_action=na_action)
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/pandas/core/algorithms.py", line 1743, in map_array
    return lib.map_infer(values, mapper, convert=convert)
  File "lib.pyx", line 2972, in pandas._libs.lib.map_infer
  File "/tmp/ipykernel_257288/1116794417.py", line 3, in <lambda>
    user_event_pairs['attendance_value'] = user_event_pairs['attendance_status'].apply(lambda x: 1 if x in ['yes', 'maybe'] else 0)
  File "missing.pyx", line 392, in pandas._libs.missing.NAType.__bool__


In [13]:
# ... existing code ...
# Create attendance_value column based on yes and maybe from df_attendee_new

# Function to determine attendance value
def determine_attendance_value(row, yes_users, maybe_users):
    if row['user_id'] in yes_users:
        return 1
    elif row['user_id'] in maybe_users:
        return 1
    else:
        return 0

# Convert the yes and maybe columns to sets for faster lookup
yes_users = df_attendee_new['yes'].dropna().str.split().explode().unique()
maybe_users = df_attendee_new['maybe'].dropna().str.split().explode().unique()

# Create a Dask DataFrame from user_event_pairs if it's not already
import dask.dataframe as dd

user_event_pairs = dd.from_pandas(user_event_pairs, npartitions=1)  # Adjust npartitions as needed

# Create attendance_value column in user_event_pairs
user_event_pairs['interaction_type'] = user_event_pairs.apply(
    lambda row: determine_attendance_value(row, yes_users, maybe_users),
    axis=1,
    meta=('attendance_value', 'int')
)

# ... existing code ...

# ... existing code ...

In [25]:
user_event_pairs = user_event_pairs.drop("interaction_type", axis=1)

Interactions Data:
event_id	user_id	attendance_status
0	1159822043	1975964455	yes
1	1159822043	252302513	yes
2	1159822043	4226086795	yes
3	1159822043	3805886383	yes
4	1159822043	1420484491	yes

Attendance Data:
event	yes	maybe	invited	no
0	1159822043	1975964455 252302513 4226086795 3805886383 142...	2733420590 517546982 1350834692 532087573 5831...	1723091036 3795873583 4109144917 3560622906 31...	3575574655 1077296663
1	686467261	2394228942 2686116898 1056558062 3792942231 41...	1498184352 645689144 3770076778 331335845 4239...	1788073374 733302094 1830571649 676508092 7081...	<NA>

In [16]:
len(user_event_pairs)

11245010

In [17]:
user_event_pairs.isnull().sum()

event_id             0
user_id              0
attendance_status    0
dtype: int64

In [18]:
len(user_event_pairs.event_id.unique()), len(user_event_pairs.user_id.unique())

(22710, 3702045)

In [19]:
user_event_pairs.attendance_status.value_counts()


attendance_status
invited    9418711
yes         831137
maybe       520724
no          474438
Name: count, dtype: int64

In [20]:
#save as interactions data
#user_event_pairs.to_csv("interactions.csv")
