In [1]:
import dask.dataframe as dd
import pandas as pd

In [4]:
# df_attendees = dd.read_csv("/home/nkama/masters_thesis_project/thesis/data/event_rec_engine_challenge/event_attendees.csv",
#                        dtype={
#                            'event': 'object'
#                        })

# Read the data using Dask
df_user = dd.read_csv('/content/users.csv',
                      dtype={
                          'user_id': 'object',
                          'location': 'object',
                          'birthyear': 'object',
                          'timezone': 'float64',
                          'locale': 'object',
                          'gender': 'object'
                      })

df_event = dd.read_csv('/content/events.csv',
                 dtype={
                     'event_id': 'object',
                     'city': 'object',
                     'country': 'object',
                     'state': 'object',
                     'zip': 'object'
                 })
new_df = df_event.dropna()

# **Get User-Event Records**

In [None]:
df_attendees.head()

Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
3,2621578336,,,,
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232


In [None]:
df_attendees.isnull().sum().compute()


event         0
yes        1984
maybe      3167
invited    1822
no         6659
dtype: int64

In [None]:
col = ['yes',	'maybe', 'invited',	'no']
df_attendee_new = df_attendees.dropna(subset=col, how='all')


In [None]:
df_attendee_new.head()

Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,0
2,1186208412,0,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232
6,488116622,4145960786 2550625355 2577667841 1575121941 28...,1227223575 2789471603 1323321680 3086272918 38...,1413359297 2300232602 1412759254 617751520 286...,1498160155 3708150269 823488244 3595018395 173...


In [None]:

# Fill remaining NaN values with 0
df_attendee_new = df_attendee_new.fillna('0')


## Count total reactions on each event

We check if the input is '0' and return 0 if it is.

We convert the input to a string before splitting, to handle potential numeric inputs.

We filter out '0' values when counting users.

We renamed 'total_reactions' to 'total_users' for clarity.

These modifications ensure that:

A single '0' in a column is counted as 0 users.

'0' values mixed with other user IDs are not counted.

In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np

# Function to count users in a space-separated string
def count_users(x):
    if pd.isna(x) or x == '0':
        return 0
    users = str(x).split()
    return len([u for u in users if u != '0'])

# Create new columns for user counts directly in the original dataframe
for column in ['yes', 'maybe', 'invited', 'no']:
    df_attendee_new[f'{column}_count'] = df_attendee_new[column].apply(count_users, meta=('x', 'int'))

# Calculate total attendees (sum of all categories)
df_attendee_new['total_users'] = df_attendee_new['yes_count'] + \
                              df_attendee_new['maybe_count'] + \
                              df_attendee_new['invited_count'] + \
                              df_attendee_new['no_count']

# Select only the event and count columns
attendance_counts = df_attendee_new[['event', 'yes_count', 'maybe_count', 'invited_count', 'no_count', 'total_users']]


In [None]:
attendance_counts = attendance_counts.compute()


attendance_counts.head()

Unnamed: 0,event,yes_count,maybe_count,invited_count,no_count,total_users
0,1159822043,7,7,70,2,86
1,686467261,11,8,75,0,94
2,1186208412,0,2,2,2,6
4,855842686,6,6,10,1,23
6,488116622,45,38,166,19,268


In [None]:

len(attendance_counts.event) #print number of unique events

#save to csv
#attendance_counts.to_csv("attendance_counts.csv", index=False)


22710

# **Create Interactions Data**

In [None]:
import dask.dataframe as dd
import pandas as pd

def extract_user_event_pairs(df_attendees):
    def process_row(row):
        event_id = row.event  # Changed from row['event']
        pairs = []
        for status in ['yes', 'maybe', 'invited', 'no']:
            if pd.notna(getattr(row, status)) and getattr(row, status) != '0':
                users = str(getattr(row, status)).split()
                pairs.extend([
                    {'event_id': event_id, 'user_id': user_id, 'attendance_status': status}
                    for user_id in users if user_id != '0'
                ])
        return pairs

    # Apply the function to each partition
    pairs_ddf = df_attendees.map_partitions(
        lambda df: pd.DataFrame(
            [pair for row in df.itertuples() for pair in process_row(row)]
        )
    )

    return pairs_ddf

# Extract user-event pairs
user_event_pairs = extract_user_event_pairs(df_attendee_new)

# Convert types to ensure consistency for joins
user_event_pairs['user_id'] = user_event_pairs['user_id'].astype(str)
user_event_pairs['event_id'] = user_event_pairs['event_id'].astype(str)


user_event_pairs = user_event_pairs.compute()


: 

In [None]:
user_event_pairs.head()

Unnamed: 0,event_id,user_id,attendance_status
0,1159822043,1975964455,yes
1,1159822043,252302513,yes
2,1159822043,4226086795,yes
3,1159822043,3805886383,yes
4,1159822043,1420484491,yes


In [None]:
len(user_event_pairs)

11245010

In [None]:
user_event_pairs.isnull().sum()

event_id             0
user_id              0
attendance_status    0
dtype: int64

In [None]:
len(user_event_pairs.event_id.unique()), len(user_event_pairs.user_id.unique())

(22710, 3702045)

In [None]:
user_event_pairs.attendance_status.value_counts()

attendance_status
invited    9418711
yes         831137
maybe       520724
no          474438
Name: count, dtype: int64

In [None]:
#save as interactions data
user_event_pairs.to_csv("interactions.csv")

# **Create Event Data**

In [5]:
df_event.head()

Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,2,...,0,1,0,0,0,0,0,0,0,9
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,2,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,9


In [6]:
new_event_df = df_event.iloc[:,:9]
new_event_df.head()

Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,


In [None]:
len(new_event_df)

3137972

In [7]:
"""
Load the saved user_event_pairs dataframe so we don't have to rerun all the code needed
to create the dataframe should the kernel crash when processing large data volume.
"""

user_event_pairs = pd.read_csv("/content/interactions.csv")

user_event_pairs.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status
0,0,1159822043,1975964455,yes
1,1,1159822043,252302513,yes
2,2,1159822043,4226086795,yes
3,3,1159822043,3805886383,yes
4,4,1159822043,1420484491,yes


In [17]:
# Extract unique event_ids and user_ids
unique_events = user_event_pairs['event_id'].unique()
unique_users = user_event_pairs['user_id'].unique()

# Print the number of unique events and users
print("Number of unique events:", len(unique_events))
print("Number of unique users:", len(unique_users))

# Check data types
print("Data types in user_event_pairs:")
print(user_event_pairs['event_id'].dtype)
print("\nData types in new_event_df:")
print(new_event_df['event_id'].dtype)

#

Number of unique events: 22710
Number of unique users: 3702045


In [None]:

# Convert both to string type to ensure matching
user_event_pairs['event_id'] = user_event_pairs['event_id'].astype(str)
user_event_pairs['user_id'] = user_event_pairs['event_id'].astype(str)
new_event_df['event_id'] = new_event_df['event_id'].astype(str)

In [31]:
# Filter events data
filtered_events_data = df_event[df_event["event_id"].isin(unique_events)].compute()
filtered_events_data.head()

Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,2,...,0,1,0,0,0,0,0,0,0,9
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,2,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,9


In [32]:
# Print the shape of filtered data
print("\nOriginal events shape:", new_event_df.shape[0].compute())  # Add compute() here
print("Filtered events shape:", filtered_events_data.shape)

# Show counts of matching events
print("\nNumber of unique events:", len(unique_events))
print("Number of events in filtered data:", len(filtered_events_data))
print("Number of matching events:", len(set(unique_events).intersection(set(filtered_events_data['event_id']))))

# Optional: Verify no missing events
missing_events = set(unique_events) - set(filtered_events_data['event_id'])
if len(missing_events) == 0:
    print("\nAll events successfully filtered!")
else:
    print(f"\{len(missing_events)} events are missing in the filtered data")


Original events shape: 3137972
Filtered events shape: (22471, 110)

Number of unique events: 22710
Number of events in filtered data: 22471
Number of matching events: 22471
\239 events are missing in the filtered data


In [None]:
filtered_events_data.to_csv("filtered_events_data.csv")


In [None]:
filtered_events_data[["city",	"state",	"zip",	"country",	"lat",	"lng"]].isnull().sum()

In [33]:
filtered_events_data.drop(columns=(["user_id", "state",	"zip",	"country"]), inplace=True)
len(filtered_events_data)

22471

In [35]:
filtered_events_data.dropna(subset=["city",	"lat",	"lng"], inplace=True)
len(filtered_events_data)

11627

In [37]:
filtered_events_data.to_csv("filtered_events_data_no_NAN.csv")

In [38]:
from event_details import synthesize_event_details, add_variety

In [39]:
#events_df = load_events_data('/content/large_col_events.csv')
enriched_events = synthesize_event_details(filtered_events_data)
enriched_events = add_variety(enriched_events)


Processed 147000 events
Processed 164000 events
Processed 32000 events
Processed 55000 events
Processed 53000 events
Processed 128000 events
Processed 82000 events
Processed 27000 events
Processed 49000 events
Processed 128000 events
Processed 140000 events
Processed 146000 events
Processed 168000 events
Processed 77000 events
Processed 115000 events
Processed 45000 events


In [40]:
enriched_events.head()

Unnamed: 0,event_id,start_time,city,lat,lng,c_1,c_2,c_3,c_4,c_5,...,c_95,c_96,c_97,c_98,c_99,c_100,c_other,category,title,description
40,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,0,3,1,1,1,...,0,0,0,0,0,0,28,Sports & Fitness,Sihanoukville {sport} Challenge,Join the regional {sport} community for our Ho...
51,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,5,3,3,1,3,...,0,1,0,0,0,0,103,Music & Concerts,{genre} Album: Holiday Series,Join us for an indie {genre} festival in Palo ...
74,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,5,3,3,1,3,...,0,1,0,0,0,0,103,Business & Networking,Karachi {field} Meetup,Don't miss our industry conference happening i...
156,3580637647,2012-10-22T10:00:00.003Z,Los Angeles,3.156,101.612,0,0,0,0,0,...,0,0,0,0,0,0,6,Health & Wellness,{practice} Class: Healing Strengthening,Don't miss our healing therapy happening in Lo...
178,1924180022,2012-11-11T20:00:00.003Z,Palo Alto,37.416,-122.152,0,0,0,0,0,...,0,0,0,0,0,0,0,Sports & Fitness,Fall {sport} League,Challenge yourself at the Palo Alto {sport} co...


In [41]:
enriched_events.to_csv('enriched_events2.csv', index=False)

In [91]:
len(enriched_events)

11627

In [93]:
enriched_events2 = pd.read_csv("/content/enriched_events2.csv")

In [94]:
enriched_events2.isnull().sum().sum()

0

In [44]:
enriched_events_col = ["event_id",	"start_time",	"city",	"lat",	"lng",	"category",	"title",	"description"]
enriched_events = enriched_events[enriched_events_col]
enriched_events.head()

Unnamed: 0,event_id,start_time,city,lat,lng,category,title,description
40,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,Sports & Fitness,Sihanoukville {sport} Challenge,Join the regional {sport} community for our Ho...
51,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,Music & Concerts,{genre} Album: Holiday Series,Join us for an indie {genre} festival in Palo ...
74,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,Business & Networking,Karachi {field} Meetup,Don't miss our industry conference happening i...
156,3580637647,2012-10-22T10:00:00.003Z,Los Angeles,3.156,101.612,Health & Wellness,{practice} Class: Healing Strengthening,Don't miss our healing therapy happening in Lo...
178,1924180022,2012-11-11T20:00:00.003Z,Palo Alto,37.416,-122.152,Sports & Fitness,Fall {sport} League,Challenge yourself at the Palo Alto {sport} co...


In [65]:
enriched_events.to_csv("events_with_titles.csv")

## Merge with attendees count

In [88]:
#load attendee counts df
attendance_counts = pd.read_csv("attendance_counts.csv")
attendance_counts.head()

Unnamed: 0.1,Unnamed: 0,event,yes_count,maybe_count,invited_count,no_count,total_users
0,0,1159822043,7,7,70,2,86
1,1,686467261,11,8,75,0,94
2,2,1186208412,0,2,2,2,6
3,4,855842686,6,6,10,1,23
4,6,488116622,45,38,166,19,268


In [104]:

# Drop the 'Unnamed: 0' column
#enriched_events2 = attendance_counts.drop(columns=['Unnamed: 0'])

attendance_counts = attendance_counts.rename(columns={'event': 'event_id'})
attendance_counts['event_id'] = attendance_counts['event_id'].astype('object')


# Merge the DataFrames with inner join to keep only common events
events_data = enriched_events2.merge(
    attendance_counts,
    on='event_id',
    how='inner'
)


In [105]:
len(events_data)

11627

In [106]:
events_data.head()

Unnamed: 0,event_id,start_time,city,lat,lng,c_1,c_2,c_3,c_4,c_5,...,c_100,c_other,category,title,description,yes_count,maybe_count,invited_count,no_count,total_users
0,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,0,3,1,1,1,...,0,28,Sports & Fitness,Sihanoukville {sport} Challenge,Join the regional {sport} community for our Ho...,93,65,317,47,522
1,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,5,3,3,1,3,...,0,103,Music & Concerts,{genre} Album: Holiday Series,Join us for an indie {genre} festival in Palo ...,162,112,1021,150,1445
2,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,5,3,3,1,3,...,0,103,Business & Networking,Karachi {field} Meetup,Don't miss our industry conference happening i...,22,8,993,26,1049
3,3580637647,2012-10-22T10:00:00.003Z,Los Angeles,3.156,101.612,0,0,0,0,0,...,0,6,Health & Wellness,{practice} Class: Healing Strengthening,Don't miss our healing therapy happening in Lo...,63,42,430,20,555
4,1924180022,2012-11-11T20:00:00.003Z,Palo Alto,37.416,-122.152,0,0,0,0,0,...,0,0,Sports & Fitness,Fall {sport} League,Challenge yourself at the Palo Alto {sport} co...,41,12,53,1,107


In [111]:
columns = ["event_id",	"start_time",	"city",	"lat",	"lng", "category",
                       "title",	"description",	"yes_count",	"maybe_count",	"invited_count",
                       "no_count",	"total_users"]

In [112]:
events_data = events_data[columns]
events_data.head()

Unnamed: 0,event_id,start_time,city,lat,lng,category,title,description,yes_count,maybe_count,invited_count,no_count,total_users
0,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,Sports & Fitness,Sihanoukville {sport} Challenge,Join the regional {sport} community for our Ho...,93,65,317,47,522
1,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,Music & Concerts,{genre} Album: Holiday Series,Join us for an indie {genre} festival in Palo ...,162,112,1021,150,1445
2,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,Business & Networking,Karachi {field} Meetup,Don't miss our industry conference happening i...,22,8,993,26,1049
3,3580637647,2012-10-22T10:00:00.003Z,Los Angeles,3.156,101.612,Health & Wellness,{practice} Class: Healing Strengthening,Don't miss our healing therapy happening in Lo...,63,42,430,20,555
4,1924180022,2012-11-11T20:00:00.003Z,Palo Alto,37.416,-122.152,Sports & Fitness,Fall {sport} League,Challenge yourself at the Palo Alto {sport} co...,41,12,53,1,107


In [115]:
events_data.to_csv("events_data.csv")

In [119]:
df_user.head()

Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240.0
3,1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0


In [122]:
!git status

fatal: not a git repository (or any of the parent directories): .git


In [120]:
len(df_user)#.info()

38209

In [None]:
c_user = user_df.copy()
c_user.dropna(inplace = True)
len(c_user)

32219

In [None]:
user_df.isnull().sum()

user_id         0
locale          0
birthyear    1492
gender        109
joinedAt       58
location     5465
timezone      436
dtype: int64

In [None]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38209 entries, 0 to 38208
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user_id    38209 non-null  int64  
 1   locale     38209 non-null  object 
 2   birthyear  36717 non-null  object 
 3   gender     38100 non-null  object 
 4   joinedAt   38151 non-null  object 
 5   location   32744 non-null  object 
 6   timezone   37773 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 2.0+ MB


['1335002898',
 '30795658',
 '2947087069',
 '2323335002'

In [None]:
df_user.columns

Index(['user_id', 'locale', 'birthyear', 'gender', 'joinedAt', 'location',
       'timezone'],
      dtype='object')

In [None]:
# import dask.dataframe as dd

# # Read the data using Dask
# df_event = dd.read_csv("/home/nkama/masters_thesis_project/event_rec_engine_challenge/events.csv",
#                       dtype={
#                           'location': 'object',
#                           'birthyear': 'object',
#                           'timezone': 'float64'
#                       })
# df_train = dd.read_csv("/home/nkama/masters_thesis_project/event_rec_engine_challenge/train.csv",
#                        dtype={
#                            'event_id': 'object',  # adjust dtype as needed
#                        })

# # Check column names
# print("Columns in df_user:")
# print(df_event.columns.compute())
# print("\nColumns in df_train:")
# print(df_train.columns.compute())

# # # Filter Japanese users
# # japan_users = df_event[df_user["location"].str.contains("Japan", case=False, na=False)].compute()
# # print("\nUsers from Japan:")
# # print(japan_users)

# # Get common event_ids
# # Convert to pandas for set operations since they're more efficient with smaller data
# train_events = set(df_train['event_id'].compute())
# user_events = set(df_event['event_id'].compute())
# common_events = train_events.intersection(user_events)

# print(f"\nNumber of common event_ids: {len(common_events)}")
# print("\nFirst few common event_ids:")
# print(list(common_events)[:5])

# # Get matching rows
# matching_rows = df_user[df_event['event_id'].isin(list(common_events))].compute()
# print("\nMatching rows from df_user:")
# print(matching_rows)

In [None]:

# # Read the data using Dask
# df_user = dd.read_csv('/home/nkama/masters_thesis_project/event_rec_engine_challenge/users.csv',
#                       dtype = {                     'city': 'object',
#                      'country': 'object',
#                      'state': 'object',
#                      'zip': 'object',
#                      'birthyear': 'object',
#                      'timezone': 'float64'
#                  })
# df_event = dd.read_csv('/home/nkama/masters_thesis_project/event_rec_engine_challenge/events.csv',
#                  dtype={
#                      'city': 'object',
#                      'country': 'object',
#                      'state': 'object',
#                      'zip': 'object'
#                  })


In [None]:
df_train = dd.read_csv("/home/nkama/masters_thesis_project/event_rec_engine_challenge/train.csv",
                       dtype={
                           'event_id': 'object',  # adjust dtype as needed
                       })
df_train.columns = ["user_id", "event_id", "invited", "timestamp", "interested", "not_interested"]


In [None]:


# # Get event_ids and user_ids from train
# train_events = set(df_train['event_id'].compute())
# train_users = set(df_train['user_id'].compute())

# # Create filtered dataframes
# events = df_event[df_event['event_id'].isin(list(train_events))].compute()
# users = df_user[df_user['user_id'].isin(list(train_users))].compute()

# # Print summary statistics
# print(f"Number of events in training set: {len(train_events)}")
# print(f"Number of users in training set: {len(train_users)}")
# print(f"\nShape of filtered events DataFrame: {events.shape}")
# print(f"Shape of filtered users DataFrame: {users.shape}")

# # Show first few rows of each
# print("\nFirst few rows of events DataFrame:")
# print(events.head())
# print("\nFirst few rows of users DataFrame:")
# print(users.head())

In [None]:
len(events)

8846

## Check for common events from both train and test in even_df

In [None]:
df_train.isnull().sum().compute()

user              0
event             0
invited           0
timestamp         0
interested        0
not_interested    0
dtype: int64

In [None]:
len(df_train)

15398

In [None]:
#df_attendees = pd.read_csv("/home/nkama/masters_thesis_project/event_rec_engine_challenge/event_attendees.csv")


Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
3,2621578336,,,,
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232


event         0
yes        1984
maybe      3167
invited    1822
no         6659
dtype: int64

# Interaction data Prep

In [None]:
import dask.dataframe as dd
import pandas as pd

# Function to split space-separated strings into lists and create user-event pairs
def extract_user_event_pairs(df_attendees):
    # Process each row to extract user-event pairs
    pairs_list = []

    # Compute to bring into memory
    #attendees_computed = df_attendees.compute()

    for idx, row in df_attendees.iterrows():
        event_id = row['event']

        for status in ['yes', 'maybe', 'invited', 'no']:
            if pd.notna(row[status]):
                users = row[status].split()
                for user_id in users:
                    pairs_list.append({
                        'event_id': event_id,
                        'user_id': user_id,
                        'attendance_status': status
                    })

    # Convert to pandas DataFrame
    user_event_pairs = pd.DataFrame(pairs_list)

    # Convert back to Dask for consistency
    return dd.from_pandas(user_event_pairs, npartitions=df_attendees.npartitions)

# Rename the column in attendees_df for consistency
#df_attendees = df_attendees.rename(columns={'event': 'event_id'})

# Extract user-event pairs
user_event_pairs = extract_user_event_pairs(df_attendee_new)

# Convert types to ensure consistency for joins
user_event_pairs['user_id'] = user_event_pairs['user_id'].astype(str)
user_event_pairs['event_id'] = user_event_pairs['event_id'].astype(str)


In [None]:
user_event_pair = pd.read_csv("/home/nkama/masters_thesis_project/thesis/user_event_pairs.csv")

user_event_pair.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status
0,0,1159822043,1975964455,yes
1,1,1159822043,252302513,yes
2,2,1159822043,4226086795,yes
3,3,1159822043,3805886383,yes
4,4,1159822043,1420484491,yes


In [None]:
len(user_event_pair.event_id.unique())

16789

In [None]:
len(user_event_pair.user_id.unique())


3448296

In [None]:
user_event_pairs.tail()

Unnamed: 0,event_id,user_id,attendance_status
10206989,1294481466,846734428,invited
10206990,1294481466,2789574470,invited
10206991,1294481466,1080007316,invited
10206992,1294481466,1059016222,invited
10206993,1294481466,3521355909,no


In [None]:
user_event_pairs.compute().to_csv("user_event_pairs.csv")

In [None]:
user_event_pairs.attendance_status.value_counts().compute()

attendance_status
invited    8602420
maybe       459435
no          471604
yes         673535
Name: count, dtype: int64[pyarrow]

In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np

# Function to split space-separated strings into lists
def split_users(x):
    if pd.isna(x):
        return []
    return x.split()

# Get unique users from df_attendees
def get_unique_users(events_df):
    all_users = []

    for column in ['yes', 'maybe', 'invited', 'no']:
        users = events_df[column].apply(split_users, meta=('x', 'object')).compute()
        all_users.extend([user for sublist in users if sublist for user in sublist])

    return set(all_users)

# Get unique events from df_attendees
def get_unique_events(events_df):
    return set(events_df['event'].compute().tolist())

# Extract unique users and events
unique_users = get_unique_users(df_attendee_new)
unique_events = get_unique_events(df_attendee_new)


In [None]:

# # Convert to lists for filtering
user_ids_list = list(unique_users)
event_ids_list = list(unique_events)


In [None]:
len(event_ids_list), len(user_ids_list)

(16789, 3448296)

In [None]:

# Filter the user dataframe to include only users from df_attendees
# Note: we need to convert Dask index comparison to explicit isin() for performance
filtered_df_user = df_user[df_user['user_id'].isin(user_ids_list)].compute()


In [None]:

# Filter the event dataframe to include only events from df_attendees
# Note: In df_event, the column is named "event_id" rather than "event"
filtered_df_event = df_event[df_event['event_id'].isin(event_ids_list)].compute()


In [None]:
# Print summary information
print(f"Original users: {len(df_user)}, Filtered users: {len(filtered_df_user)}")
print(f"Original events: {len(df_event)}, Filtered events: {len(filtered_df_event)}")

Original users: 38209, Filtered users: 10727
Original events: 3137972, Filtered events: 2465


In [None]:
# Print summary information
print(f"Original users: {len(df_user)}, Filtered users: {len(filtered_df_user)}")
print(f"Original events: {len(df_event)}, Filtered events: {len(filtered_df_event)}")


Original users: 38209, Filtered users: 10727
Original events: 3137972, Filtered events: 16642


In [None]:
filtered_df_user.to_csv("users_data.csv")


In [None]:
filtered_df_event.to_csv("events_data.csv")

In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from dask.distributed import Client

# Initialize Dask client with memory limits
#client = Client(memory_limit='4GB')


attendance_counts.columns = ['event_id',
                             'yes_count',
                             'maybe_count',
                             'invited_count',
                             'no_count',
                             'total_users']
# Ensure event_id types match
attendance_counts['event_id'] = attendance_counts['event_id'].astype('object')

# Merge the dataframes
new_event = filtered_df_event.merge(attendance_counts,
                          left_on='event_id',
                          right_on='event',
                          how='left')

# Compute a sample of the merged data to verify
print("Sample of merged DataFrame:")
print(new_event.head().compute())

# Print the shape of the merged dataframe
print("\nShape of merged DataFrame:")
print(new_event.shape.compute())

# Check for any null values in the merged columns
print("\nNull values in merged columns:")
print(new_event[['yes_count', 'maybe_count', 'invited_count', 'no_count', 'total_users']].isnull().sum().compute())

#client.close()

# Merge and Create event, user and interaction data

## create event data


In [None]:
import pandas as pd

In [None]:
# create event data
events_data = pd.read_csv("/home/nkama/masters_thesis_project/thesis/events_2.csv", usecols=range(10))
events_data.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng
0,0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,
1,1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,
2,2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,
3,5,1212611096,1426522332,2012-11-16T00:00:00.001Z,,,,,,
4,6,3689283674,725266702,2012-11-02T20:00:00.003Z,,,,,,


In [None]:
events_data["start_time"].min()

'2012-05-24T15:00:00.002Z'

In [None]:
# create event data
events_data.drop(columns=["Unnamed: 0","user_id","state","zip",
                          "country"], inplace = True)
len(events_data)

16642

In [None]:
events_data.head()

Unnamed: 0,event_id,start_time,city,lat,lng
0,684921758,2012-10-31T00:00:00.001Z,,,
1,244999119,2012-11-03T00:00:00.001Z,,,
2,3928440935,2012-11-05T00:00:00.001Z,,,
3,1212611096,2012-11-16T00:00:00.001Z,,,
4,3689283674,2012-11-02T20:00:00.003Z,,,


In [None]:
attendance_count = pd.read_csv("/home/nkama/masters_thesis_project/thesis/attendance_counts.csv")
attendance_count = attendance_count.drop(columns=["Unnamed: 0"]).rename(columns={"event": "event_id"})
attendance_count.head()

Unnamed: 0,event_id,yes_count,maybe_count,invited_count,no_count,total_users
0,1159822043,7,7,70,2,86
1,855842686,6,6,10,1,23
2,488116622,45,38,166,19,268
3,1273761447,28,36,194,12,270
4,2688888297,31,34,257,4,326


In [None]:
len(attendance_count)

16789

In [None]:
events = events_data.merge(attendance_count)
events.head()

Unnamed: 0,event_id,start_time,city,lat,lng,yes_count,maybe_count,invited_count,no_count,total_users
0,684921758,2012-10-31T00:00:00.001Z,,,,8,2,57,3,70
1,244999119,2012-11-03T00:00:00.001Z,,,,6,1,18,1,26
2,3928440935,2012-11-05T00:00:00.001Z,,,,42,17,495,2,556
3,1212611096,2012-11-16T00:00:00.001Z,,,,6,2,1426,4,1438
4,3689283674,2012-11-02T20:00:00.003Z,,,,26,16,269,17,328


In [None]:
events.dropna(inplace = True)
events.to_csv("events_data.csv")

In [None]:
events.head()

Unnamed: 0,event_id,start_time,city,lat,lng,yes_count,maybe_count,invited_count,no_count,total_users
19,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,93,65,317,47,522
21,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,162,112,1021,150,1445
36,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,22,8,993,26,1049
80,3580637647,2012-10-22T10:00:00.003Z,Los Angeles,3.156,101.612,63,42,430,20,555
90,1924180022,2012-11-11T20:00:00.003Z,Palo Alto,37.416,-122.152,41,12,53,1,107


In [None]:
len(events.event_id)

9343

In [None]:
!pip install transformers

In [None]:
!pip install nltk

## Create Users data

In [None]:
users = pd.read_csv("/home/nkama/masters_thesis_project/thesis/users_2.csv")
users.head()

Unnamed: 0.1,Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,0,3197468391,id_ID,1993.0,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,1,3537982273,id_ID,1992.0,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,4,3429017717,id_ID,1995.0,female,2012-09-10T16:06:53.132Z,,420.0
3,7,3473687777,id_ID,1965.0,female,2012-10-03T12:19:29.975Z,Medan Indonesia,420.0
4,12,1355996271,id_ID,1993.0,female,2012-10-26T15:34:46.113Z,Djokja Yogyakarta Indonesia,420.0


In [None]:
users["start_time"] = events_data["start_time"]

In [None]:
users.head()

Unnamed: 0.1,Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone,start_time
0,0,3197468391,id_ID,1993.0,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0,2012-10-31T00:00:00.001Z
1,1,3537982273,id_ID,1992.0,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0,2012-11-03T00:00:00.001Z
2,4,3429017717,id_ID,1995.0,female,2012-09-10T16:06:53.132Z,,420.0,2012-11-05T00:00:00.001Z
3,7,3473687777,id_ID,1965.0,female,2012-10-03T12:19:29.975Z,Medan Indonesia,420.0,2012-11-16T00:00:00.001Z
4,12,1355996271,id_ID,1993.0,female,2012-10-26T15:34:46.113Z,Djokja Yogyakarta Indonesia,420.0,2012-11-02T20:00:00.003Z


In [None]:
len(users)

9263

In [None]:
import dask.dataframe as dd
import pandas as pd


In [None]:
from datetime import datetime

# Define function to calculate age
def calculate_age(row):
    """Calculate age based on birthyear and start_time"""
    if pd.isna(row['birthyear']) or pd.isna(row['start_time']):
        return None
    try:
        birth_year = float(row['birthyear'])
        # Extract year from start_time
        start_year = pd.to_datetime(row['start_time']).year
        # Calculate age
        age = start_year - birth_year
        # Validate age is reasonable
        if 0 <= age <= 100:
            return round(age)
        return None
    except (ValueError, TypeError):
        return None


# Apply the function to create the age column
users['age'] = users.apply(calculate_age, axis=1)


In [None]:
users.head()

Unnamed: 0,user_id,locale,gender,joinedAt,location,timezone,age
0,3197468391,id_ID,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0,19.0
1,3537982273,id_ID,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0,20.0
3,3473687777,id_ID,female,2012-10-03T12:19:29.975Z,Medan Indonesia,420.0,47.0
4,1355996271,id_ID,female,2012-10-26T15:34:46.113Z,Djokja Yogyakarta Indonesia,420.0,19.0
5,2411726276,en_US,female,2012-10-30T05:16:27.136Z,Triolet Mauritius,240.0,16.0


In [None]:
users.dropna(inplace = True)
users.drop(columns=(["birthyear","Unnamed: 0","start_time"]), inplace = True)

In [None]:
users.to_csv("users_data.csv")

In [None]:
len(users.location)

9263

In [None]:
!pip install geopy

In [None]:
def add_user_coordinates_scalable(users_df, checkpoint_file='geocoding_progress.pkl'):
    """
    Add latitude and longitude to users based on their existing location data
    Optimized for large datasets with checkpointing and batch processing

    Parameters:
    -----------
    users_df : pandas.DataFrame
        DataFrame containing a 'location' column
    checkpoint_file : str
        File path to save checkpointing data

    Returns:
    --------
    pandas.DataFrame
        Original DataFrame with added 'lat' and 'lng' columns
    """

    import pandas as pd
    import numpy as np
    import os
    import time
    import re
    from geopy.geocoders import Nominatim
    from geopy.extra.rate_limiter import RateLimiter
    from datetime import datetime

    # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    # This is critical - ensures we're working with a true copy, not a view
    users_df = users_df.copy()

    start_time = datetime.now()
    print(f"Starting geocoding process at {start_time.strftime('%H:%M:%S')} for {len(users_df)} locations")

    # Load previous progress if available
    if os.path.exists(checkpoint_file):
        try:
            cache_data = pd.read_pickle(checkpoint_file)
            location_cache = cache_data.get('location_cache', {})
            completed_indices = cache_data.get('completed_indices', [])
            print(f"Loaded cache with {len(location_cache)} locations and {len(completed_indices)} completed indices")
        except Exception as e:
            print(f"Error loading checkpoint file: {e}")
            location_cache = {}
            completed_indices = []
    else:
        location_cache = {}
        completed_indices = []

    # Known problematic locations with manual coordinates
    known_locations = {
        "Djokja Yogyakarta Indonesia": (-7.797, 110.370),  # Yogyakarta coordinates
        "Santo Domingo  Dominican Republic": (18.486, -69.932),  # Santo Domingo coordinates
        # Add more problematic locations as you discover them
    }

    # Initialize geocoder with rate limiting
    geolocator = Nominatim(user_agent="geopy_user_locations_batch")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.2)

    def clean_location(location):
        """Thoroughly clean and standardize location strings"""
        if pd.isna(location) or not location or location.isspace():
            return None

        # Convert to string if not already and strip whitespace
        location = str(location).strip()

        # Skip empty strings or just whitespace
        if not location or location.isspace():
            return None

        # Replace multiple spaces with a single space
        location = re.sub(r'\s+', ' ', location)

        # Add commas between city/state/country if missing
        parts = re.split(r'\s{2,}', location)
        if len(parts) > 1:
            location = ", ".join(parts)

        return location

    def get_coordinates(location, attempt_count=0):
        """Get coordinates with fallback mechanisms and retry logic"""
        # First check for None, empty or whitespace-only strings
        if pd.isna(location) or not location or str(location).isspace():
            return None, None

        # Clean the location string
        original_loc = str(location).strip()
        cleaned_loc = clean_location(location)
        if not cleaned_loc:
            return None, None

        # Check if this is a known problematic location with manual coordinates
        if original_loc in known_locations:
            return known_locations[original_loc]

        # Check cache for exact match
        if cleaned_loc in location_cache:
            return location_cache[cleaned_loc]

        # First attempt with cleaned location
        if attempt_count == 0:
            try:
                loc = geocode(cleaned_loc)
                if loc:
                    coords = (round(loc.latitude, 3), round(loc.longitude, 3))
                    location_cache[cleaned_loc] = coords
                    return coords
            except Exception as e:
                print(f"Error geocoding '{cleaned_loc}': {e}")
                time.sleep(1)

        # Second attempt: try alternative formats (city, country)
        if attempt_count <= 1:
            try:
                parts = cleaned_loc.split(',')
                if len(parts) > 1:
                    # Keep first and last parts (typically city and country)
                    simplified = f"{parts[0].strip()}, {parts[-1].strip()}"
                    loc = geocode(simplified)
                    if loc:
                        coords = (round(loc.latitude, 3), round(loc.longitude, 3))
                        location_cache[cleaned_loc] = coords
                        return coords
            except Exception as e:
                print(f"Second attempt error geocoding '{cleaned_loc}': {e}")
                time.sleep(1)

        # Both attempts failed
        return None, None

    # Prepare result DataFrame - properly initialize columns
    # This uses proper DataFrame assignment instead of direct assignment
    if 'lat' not in users_df.columns:
        users_df.loc[:, 'lat'] = np.nan
    if 'lng' not in users_df.columns:
        users_df.loc[:, 'lng'] = np.nan

    # Process in smaller batches to save progress regularly
    BATCH_SIZE = 100
    total_batches = (len(users_df) + BATCH_SIZE - 1) // BATCH_SIZE

    # Skip already processed indices
    pending_indices = [i for i in range(len(users_df)) if i not in completed_indices]

    for batch_idx, batch_start in enumerate(range(0, len(pending_indices), BATCH_SIZE)):
        batch_end = min(batch_start + BATCH_SIZE, len(pending_indices))
        batch_indices = pending_indices[batch_start:batch_end]

        print(f"\nProcessing batch {batch_idx+1}/{total_batches} ({len(batch_indices)} locations)")
        batch_start_time = time.time()
        success_count = 0

        for idx in batch_indices:
            location = users_df.iloc[idx]['location']

            try:
                lat, lng = get_coordinates(location)
                if lat is not None and lng is not None:
                    # Proper way to set values using loc to avoid SettingWithCopyWarning
                    users_df.loc[idx, 'lat'] = lat
                    users_df.loc[idx, 'lng'] = lng
                    success_count += 1
                completed_indices.append(idx)
            except Exception as e:
                print(f"Unexpected error processing row {idx} '{location}': {e}")

            # Save checkpoint every 20 locations
            if len(completed_indices) % 20 == 0:
                checkpoint_data = {
                    'location_cache': location_cache,
                    'completed_indices': completed_indices
                }
                pd.to_pickle(checkpoint_data, checkpoint_file)

        # Save batch checkpoint
        checkpoint_data = {
            'location_cache': location_cache,
            'completed_indices': completed_indices
        }
        pd.to_pickle(checkpoint_data, checkpoint_file)

        batch_time = time.time() - batch_start_time
        print(f"Batch {batch_idx+1} completed: {success_count}/{len(batch_indices)} successful ({batch_time:.1f}s)")
        print(f"Overall progress: {len(completed_indices)}/{len(users_df)} rows processed ({len(completed_indices)/len(users_df)*100:.1f}%)")

        # Estimate remaining time
        if batch_idx > 0 and success_count > 0:
            rows_left = len(users_df) - len(completed_indices)
            time_per_row = batch_time / len(batch_indices)
            est_time_left = rows_left * time_per_row
            est_hours = int(est_time_left // 3600)
            est_minutes = int((est_time_left % 3600) // 60)
            print(f"Estimated time remaining: {est_hours}h {est_minutes}m")

    # Final report
    success_count = users_df['lat'].notna().sum()
    total_time = (datetime.now() - start_time).total_seconds()
    hours = int(total_time // 3600)
    minutes = int((total_time % 3600) // 60)
    seconds = int(total_time % 60)

    print(f"\nGeocoding completed in {hours}h {minutes}m {seconds}s")
    print(f"Successfully geocoded {success_count} of {len(users_df)} locations ({success_count/len(users_df)*100:.1f}%)")

    # Create a report of failed locations
    failed_df = users_df[users_df['lat'].isna()].copy()
    if not failed_df.empty:
        print(f"\nFailed to geocode {len(failed_df)} locations")
        failed_df.to_csv('failed_geocodes.csv', index=False)
        print("Failed locations saved to 'failed_geocodes.csv'")

        # Show sample of failed locations
        sample_size = min(10, len(failed_df))
        print(f"\nSample of failed locations:")
        for loc in failed_df['location'].head(sample_size).tolist():
            print(f"  - '{loc}'")

    return users_df
# Example usage:
users_df = add_user_coordinates_scalable(users[["location"]])





Starting geocoding process at 22:04:58 for 9263 locations
Loaded cache with 24 locations and 40 completed indices

Processing batch 1/93 (100 locations)
Batch 1 completed: 99/100 successful (45.8s)
Overall progress: 140/9271 rows processed (1.5%)

Processing batch 2/93 (100 locations)
Batch 2 completed: 93/100 successful (48.3s)
Overall progress: 240/9279 rows processed (2.6%)
Estimated time remaining: 1h 12m

Processing batch 3/93 (100 locations)
Batch 3 completed: 95/100 successful (30.0s)
Overall progress: 340/9288 rows processed (3.7%)
Estimated time remaining: 0h 44m

Processing batch 4/93 (100 locations)
Batch 4 completed: 92/100 successful (31.2s)
Overall progress: 440/9298 rows processed (4.7%)
Estimated time remaining: 0h 46m

Processing batch 5/93 (100 locations)
Batch 5 completed: 95/100 successful (30.2s)
Overall progress: 540/9309 rows processed (5.8%)
Estimated time remaining: 0h 44m

Processing batch 6/93 (100 locations)
Batch 6 completed: 98/100 successful (26.7s)
Overa

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Cairo 11',), **{}).
Traceback (most recent call last):
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "/home/nkama/.pyenv/versions/3.10.6/envs/thesisenv/lib/python3.10/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "/home/nkama/.pyenv/versions/3.10.6/lib/python3.10/http/client.py", line 1374, in getresponse
    response.begin()
  File "/home/nkama/.pyenv/versions/3.10.6/lib/python3.10/http/client.py", line 318, in begin
    version, status, reason = self._read_status()
  File "/home/nkama/.pyenv/versions/3.10.6/lib/python3.10/http/client.py", line 279, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/nkama/.pyenv/versions/3.10.6/lib/python3.10/socket.py", line 705, in readinto
 

Batch 22 completed: 91/100 successful (38.8s)
Overall progress: 2240/9514 rows processed (23.5%)
Estimated time remaining: 0h 47m

Processing batch 23/93 (100 locations)
Batch 23 completed: 97/100 successful (14.4s)
Overall progress: 2340/9532 rows processed (24.5%)
Estimated time remaining: 0h 17m

Processing batch 24/93 (100 locations)
Batch 24 completed: 98/100 successful (18.0s)
Overall progress: 2440/9541 rows processed (25.6%)
Estimated time remaining: 0h 21m

Processing batch 25/93 (100 locations)
Batch 25 completed: 98/100 successful (19.3s)
Overall progress: 2540/9555 rows processed (26.6%)
Estimated time remaining: 0h 22m

Processing batch 26/93 (100 locations)
Batch 26 completed: 100/100 successful (10.8s)
Overall progress: 2640/9571 rows processed (27.6%)
Estimated time remaining: 0h 12m

Processing batch 27/93 (100 locations)
Batch 27 completed: 96/100 successful (16.7s)
Overall progress: 2740/9586 rows processed (28.6%)
Estimated time remaining: 0h 19m

Processing batch 2

In [None]:
users_df[(users_df["lat"].notna()) & (users_df["lng"].notna())]

Unnamed: 0,location,lat,lng
40,Medan Indonesia,25.074,55.189
42,Phnom Penh 11,-8.584,116.107
43,Hollywood California,-7.801,110.365
44,San Francisco California,3.590,98.674
45,Santiago Chile,11.534,104.880
...,...,...,...
9232,,-7.801,110.365
9242,,45.963,-66.643
9245,,18.486,-69.932
9257,,3.590,98.674


In [None]:
users_data = users.merge(users_df)  # Add indicator to see which rows came from where

# Reset the index of the combined dataframe
users_data = users_data.reset_index(drop=True)

users_data.head()

Unnamed: 0,user_id,locale,gender,joinedAt,location,timezone,age,lat,lng
0,3197468391,id_ID,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0,19.0,,
1,3197468391,id_ID,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0,19.0,,
2,3197468391,id_ID,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0,19.0,,
3,3197468391,id_ID,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0,19.0,,
4,3197468391,id_ID,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0,19.0,,


In [None]:
len(users_data)

4439361

In [None]:
import pandas as pd

# Your locations array
locations = ['Medan Indonesia', 'Djokja Yogyakarta Indonesia', 'Triolet Mauritius',
             'Columbus OH', 'Helsinki Finland', 'South El Monte California']  # etc.

# Create a DataFrame
users_df2 = pd.DataFrame({'location': locations})


# Apply the function
users_df2 = add_user_coordinates(users_df2)



In [None]:
users_df_ = add_user_coordinates(users)

In [None]:
users_df_.head()

NameError: name 'users_df_' is not defined

## Create interactions data

In [None]:
interactions = pd.read_csv("/home/nkama/masters_thesis_project/thesis/user_event_pairs.csv")
interactions.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status
0,0,1159822043,1975964455,yes
1,1,1159822043,252302513,yes
2,2,1159822043,4226086795,yes
3,3,1159822043,3805886383,yes
4,4,1159822043,1420484491,yes


In [None]:
len(interactions.event_id.unique())

16789

In [None]:
train = pd.read_csv("/home/nkama/masters_thesis_project/event_rec_engine_challenge/train.csv")
train.head()

Unnamed: 0,user,event,invited,timestamp,interested,not_interested
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0


In [None]:
len(train)

15398

In [None]:
import pandas as pd
import numpy as np

# Read the DataFrames

df_attendance = pd.read_csv("/home/nkama/masters_thesis_project/thesis/user_event_pairs.csv")
df_interest = pd.read_csv("/home/nkama/masters_thesis_project/event_rec_engine_challenge/train.csv")


# Get unique events from both DataFrames
attendance_events = set(df_attendance['event_id'].unique())
interest_events = set(df_interest['event'].unique())

# Find common events
common_events = attendance_events.intersection(interest_events)

# Get unique users from both DataFrames
attendance_users = set(df_attendance['user_id'].unique())
interest_users = set(df_interest['user'].unique())

# Find common users
common_users = attendance_users.intersection(interest_users)

# Print results
print("\nEvents statistics:")
print(f"Events in attendance DataFrame: {len(attendance_events):,}")
print(f"Events in interest DataFrame: {len(interest_events):,}")
print(f"Common events: {len(common_events):,}")

print("\nUsers statistics:")
print(f"Users in attendance DataFrame: {len(attendance_users):,}")
print(f"Users in interest DataFrame: {len(interest_users):,}")
print(f"Common users: {len(common_users):,}")

# Show sample of common events and users
print("\nSample of common events (first 5):")
print(list(common_events)[:5])
print("\nSample of common users (first 5):")
print(list(common_users)[:5])

# Calculate overlap percentages
event_overlap_percent = (len(common_events) / min(len(attendance_events), len(interest_events))) * 100
user_overlap_percent = (len(common_users) / min(len(attendance_users), len(interest_users))) * 100

print(f"\nEvent overlap percentage: {event_overlap_percent:.2f}%")
print(f"User overlap percentage: {user_overlap_percent:.2f}%")

# Optional: Create DataFrames with only common events/users
common_events_df = df_attendance[df_attendance['event_id'].isin(common_events)]
common_users_df = df_attendance[df_attendance['user_id'].isin(common_users)]

print("\nShape of DataFrame with common events:", common_events_df.shape)
print("Shape of DataFrame with common users:", common_users_df.shape)


Events statistics:
Events in attendance DataFrame: 16,789
Events in interest DataFrame: 8,846
Common events: 6,631

Users statistics:
Users in attendance DataFrame: 3,448,296
Users in interest DataFrame: 2,034
Common users: 778

Sample of common events (first 5):
[np.int64(2168553474), np.int64(2692153353), np.int64(1072889885), np.int64(3783327786), np.int64(4277141562)]

Sample of common users (first 5):
[np.int64(282914818), np.int64(552316934), np.int64(135706634), np.int64(2933561364), np.int64(2624413719)]

Event overlap percentage: 74.96%
User overlap percentage: 38.25%

Shape of DataFrame with common events: (4989801, 4)
Shape of DataFrame with common users: (6718, 4)


In [None]:
len(df_attendees.event.unique())

24144

In [None]:
import pandas as pd
weather = pd.read_csv("/home/nkama/masters_thesis_project/open-meteo-37.43N122.07W11m.csv")
weather.head()

Unnamed: 0,latitude,longitude,elevation,utc_offset_seconds,timezone,timezone_abbreviation
0,37.434093,-122.07446,11.0,-28800,America/Los_Angeles,GMT-8
1,time,temperature_2m (°C),relative_humidity_2m (%),rain (mm),snowfall (cm),
2,2012-02-02T00:00,7.8,91,0.00,0.00,
3,2012-02-02T01:00,7.7,91,0.00,0.00,
4,2012-02-02T02:00,7.8,90,0.00,0.00,


In [None]:
!pip install openmeteo-requests


In [None]:
!pip install requests-cache retry-requests

In [None]:
# Split events dataframe into two halves
def split_events_by_index(df):
    # Calculate middle point
    middle_idx = len(df) // 2

    # Split into two parts
    first_half = df.iloc[:middle_idx]
    second_half = df.iloc[middle_idx:]

    return first_half, second_half

# Split the dataframe
events_first_half, events_second_half = split_events_by_index(events)

# Print information about the splits
print("Original DataFrame shape:", events.shape)
print("First half shape:", events_first_half.shape)
print("Second half shape:", events_second_half.shape)

# Verify no overlap in event_ids
first_half_events = set(events_first_half['event_id'])
second_half_events = set(events_second_half['event_id'])
overlap = first_half_events.intersection(second_half_events)
print("\nNumber of overlapping events:", len(overlap))


NameError: name 'events' is not defined

In [None]:



# Concatenate the two halves back together
df_event_combined = pd.concat([events_first_half, events_second_half], axis=0)

# Reset the index of the combined dataframe
df_event_combined = df_event_combined.reset_index(drop=True)

# Verify the combined dataframe
print("Original DataFrame shape:", events.shape)
print("Combined DataFrame shape:", df_event_combined.shape)

# Verify all event_ids are preserved
original_events = set(events['event_id'])
combined_events = set(events['event_id'])
missing_events = original_events - combined_events

print("\nNumber of missing events:", len(missing_events))
if len(missing_events) == 0:
    print("All events successfully preserved in the combined DataFrame!")
else:
    print("Warning: Some events are missing in the combined DataFrame!")

NameError: name 'events_first_half' is not defined

In [None]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import datetime
import time

def get_weather_for_events(events_df):
    """
    Extract weather data for each event in the dataframe

    Args:
        events_df: DataFrame containing event data with start_time, lat, and lng columns

    Returns:
        DataFrame with weather data columns added
    """
    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)

    # Create a copy of the dataframe to avoid modifying the original
    result_df = events_df.copy()

    # Add empty columns for weather data
    result_df['weather_code'] = None
    result_df['temperature_2m_mean'] = None
    result_df['precipitation_sum'] = None
    result_df['precipitation_hours'] = None
    result_df['wind_speed_10m_max'] = None

    # Process events in batches to avoid overwhelming the API
    batch_size = 50
    num_events = len(events_df)

    for i in range(0, num_events, batch_size):
        batch = events_df.iloc[i:min(i+batch_size, num_events)]
        print(f"Processing batch {i//batch_size + 1}/{(num_events+batch_size-1)//batch_size}")

        # Process each event in the batch
        for idx, event in batch.iterrows():
            # Check if lat and lng are valid
            if pd.isna(event['lat']) or pd.isna(event['lng']):
                print(f"Skipping event {event['event_id']} - missing coordinates")
                continue

            # Parse the start_time to get the date
            try:
                event_time = pd.to_datetime(event['start_time'])
                event_date = event_time.strftime('%Y-%m-%d')
            except:
                print(f"Skipping event {event['event_id']} - invalid date format")
                continue

            # Prepare API parameters
            params = {
                "latitude": event['lat'],
                "longitude": event['lng'],
                "start_date": event_date,
                "end_date": event_date,
                "daily": ["weather_code", "temperature_2m_mean", "precipitation_sum",
                          "precipitation_hours", "wind_speed_10m_max"],
                "timeformat": "unixtime",
                "timezone": "GMT"  # Using GMT as a default
            }

            try:
                # Make API request
                responses = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=params)
                response = responses[0]

                # Process daily data
                daily = response.Daily()

                # Get weather data
                weather_code = daily.Variables(0).ValuesAsNumpy()[0]
                temperature = daily.Variables(1).ValuesAsNumpy()[0]
                precipitation_sum = daily.Variables(2).ValuesAsNumpy()[0]
                precipitation_hours = daily.Variables(3).ValuesAsNumpy()[0]
                wind_speed = daily.Variables(4).ValuesAsNumpy()[0]

                # Store data in result dataframe
                result_df.loc[idx, 'weather_code'] = weather_code
                result_df.loc[idx, 'temperature_2m_mean'] = temperature
                result_df.loc[idx, 'precipitation_sum'] = precipitation_sum
                result_df.loc[idx, 'precipitation_hours'] = precipitation_hours
                result_df.loc[idx, 'wind_speed_10m_max'] = wind_speed

                # Add a small delay to avoid rate limiting
                time.sleep(0.1)

            except Exception as e:
                print(f"Error getting weather for event {event['event_id']}: {e}")

        # Add a delay between batches to avoid rate limiting
        time.sleep(1)

    return result_df



In [None]:
result = get_weather_for_events(events_first_half)


Processing batch 1/94
Processing batch 2/94
Processing batch 3/94
Processing batch 4/94
Processing batch 5/94
Processing batch 6/94
Processing batch 7/94
Processing batch 8/94
Processing batch 9/94
Processing batch 10/94
Processing batch 11/94
Processing batch 12/94
Processing batch 13/94
Processing batch 14/94
Processing batch 15/94
Processing batch 16/94
Processing batch 17/94
Processing batch 18/94
Processing batch 19/94
Processing batch 20/94
Processing batch 21/94
Processing batch 22/94
Processing batch 23/94
Processing batch 24/94
Processing batch 25/94
Processing batch 26/94
Processing batch 27/94
Processing batch 28/94
Processing batch 29/94
Processing batch 30/94
Processing batch 31/94
Processing batch 32/94
Processing batch 33/94
Processing batch 34/94
Processing batch 35/94
Processing batch 36/94
Processing batch 37/94
Processing batch 38/94
Processing batch 39/94
Processing batch 40/94
Processing batch 41/94
Processing batch 42/94
Processing batch 43/94
Processing batch 44/

In [None]:
result_2 = get_weather_for_events(events_second_half)

NameError: name 'get_weather_for_events' is not defined

In [None]:
result.tail()

Unnamed: 0,event_id,start_time,city,lat,lng,yes_count,maybe_count,invited_count,no_count,total_users,weather_code,temperature_2m_mean,precipitation_sum,precipitation_hours,wind_speed_10m_max
8413,27705020,2012-10-30T23:30:00.003Z,Royal Oak,42.49,-83.143,40,47,438,14,539,73.0,3.949833,10.9,15.0,38.936623
8414,1869146953,2012-12-15T00:00:00.001Z,Hamilton,43.257,-79.866,39,23,1218,43,1323,3.0,1.485333,0.0,0.0,22.881956
8416,1142967652,2012-07-24T19:00:00.000Z,Toronto,43.646,-79.394,15,5,1655,44,1719,61.0,24.396254,3.8,2.0,25.623301
8419,1365361942,2012-11-17T03:00:00.003Z,Moreno Valley,33.917,-117.249,47,46,473,20,586,51.0,14.368583,0.5,3.0,8.669949
8421,843844488,2012-09-23T03:00:00.003Z,Medan,3.567,98.696,1278,435,7913,373,9999,63.0,25.790667,6.3,5.0,9.422101


In [None]:
result_2.head()

In [None]:
pd.read_csv("/home/nkama/masters_thesis_project/event_rec_engine_challenge/user_friends.csv").head()

Unnamed: 0,user,friends
0,3197468391,1346449342 3873244116 4226080662 1222907620 54...
1,3537982273,1491560444 395798035 2036380346 899375619 3534...
2,823183725,1484954627 1950387873 1652977611 4185960823 42...
3,1872223848,83361640 723814682 557944478 1724049724 253059...
4,3429017717,4253303705 2130310957 1838389374 3928735761 71...
