In [1]:
import dask.dataframe as dd
import pandas as pd

In [2]:
# Read the users table with pandas (dask is imported above but is not used for this load)
df_user = pd.read_csv('/home/nkama/masters_thesis_project/thesis/data/event_rec_engine_challenge/users.csv')
                    

In [3]:
df_user.head()

Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240.0
3,1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0


In [4]:
"""
Load the saved user_event_pairs dataframe so we don't have to rerun all the code needed
to create the dataframe should the kernel crash when processing large data volume.
"""

user_event_pairs = pd.read_csv("/home/nkama/masters_thesis_project/thesis/interactions.csv")

user_event_pairs.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status
0,0,1159822043,1975964455,yes
1,1,1159822043,252302513,yes
2,2,1159822043,4226086795,yes
3,3,1159822043,3805886383,yes
4,4,1159822043,1420484491,yes


In [10]:
unique_users = user_event_pairs['user_id'].unique()

In [11]:
# Cast the id column of both frames to str so they share one dtype.
# Note: `unique_users` was collected in the previous cell, before this cast, so it
# still holds the ids in whatever dtype the CSV reader produced.
user_event_pairs['user_id'] = user_event_pairs['user_id'].astype(str)
df_user['user_id'] = df_user['user_id'].astype(str)


In [13]:
# Keep only the users that appear in the interactions table.
# `unique_users` is captured before the id columns are cast to str, so both sides
# are normalised to str here; comparing a str column against non-str ids would
# match nothing on a clean top-to-bottom run.
interaction_user_ids = pd.Series(unique_users).astype(str)
filtered_user_data = df_user[df_user["user_id"].astype(str).isin(interaction_user_ids)]
filtered_user_data.head()

Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0
7,3473687777,id_ID,1965,female,2012-10-03T12:19:29.975Z,Medan Indonesia,420.0
12,1355996271,id_ID,1993,female,2012-10-26T15:34:46.113Z,Djokja Yogyakarta Indonesia,420.0


In [14]:
filtered_user_data.isnull().sum()

user_id         0
locale          0
birthyear     234
gender         42
joinedAt       17
location     1429
timezone       38
dtype: int64

In [15]:
filtered_user_data = filtered_user_data.drop(["timezone","locale"],axis=1)

In [16]:
filtered_user_data = filtered_user_data.dropna()
filtered_user_data

Unnamed: 0,user_id,birthyear,gender,joinedAt,location
0,3197468391,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia
1,3537982273,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia
7,3473687777,1965,female,2012-10-03T12:19:29.975Z,Medan Indonesia
12,1355996271,1993,female,2012-10-26T15:34:46.113Z,Djokja Yogyakarta Indonesia
13,2411726276,1996,female,2012-10-30T05:16:27.136Z,Triolet Mauritius
...,...,...,...,...,...
38187,3566369790,1989,male,2012-10-30T08:56:19.611Z,Jogjakarta Indonesia
38190,2630548270,1988,male,2012-10-29T16:02:19.692Z,Yogyakarta
38199,280076614,1995,female,2012-11-22T12:06:08.668Z,Phnom Penh
38203,3890944219,1993,female,2012-11-04T07:18:39.354Z,Kebumen Jawa Tengah Indonesia


In [17]:
len(filtered_user_data)

9922

In [18]:
# Print summary information
print(f"Original users: {len(df_user)}, Filtered users: {len(filtered_user_data)}")

Original users: 38209, Filtered users: 9922


Derive an age column from each user's birth year and the event start times

In [19]:
# Load only the first three columns of the events file and discard rows that
# have no start_time. The column is still text at this point; it is parsed to
# datetimes in the next cell.
events_data = pd.read_csv(
    "/home/nkama/masters_thesis_project/thesis/data/event_rec_engine_challenge/events.csv",
    usecols=range(3),
).dropna(subset=['start_time'])

In [20]:
from datetime import datetime


# Work on an explicit copy so the column assignments below act on an independent
# frame rather than on a slice of df_user (avoids SettingWithCopyWarning).
filtered_user_data = filtered_user_data.copy()

# Replace the column outright. `.loc[:, 'birthyear'] = ...` may write the values
# into the existing column and keep its previous dtype, which would leave
# birthyear (and therefore age) non-integer.
filtered_user_data['birthyear'] = filtered_user_data['birthyear'].astype(int)

# Parse the event start times; values that cannot be parsed become NaT.
events_data['start_time'] = pd.to_datetime(events_data['start_time'], errors='coerce')

# Use the year of the median event date as the reference point for age, i.e. the
# "typical" point in time covered by the dataset.
median_event_date = events_data['start_time'].median()
if pd.isna(median_event_date):
    # Without any parseable start_time every age would silently become NaN.
    raise ValueError("No valid event start_time values; cannot derive a reference year")
reference_year = median_event_date.year
filtered_user_data['age'] = reference_year - filtered_user_data['birthyear']

# Verify the changes
print("Sample of data with age:")
print(filtered_user_data[['user_id', 'birthyear', 'age']].head())

# Count implausible ages (negative or above 100). These rows are only reported
# here; they are not removed from filtered_user_data.
invalid_ages = filtered_user_data[(filtered_user_data['age'] < 0) | (filtered_user_data['age'] > 100)]
print(f"\nNumber of invalid ages: {len(invalid_ages)}")



Sample of data with age:
       user_id birthyear age
0   3197468391      1993  19
1   3537982273      1992  20
7   3473687777      1965  47
12  1355996271      1993  19
13  2411726276      1996  16

Number of invalid ages: 30


In [21]:
len(filtered_user_data)

9922

In [22]:
filtered_user_data.isnull().sum()

user_id      0
birthyear    0
gender       0
joinedAt     0
location     0
age          0
dtype: int64

In [23]:
filtered_user_data = filtered_user_data.drop("birthyear",axis=1)


In [24]:
filtered_user_data.head()


Unnamed: 0,user_id,gender,joinedAt,location,age
0,3197468391,male,2012-10-02T06:40:55.524Z,Medan Indonesia,19
1,3537982273,male,2012-09-29T18:03:12.111Z,Medan Indonesia,20
7,3473687777,female,2012-10-03T12:19:29.975Z,Medan Indonesia,47
12,1355996271,female,2012-10-26T15:34:46.113Z,Djokja Yogyakarta Indonesia,19
13,2411726276,female,2012-10-30T05:16:27.136Z,Triolet Mauritius,16


In [25]:
# Display-only: the re-indexed frame is not assigned, so `filtered_user_data`
# itself keeps its original (non-contiguous) index after this cell.
filtered_user_data.reset_index(drop=True)

Unnamed: 0,user_id,gender,joinedAt,location,age
0,3197468391,male,2012-10-02T06:40:55.524Z,Medan Indonesia,19
1,3537982273,male,2012-09-29T18:03:12.111Z,Medan Indonesia,20
2,3473687777,female,2012-10-03T12:19:29.975Z,Medan Indonesia,47
3,1355996271,female,2012-10-26T15:34:46.113Z,Djokja Yogyakarta Indonesia,19
4,2411726276,female,2012-10-30T05:16:27.136Z,Triolet Mauritius,16
...,...,...,...,...,...
9917,3566369790,male,2012-10-30T08:56:19.611Z,Jogjakarta Indonesia,23
9918,2630548270,male,2012-10-29T16:02:19.692Z,Yogyakarta,24
9919,280076614,female,2012-11-22T12:06:08.668Z,Phnom Penh,17
9920,3890944219,female,2012-11-04T07:18:39.354Z,Kebumen Jawa Tengah Indonesia,19


In [113]:
# Persist the cleaned users. No `index=` argument is passed, so the row index is
# written as well — presumably the origin of the "Unnamed: 0" column seen when
# this file is read back later; confirm before changing.
filtered_user_data.to_csv("filtered_user_data.csv")

In [26]:
from get_user_location_data import add_user_coordinates
import pandas as pd


In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
import time


df = filtered_user_data.copy()
# Initialize geocoder
geolocator = Nominatim(user_agent="location_coordinates_finder")

# Function to get latitude and longitude
def get_coordinates(location):
    """
    Look up a location string with the module-level ``geolocator``.

    Sleeps one second before every request to respect the geocoding service's
    rate limits.

    Returns a ``(latitude, longitude)`` tuple, or ``(None, None)`` when the
    geocoder finds no match or any exception is raised (the error is printed).
    """
    try:
        time.sleep(1)
        match = geolocator.geocode(location)
        return (match.latitude, match.longitude) if match else (None, None)
    except Exception as e:
        print(f"Error geocoding {location}: {e}")
        return None, None

# Add latitude and longitude columns.
# Geocode each distinct location once and reuse the result for every user that
# shares it: get_coordinates sleeps a second per call, so a row-by-row lookup
# repeats the same slow request for every duplicate location.
coords_by_location = {loc: get_coordinates(loc) for loc in df['location'].unique()}
lat_lon_pairs = [coords_by_location.get(loc, (None, None)) for loc in df['location']]
df['lat'] = [lat for lat, _ in lat_lon_pairs]
df['lon'] = [lon for _, lon in lat_lon_pairs]

# Display the result
#df

In [102]:
df.head()

Unnamed: 0,user_id,gender,joinedAt,location,age
0,3197468391,male,2012-10-02T06:40:55.524Z,Medan Indonesia,19
1,3537982273,male,2012-09-29T18:03:12.111Z,Medan Indonesia,20
7,3473687777,female,2012-10-03T12:19:29.975Z,Medan Indonesia,47
12,1355996271,female,2012-10-26T15:34:46.113Z,Djokja Yogyakarta Indonesia,19
13,2411726276,female,2012-10-30T05:16:27.136Z,Triolet Mauritius,16


In [106]:
# import the add_user_coordinates function to retrieve user cordinates
from get_user_location_data import add_user_coordinates

In [None]:
users_df = add_user_coordinates(filtered_user_data[["location"]])

In [36]:
filtered_user_data = pd.read_csv("/home/nkama/masters_thesis_project/thesis/filtered_user_data.csv")
users_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/users_location.csv")
event_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/enriched_events_1.csv")

In [32]:
len(filtered_user_data), len(users_df), len(event_df)

(9922, 9922, 4000)

In [37]:
users_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,gender,joinedAt,location,age,lat,lng
0,0,3197468391,male,2012-10-02T06:40:55.524Z,Medan Indonesia,19,3.59,98.674
1,1,3537982273,male,2012-09-29T18:03:12.111Z,Medan Indonesia,20,3.59,98.674
2,2,3473687777,female,2012-10-03T12:19:29.975Z,Medan Indonesia,47,3.59,98.674
3,4,2411726276,female,2012-10-30T05:16:27.136Z,Triolet Mauritius,16,-20.057,57.552
4,5,1091720544,female,2012-10-30T11:55:19.469Z,Plaine Des Papayes Pamplemousses Mauritius,24,-20.063,57.581


In [None]:
# No `on=` is given, so pandas joins on every column name the two frames share
# (inner join by default); the "Unnamed: 0" column is then dropped from the result.
users_data = filtered_user_data.merge(users_df).drop("Unnamed: 0",axis=1)

In [17]:
users_data.head()

Unnamed: 0,user_id,gender,joinedAt,location,age,lat,lng
0,3197468391,male,2012-10-02T06:40:55.524Z,Medan Indonesia,19,3.59,98.674
1,3537982273,male,2012-09-29T18:03:12.111Z,Medan Indonesia,20,3.59,98.674
2,3473687777,female,2012-10-03T12:19:29.975Z,Medan Indonesia,47,3.59,98.674
3,1355996271,female,2012-10-26T15:34:46.113Z,Djokja Yogyakarta Indonesia,19,,
4,2411726276,female,2012-10-30T05:16:27.136Z,Triolet Mauritius,16,-20.057,57.552


In [18]:
len(users_data)

9922

In [19]:
users_data.isnull().sum()

user_id       0
gender        0
joinedAt      0
location      0
age           0
lat         398
lng         398
dtype: int64

In [20]:
users_data = users_data.dropna()

In [21]:
len(users_data)

9524

In [22]:
users_data.to_csv("users_data.csv")

## Create user onboarding data

In [41]:
users_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/users_data.csv")
events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/enriched_events_1.csv")
interactions_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/interactions.csv")

In [42]:
len(events_df)

4000

In [56]:
events_df.category.nunique()

12

In [39]:
interactions_df.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status
0,0,1159822043,1975964455,yes
1,1,1159822043,252302513,yes
2,2,1159822043,4226086795,yes
3,3,1159822043,3805886383,yes
4,4,1159822043,1420484491,yes


In [None]:
# Keep only the event attributes needed downstream, then attach them to every
# interaction. The inner join keeps only interactions whose event_id is present
# in events_df.
col = [
    "event_id",
    "start_time",
    "city",
    "lat",
    "lng",
    "category",
    "title",
    "description",
    "event_type",
]
events_df = events_df[col]

interactions = pd.merge(interactions_df, events_df, on='event_id', how='inner')

In [48]:
interactions.head(2)

Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status,start_time,city,lat,lng,category,title,description,event_type
0,4062,1474957705,212309002,yes,2012-08-09T23:30:00.000Z,Toronto,43.676,-79.357,Arts & Culture,"Canvas of Sound: A Night of Art, Music, and Da...","Join us for ""Canvas of Sound,"" an electrifying...",Indoor
1,4063,1474957705,269775255,yes,2012-08-09T23:30:00.000Z,Toronto,43.676,-79.357,Arts & Culture,"Canvas of Sound: A Night of Art, Music, and Da...","Join us for ""Canvas of Sound,"" an electrifying...",Indoor


In [58]:
len(interactions_df)

11245010

In [57]:
len(interactions)

2319318

In [60]:
interactions.category.nunique()

12

In [62]:
import json
# Map each user_id to the JSON-encoded list of distinct categories among the
# events that user interacted with.
user_categories_dict = {}
# Group by user_id and get unique categories
# Note: `user_id`, `group` and `categories` stay bound after the loop (holding
# the last group's values); a later cell displays `categories`.
for user_id, group in interactions.groupby('user_id'):
    # Get unique categories as a list
    categories = group['category'].unique().tolist()
    # Filtering against EVENT_CATEGORIES is disabled here, so every category is kept
    #interests = [cat for cat in categories if cat in EVENT_CATEGORIES]
    # Store as a JSON string (not a list), so len() of a value is a character count
    user_categories_dict[user_id] = json.dumps(categories)

In [None]:
# Show the users that have at least two distinct categories.
# The dict values are JSON strings, so they must be decoded before counting;
# len() of the raw string is a character count and is always >= 2 (even "[]").
# Only the matching user's entry is printed, not the whole dictionary.
for uid, categories_json in user_categories_dict.items():
    decoded_categories = json.loads(categories_json)
    if len(decoded_categories) >= 2:
        print(uid, decoded_categories)

In [54]:
categories

['Seasonal & Festivals', 'Health & Wellness', 'Music & Concerts']

In [None]:
import pandas as pd
import json

# Define the event categories
EVENT_CATEGORIES = [
    'Music & Concerts',      # Covers live music, DJ performances, outdoor concerts
    'Food & Drink',          # Covers wine/beer tastings, food festivals, mixology, food truck rallies
    'Education & Learning',  # Covers workshops, educational events, book launches
    'Sports & Fitness',      # Covers sports viewing, fitness challenges
    'Arts & Culture',        # Covers art exhibits, photography, fashion, poetry, cultural celebrations
    'Business & Networking', # Covers networking, industry roundtables, product launches
    'Technology',            # Covers tech conferences, gaming tournaments
    'Community & Causes',    # Covers charity fundraisers, community gatherings, farmers markets
    'Health & Wellness',     # Covers wellness retreats, cooking workshops (health-focused)
    'Entertainment',         # Covers comedy, trivia, karaoke, film screenings
    'Seasonal & Festivals',  # Covers seasonal celebrations, street festivals, craft fairs
    'Immersive Experiences'  # Covers immersive events that don't fit other categories
]

def get_user_interests(interactions_df, categories=None):
    """
    Attach each user's list of interest categories to every interaction row.

    A user's interests are the distinct values of ``category`` across that
    user's rows, in order of first appearance, restricted to the allowed
    categories.

    Parameters:
    - interactions_df: DataFrame with at least ``user_id`` and ``category`` columns
    - categories: optional iterable of allowed category names; when omitted the
      module-level EVENT_CATEGORIES list is used

    Returns:
    - A copy of interactions_df with an added ``user_interest`` column holding a
      list of category names per row (an empty list when nothing qualifies).
      The input frame is not modified.
    """
    allowed = set(EVENT_CATEGORIES if categories is None else categories)

    # Work on a copy so the caller's frame is left untouched
    df = interactions_df.copy()

    # Distinct categories per user, in order of first appearance
    seen_per_user = df.groupby('user_id')['category'].unique()
    interests_by_user = {
        user_id: [cat for cat in seen if cat in allowed]
        for user_id, seen in seen_per_user.items()
    }

    # Give every row its own list so mutating one row's value cannot leak into
    # other rows of the same user. Ids that formed no group (e.g. NaN) get [].
    df['user_interest'] = [
        list(interests_by_user.get(user_id, ())) for user_id in df['user_id']
    ]

    return df

def analyze_user_interests(df):
    """
    Print summary statistics about user interests and return the tallies.

    Parameters:
    - df: DataFrame with ``user_id`` and ``user_interest`` (list per row) columns

    Returns:
    - Series counting, over the first row of each user, how many users have
      each interest
    """
    # One row per user: the first occurrence of each user_id
    per_user = df[['user_id', 'user_interest']].drop_duplicates('user_id')
    interest_lists = per_user['user_interest']

    total_users = df['user_id'].nunique()
    users_with_interests = sum(1 for interests in interest_lists if len(interests) > 0)

    # Flatten every user's list into one sequence and tally it
    flattened = [interest for interests in interest_lists for interest in interests]
    interest_counts = pd.Series(flattened).value_counts()

    print(f"Total unique users: {total_users}")
    print(f"Users with identified interests: {users_with_interests}")
    print("\nMost common user interests:")
    print(interest_counts)

    return interest_counts

def create_user_interests_summary(df, categories=None):
    """
    Create a per-user summary of interests that contains no list objects.

    Parameters:
    - df: DataFrame with ``user_id`` and ``user_interest`` (list per row) columns;
      only the first row of each user_id is used
    - categories: optional iterable of category names to create flag columns
      for; when omitted the module-level EVENT_CATEGORIES list is used

    Returns:
    - DataFrame with one row per user: ``user_id``, ``num_interests``,
      ``interests_joined`` (comma-separated) and one 0/1 ``likes_<category>``
      column per category. The columns are present even for empty input.
    """
    category_list = list(EVENT_CATEGORIES if categories is None else categories)

    # The flag column names do not depend on the user, so build them once
    flag_columns = [
        f'likes_{category.lower().replace(" & ", "_").replace(" ", "_")}'
        for category in category_list
    ]

    # Get unique users
    user_df = df[['user_id', 'user_interest']].drop_duplicates('user_id')

    summary_data = []
    for user_id, interests in zip(user_df['user_id'], user_df['user_interest']):
        user_summary = {
            'user_id': user_id,
            'num_interests': len(interests),
            'interests_joined': ', '.join(interests),
        }
        # Add individual interest flags
        for column, category in zip(flag_columns, category_list):
            user_summary[column] = int(category in interests)
        summary_data.append(user_summary)

    # Fix the column order (dropping any repeated names) so that empty input
    # still yields a frame with the expected columns
    columns = list(dict.fromkeys(['user_id', 'num_interests', 'interests_joined'] + flag_columns))
    return pd.DataFrame(summary_data, columns=columns)

# if __name__ == "__main__":
#     # Create and process the expanded sample data
#     sample_df = create_expanded_sample()
#     print("Sample User-Event Interactions:")
#     print(sample_df)
    
#     # Process the sample data
#     enriched_df = get_user_interests(sample_df)
    
#     # Display the results - avoiding unique() operations on list columns
#     print("\nEnriched DataFrame with User Interests:")
#     for user_id in enriched_df['user_id'].unique():
#         user_data = enriched_df[enriched_df['user_id'] == user_id].iloc[0]
#         print(f"User {user_id}: {user_data['user_interest']}")
    
#     # Create a summarized view for analysis that avoids list objects
#     user_summary = create_user_interests_summary(enriched_df)
#     print("\nUser Interests Summary (hashable format):")
#     print(user_summary)
    
#     # Analyze user interests using our safe method
#     print("\nUser Interest Analysis:")
#     interest_stats = analyze_user_interests(enriched_df)