# **Create Event Data**

In [2]:
# Importing dask and pandas 
import dask.dataframe as dd # for large datasets
import pandas as pd


In [6]:
# Loading the events data from events.csv file using dask
df_event = dd.read_csv('/home/nkama/masters_thesis_project/thesis/data/event_rec_engine_challenge/events.csv',
                 dtype={
                     'event_id': 'object',
                     'city': 'object',
                     'country': 'object',
                     'state': 'object',
                     'zip': 'object'
                 })

In [7]:
df_event.head()

Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,2,...,0,1,0,0,0,0,0,0,0,9
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,2,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,9


The idea here is to select from the event dataframe events that are also find in the
attendees data file. This is to ensure that we collect events with records of some interactions (attendance information) with users.

Since the dataset is very large, we can work with only the first 9 columns of the event data for now to reduce computational cost. Unfortunately, the data does't have event title and description which we also need int the project and so we will use the rest of the columns which are bag of words representation of the events title and descriptions will be used later on to synthesize the event title and descriptions.

In [8]:
# loading the first 9 columns of the event data
new_event_df = df_event.iloc[:,:9]
new_event_df.head()


Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,


In [7]:
len(new_event_df) # check the length


3137972

In [12]:
# Load the interactions data to be used to filter events fromt the event data.

user_event_pairs = pd.read_csv("/home/nkama/masters_thesis_project/thesis/interactions.csv")

user_event_pairs.head()


Unnamed: 0.1,Unnamed: 0,event_id,user_id,attendance_status
0,0,1159822043,1975964455,yes
1,1,1159822043,252302513,yes
2,2,1159822043,4226086795,yes
3,3,1159822043,3805886383,yes
4,4,1159822043,1420484491,yes


In [13]:
# Extract unique event_ids and user_ids
unique_events = user_event_pairs['event_id'].unique()

# Print the number of unique events 
print("Number of unique events:", len(unique_events))

# Check data types
print("Data types in user_event_pairs:")
print(user_event_pairs['event_id'].dtype)
print("\nData types in new_event_df:")
print(new_event_df['event_id'].dtype)

Number of unique events: 22710
Data types in user_event_pairs:
int64

Data types in new_event_df:
string


In [14]:

# Convert both to string type to ensure matching
user_event_pairs['event_id'] = user_event_pairs['event_id'].astype(str)
new_event_df['event_id'] = new_event_df['event_id'].astype(str)


In [None]:
# Filter events data
filtered_events_data = df_event[df_event["event_id"].isin(unique_events)].compute()
filtered_events_data.head()


In [4]:
filtered_events_data = pd.read_csv("/home/nkama/masters_thesis_project/thesis/filtered_events_data (1).csv")
filtered_events_data.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,...,0,1,0,0,0,0,0,0,0,9
1,1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,...,0,0,0,0,0,0,0,0,0,7
2,2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,...,0,0,0,0,0,0,0,0,0,12
3,3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,...,0,0,0,0,0,0,0,0,0,8
4,4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,...,0,0,0,0,0,0,0,0,0,9


In [9]:
# Print the shape of filtered data
print("\nOriginal events shape:", new_event_df.shape[0].compute())
print("Filtered events shape:", filtered_events_data.shape)


Original events shape: 3137972
Filtered events shape: (22471, 111)


In [15]:
# Show counts of matching events
print("\nNumber of unique events:", len(unique_events))
#print("Number of events in filtered data:", len(filtered_events_data))
print("Number of matching events:", len(set(unique_events).intersection(set(filtered_events_data['event_id']))))


Number of unique events: 22710
Number of matching events: 22471


In [16]:
# Check for missing values in columns
filtered_events_data[["city",	"state",	"zip",	"country",	"lat",	"lng"]].isnull().sum()

city       10782
state      13540
zip        18941
country    10711
lat         8135
lng         8135
dtype: int64

In [None]:
#  Drop irrelevant columns 
filtered_events_data.drop(columns=(["user_id", "state",	"zip",	"country"]), inplace=True)
print(len(filtered_events_data))
filtered_events_data.head()

22471


Unnamed: 0.1,Unnamed: 0,event_id,start_time,city,lat,lng,c_1,c_2,c_3,c_4,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,0,684921758,2012-10-31T00:00:00.001Z,,,,2,0,2,0,...,0,1,0,0,0,0,0,0,0,9
1,1,244999119,2012-11-03T00:00:00.001Z,,,,2,0,2,0,...,0,0,0,0,0,0,0,0,0,7
2,2,3928440935,2012-11-05T00:00:00.001Z,,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12
3,3,2582345152,2012-10-30T00:00:00.001Z,,,,1,0,2,1,...,0,0,0,0,0,0,0,0,0,8
4,4,1051165850,2012-09-27T00:00:00.001Z,,,,1,1,0,0,...,0,0,0,0,0,0,0,0,0,9


In [21]:
# Drop missing values.
filtered_events_data.dropna(subset=["city",	"lat",	"lng"], inplace=True)
len(filtered_events_data)

11627

In [22]:
# Save filtered events data to csv
filtered_events_data.to_csv("filtered_events_data.csv")

In [24]:
# event_derails script to generate event titlw, description and category.
from event_details import synthesize_event_details, add_variety

In [25]:
#events_df = load_events_data('/content/large_col_events.csv')
enriched_events = synthesize_event_details(filtered_events_data)
enriched_events = add_variety(enriched_events)


Processed 3000 events
Processed 6000 events
Processed 7000 events
Processed 8000 events
Processed 9000 events
Processed 10000 events
Processed 12000 events
Processed 13000 events
Processed 14000 events
Processed 18000 events
Processed 21000 events
Processed 22000 events


In [27]:
enriched_events.head(3)

Unnamed: 0.1,Unnamed: 0,event_id,start_time,city,lat,lng,c_1,c_2,c_3,c_4,...,c_95,c_96,c_97,c_98,c_99,c_100,c_other,category,title,description
40,40,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,0,3,1,1,...,0,0,0,0,0,0,28,Technology,Smart {tech_area} Session,We're excited to bring you this Summer meetup ...
51,51,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,5,3,3,1,...,0,1,0,0,0,0,103,Arts & Culture,Palo Alto {art_form} Collection,We're excited to bring you this Summer install...
74,74,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,5,3,3,1,...,0,1,0,0,0,0,103,Arts & Culture,Karachi {art_form} Museum,We're excited to bring you this Weekend showin...


In [29]:
enriched_events.to_csv('enriched_events.csv', index=False)

In [28]:
len(enriched_events)


11627

In [32]:
enriched_events = pd.read_csv("/home/nkama/masters_thesis_project/thesis/enriched_events.csv")
enriched_events.isnull().sum().sum()

np.int64(0)

In [33]:

enriched_events_col = ["event_id",	"start_time",	"city",	"lat",	"lng",	"category",	"title",	"description"]
enriched_events = enriched_events[enriched_events_col]
enriched_events.head()


Unnamed: 0,event_id,start_time,city,lat,lng,category,title,description
0,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,Technology,Smart {tech_area} Session,We're excited to bring you this Summer meetup ...
1,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,Arts & Culture,Palo Alto {art_form} Collection,We're excited to bring you this Summer install...
2,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,Arts & Culture,Karachi {art_form} Museum,We're excited to bring you this Weekend showin...
3,3580637647,2012-10-22T10:00:00.003Z,Los Angeles,3.156,101.612,Arts & Culture,{art_form} Gallery: Traditional Feature,We're excited to bring you this Weekend displa...
4,1924180022,2012-11-11T20:00:00.003Z,Palo Alto,37.416,-122.152,Entertainment,Live {genre} Event,Be part of for this featured tour in Palo Alto...


In [34]:
enriched_events.to_csv("events_with_titles.csv")


## Merge with attendees count


In [None]:
#load attendee counts df
attendance_counts = pd.read_csv("attendance_counts.csv")
attendance_counts.head()

In [None]:

# Drop the 'Unnamed: 0' column
#enriched_events2 = attendance_counts.drop(columns=['Unnamed: 0'])

attendance_counts = attendance_counts.rename(columns={'event': 'event_id'})
attendance_counts['event_id'] = attendance_counts['event_id'].astype('object')


# Merge the DataFrames with inner join to keep only common events
events_data = enriched_events.merge(
    attendance_counts,
    on='event_id',
    how='inner'
)


In [None]:

len(events_data)


In [None]:
events_data.head()


In [None]:
columns = ["event_id",	"start_time",	"city",	"lat",	"lng", "category",
                       "title",	"description",	"yes_count",	"maybe_count",	"invited_count",
                       "no_count",	"total_users"]
events_data = events_data[columns]
events_data.head()


In [None]:
events_data.to_csv("events_data.csv") # Save data to csv.


## Add Weather data to the event data


In [None]:
!pip install openmeteo-requests
!pip install requests-cache retry-requests


In [23]:
# loading the saved events data
events_data = pd.read_csv("/home/nkama/masters_thesis_project/thesis/events_data.csv").drop(columns=["Unnamed: 0"])
print(len(events_data))
events_data.head()

11627


Unnamed: 0,event_id,start_time,city,lat,lng,category,title,description,yes_count,maybe_count,invited_count,no_count,total_users
0,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,Sports & Fitness,Sihanoukville {sport} Challenge,Join the regional {sport} community for our Ho...,93,65,317,47,522
1,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,Music & Concerts,{genre} Album: Holiday Series,Join us for an indie {genre} festival in Palo ...,162,112,1021,150,1445
2,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,Business & Networking,Karachi {field} Meetup,Don't miss our industry conference happening i...,22,8,993,26,1049
3,3580637647,2012-10-22T10:00:00.003Z,Los Angeles,3.156,101.612,Health & Wellness,{practice} Class: Healing Strengthening,Don't miss our healing therapy happening in Lo...,63,42,430,20,555
4,1924180022,2012-11-11T20:00:00.003Z,Palo Alto,37.416,-122.152,Sports & Fitness,Fall {sport} League,Challenge yourself at the Palo Alto {sport} co...,41,12,53,1,107


To get weather data, we have to make api calls to the Open-Meteo weather API. But first, we split the events data into three. this is to ensure that we meet the rate limit of max 5000 calls within an hour.


In [5]:
# importing the script with functions to split dataframe, retreive weather info and concat all splits  
from weather_data import split_eventsdf_into_three, get_weather_for_events, concat_all_splits

In [6]:
# Split the event_data into three
first_split, second_split, last_split = split_eventsdf_into_three(events_data)
len(first_split), len(second_split), len(last_split)

(3875, 3875, 3877)

In [1]:
# Confirm complete length (11627)
3875+ 3875+ 3877


11627

In [39]:
first_split.head(3)

Unnamed: 0,event_id,start_time,city,lat,lng,category,title,description,yes_count,maybe_count,invited_count,no_count,total_users
0,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,Sports & Fitness,Sihanoukville {sport} Challenge,Join the regional {sport} community for our Ho...,93,65,317,47,522
1,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,Music & Concerts,{genre} Album: Holiday Series,Join us for an indie {genre} festival in Palo ...,162,112,1021,150,1445
2,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,Business & Networking,Karachi {field} Meetup,Don't miss our industry conference happening i...,22,8,993,26,1049


Now we run the get_weather_for_events function to retrieve weather data for each event from all three dataframe splits.


In [None]:
# Get weather information for the first split
first_split_weather = get_weather_for_events(first_split)
first_split_weather.head()

In [None]:
# Save first split with weather information to csv
first_split_weather.to_csv("first_split_weather.csv")

In [40]:
# Get weather information for the second split
second_split_weather = get_weather_for_events(second_split)
second_split_weather.to_csv("second_split_weather.csv")

Processing batch 1/78
Processing batch 2/78
Processing batch 3/78
Processing batch 4/78
Processing batch 5/78
Processing batch 6/78
Processing batch 7/78
Processing batch 8/78
Processing batch 9/78
Processing batch 10/78
Processing batch 11/78
Processing batch 12/78
Processing batch 13/78
Processing batch 14/78
Processing batch 15/78
Processing batch 16/78
Processing batch 17/78
Processing batch 18/78
Processing batch 19/78
Processing batch 20/78
Processing batch 21/78
Processing batch 22/78
Processing batch 23/78
Processing batch 24/78
Processing batch 25/78
Processing batch 26/78
Processing batch 27/78
Processing batch 28/78
Processing batch 29/78
Processing batch 30/78
Processing batch 31/78
Processing batch 32/78
Processing batch 33/78
Processing batch 34/78
Processing batch 35/78
Processing batch 36/78
Processing batch 37/78
Processing batch 38/78
Processing batch 39/78
Processing batch 40/78
Processing batch 41/78
Processing batch 42/78
Processing batch 43/78
Processing batch 44/

: 

In [8]:
# Get weather information for the last split
last_split_weather = get_weather_for_events(last_split)
# Save last split to a csv file
last_split_weather.to_csv("last_split_weather.csv")

Processing batch 1/78
Processing batch 2/78
Processing batch 3/78
Processing batch 4/78
Processing batch 5/78
Processing batch 6/78
Processing batch 7/78
Processing batch 8/78
Processing batch 9/78
Processing batch 10/78
Processing batch 11/78
Processing batch 12/78
Processing batch 13/78
Processing batch 14/78
Processing batch 15/78
Processing batch 16/78
Processing batch 17/78
Processing batch 18/78
Processing batch 19/78
Processing batch 20/78
Processing batch 21/78
Processing batch 22/78
Processing batch 23/78
Processing batch 24/78
Processing batch 25/78
Processing batch 26/78
Processing batch 27/78
Processing batch 28/78
Processing batch 29/78
Processing batch 30/78
Processing batch 31/78
Processing batch 32/78
Processing batch 33/78
Processing batch 34/78
Processing batch 35/78
Processing batch 36/78
Processing batch 37/78
Processing batch 38/78
Processing batch 39/78
Processing batch 40/78
Processing batch 41/78
Processing batch 42/78
Processing batch 43/78
Processing batch 44/

In [12]:
first_split_weather = pd.read_csv("/home/nkama/masters_thesis_project/thesis/first_split_weather.csv")
second_split_weather = pd.read_csv("/home/nkama/masters_thesis_project/thesis/second_split_weather.csv")

In [13]:
# Concatenate all splits with the weather information
complete_event_data = concat_all_splits(first_split_weather,second_split_weather,last_split_weather)
complete_event_data.head()


Unnamed: 0.1,Unnamed: 0,event_id,start_time,city,lat,lng,category,title,description,yes_count,maybe_count,invited_count,no_count,total_users,weather_code,temperature_2m_mean,precipitation_sum,precipitation_hours,wind_speed_10m_max
0,0.0,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,Sports & Fitness,Sihanoukville {sport} Challenge,Join the regional {sport} community for our Ho...,93,65,317,47,522,51.0,28.318502,0.1,1.0,10.97262
1,1.0,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,Music & Concerts,{genre} Album: Holiday Series,Join us for an indie {genre} festival in Palo ...,162,112,1021,150,1445,3.0,18.417002,0.0,0.0,9.826088
2,2.0,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,Business & Networking,Karachi {field} Meetup,Don't miss our industry conference happening i...,22,8,993,26,1049,3.0,28.83725,0.0,0.0,30.421598
3,3.0,3580637647,2012-10-22T10:00:00.003Z,Los Angeles,3.156,101.612,Health & Wellness,{practice} Class: Healing Strengthening,Don't miss our healing therapy happening in Lo...,63,42,430,20,555,61.0,24.851418,7.200001,12.0,6.989936
4,4.0,1924180022,2012-11-11T20:00:00.003Z,Palo Alto,37.416,-122.152,Sports & Fitness,Fall {sport} League,Challenge yourself at the Palo Alto {sport} co...,41,12,53,1,107,3.0,10.835751,0.0,0.0,10.495713


In [14]:
len(complete_event_data)

11627

In [None]:
!pip install scikit-learn

In [None]:
!pip install openai

In [None]:
!pip install litellm

In [2]:
import os

In [43]:
import logging
# Configure logging to only show warnings and above
logging.getLogger("litellm").setLevel(logging.WARNING)
logging.getLogger().setLevel(logging.WARNING)

from litellm import completion

import pandas as pd
import numpy as np
import random
from datetime import datetime
import string
#import openai  # Make sure to import openai for the GPT API calls
# First check if the API keys exist before setting them
openai_key = os.getenv("OPENAI_API_KEY")
groq_key = os.getenv("GROQ_API_KEY")

OPENAI_API_KEY=openai_key
GROQ_API_KEY=groq_key

# Function to load your events data
def load_events_data(file_path):
    events_df = pd.read_csv(file_path)
    return events_df

# Define event categories - these will be assigned based on patterns rather than specific words
EVENT_CATEGORIES = [
    'Music & Concerts',      # Covers live music, DJ performances, outdoor concerts
    'Food & Drink',          # Covers wine/beer tastings, food festivals, mixology, food truck rallies
    'Education & Learning',  # Covers workshops, educational events, book launches
    'Sports & Fitness',      # Covers sports viewing, fitness challenges
    'Arts & Culture',        # Covers art exhibits, photography, fashion, poetry, cultural celebrations
    'Business & Networking', # Covers networking, industry roundtables, product launches
    'Technology',            # Covers tech conferences, gaming tournaments
    'Community & Causes',    # Covers charity fundraisers, community gatherings, farmers markets
    'Health & Wellness',     # Covers wellness retreats, cooking workshops (health-focused)
    'Entertainment',         # Covers comedy, trivia, karaoke, film screenings
    'Seasonal & Festivals',  # Covers seasonal celebrations, street festivals, craft fairs
    'Immersive Experiences'  # Covers immersive events that don't fit other categories
]

# Function to determine category based on count patterns rather than specific words
def determine_category(event_row):
    """
    Determine event category based on frequency distribution patterns,
    time of year, and other event attributes
    """
    # Extract count values
    count_columns = [col for col in event_row.index if col.startswith('count_') and col != 'count_other']
    counts = [event_row[col] for col in count_columns]
    
    # Get time/date information if available
    start_time = event_row.get('start_time', None)
    month = None
    hour = None
    
    if start_time and not pd.isna(start_time):
        try:
            dt = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
            month = dt.month  # 1-12
            hour = dt.hour    # 0-23
        except (ValueError, AttributeError):
            pass
    
    # Check if it might be a seasonal event
    seasonal_months = {
        'winter': [12, 1, 2],
        'spring': [3, 4, 5],
        'summer': [6, 7, 8],
        'fall': [9, 10, 11],
        'holiday': [11, 12]  # November, December for holiday events
    }
    
    # Special dates - could be expanded with more specific holiday checks
    is_seasonal_event = False
    if month:
        if month in seasonal_months['holiday']:
            is_seasonal_event = random.random() < 0.4  # 40% chance if in holiday months
        elif month in seasonal_months['summer']:
            is_seasonal_event = random.random() < 0.3  # 30% chance in summer
        else:
            is_seasonal_event = random.random() < 0.1  # 10% chance in other seasons
    
    # Evening events are more likely to be entertainment, music, food & drink
    is_evening_event = hour >= 18 if hour is not None else False
    
    # If it's determined to be a seasonal event
    if is_seasonal_event:
        return 'Seasonal & Festivals'
    
    # If we have no counts data, use time-based heuristics
    if not counts:
        if is_evening_event:
            return random.choice([
                'Music & Concerts', 
                'Entertainment', 
                'Food & Drink',
                'Arts & Culture'
            ])
        else:
            return random.choice(EVENT_CATEGORIES)
    
    # Analyze patterns in the counts
    total_words = sum(counts) + event_row.get('count_other', 0)
    num_unique_words = sum(1 for count in counts if count > 0)
    max_count = max(counts) if counts else 0
    concentration = max_count / total_words if total_words > 0 else 0
    
    # Heuristic categorization based on word patterns and time
    if num_unique_words > 15 and concentration < 0.15:
        # Diverse vocabulary, lower concentration
        if is_evening_event:
            return random.choice([
                'Business & Networking', 
                'Technology', 
                'Community & Causes',
                'Immersive Experiences'
            ])
        else:
            return random.choice([
                'Education & Learning', 
                'Business & Networking', 
                'Technology',
                'Community & Causes'
            ])
    elif concentration > 0.3:
        # High concentration on few words
        if is_evening_event:
            return random.choice([
                'Music & Concerts', 
                'Entertainment', 
                'Immersive Experiences'
            ])
        else:
            return random.choice([
                'Sports & Fitness', 
                'Music & Concerts',
                'Community & Causes'
            ])
    elif 0.2 <= concentration <= 0.3 and num_unique_words < 10:
        # Medium concentration, fewer unique words
        if is_evening_event:
            return random.choice([
                'Food & Drink', 
                'Arts & Culture', 
                'Entertainment'
            ])
        else:
            return random.choice([
                'Food & Drink', 
                'Arts & Culture', 
                'Health & Wellness'
            ])
    else:
        # Default to random selection with probability weights
        # Updated weights for all 12 categories
        return random.choices(
            EVENT_CATEGORIES,
            weights=[0.15, 0.12, 0.08, 0.08, 0.10, 0.08, 0.06, 0.08, 0.08, 0.07, 0.05, 0.05],
            k=1
        )[0]

# Create a function to format the time
def format_time(time_str):
    """Format ISO time string to readable time"""
    if pd.isna(time_str):
        return "TBD"
    
    try:
        dt = datetime.fromisoformat(time_str.replace('Z', '+00:00'))
        return dt.strftime("%A, %B %d at %-I:%M %p")  # e.g., "Monday, January 15 at 7:30 PM"
    except:
        return "TBD"

def llm_response(prompt, llm_model="llama3-8b-8192"):
    response = completion(
        model=f"groq/{llm_model}", 
        messages=[
            {"role": "user", "content": prompt}
        ],
    )
    return response.choices[0].message.content

def generate_llm_title(category, event_row):
    """Generate event title using an LLM"""
    city = event_row.get('city', 'Local')
    state = event_row.get('state', '')
    location = f"{city}, {state}" if state else city
    
    # Create a prompt for title generation
    title_prompt = f"""
    Generate a catchy, attention-grabbing title for a '{category}' event in {location}.

    Guidelines:
    - Keep it concise (4-7 words)
    - Make it memorable and distinctive
    - Capture the essence of the category
    - Use creative language that evokes emotion
    - No generic placeholders or templates

    The event could be similar to one of these event types:
    ['Live music', 'DJ performances', 'Dancing', 'Karaoke', 
    'Comedy shows', 'Trivia nights', 'Art exhibits', 'Film screenings', 
    'Wine tasting', 'Craft beer tasting', 'Mixology classes', 'Food festivals',
    'Sports viewing parties', 'Networking events', 'Tech conferences',
    'Cultural celebrations', 'Outdoor concerts', 'Poetry slams', 'Book launches',
    'Fitness challenges', 'Cooking workshops', 'Wellness retreats',
    'Photography exhibitions', 'Fashion shows', 'Gaming tournaments',
    'Charity fundraisers', 'Farmers markets', 'Craft fairs',
    'Food truck rallies', 'Seasonal celebrations', 'Street festivals',
    'Immersive experiences', 'Industry roundtables', 'Product launches',
    'Educational workshops', 'Community gatherings']

    Return ONLY the title without quotes, explanation, or any additional text.
    """
    
    # Generate title using LLM
    title = llm_response(title_prompt)
    
    # Clean up any quotation marks or extra formatting that might be returned
    title = title.strip().strip('"\'').strip()
    return title


# Function to generate event description and infer indoor/outdoor using GPT
def generate_llm_description(category, event_row, title):
    """Generate event description first, then infer event type (Indoor/Outdoor)."""
    city = event_row.get('city', 'Local')
    state = event_row.get('state', '')
    location = f"{city}, {state}" if state else city
    time_info = event_row.get('start_time', 'TBD')

    # Step 1: Generate event description using GPT
    description_prompt = f"""
    Generate a short and engaging 1-2 sentence event description based on the following details:

    Event Title: {title}
    Category: {category}
    Location: {location}
    Start Time: {time_info}

    The description should highlight the main features and excitement of the event.
    """

    generated_description = llm_response(description_prompt)


    # Step 2: Infer Indoor/Outdoor classification using GPT
    classification_prompt = f"""
    Based on the following event description, determine whether this event is 'Indoor' or 'Outdoor':

    "{generated_description}"

    Please return only 'Indoor' or 'Outdoor' as the response.
    """
    event_type = llm_response(classification_prompt)

    return event_type, generated_description



# Main function to synthesize event details
def synthesize_event_details(events_df):
    """
    Synthesize event titles, descriptions, and categories without knowing actual word stems
    
    Parameters:
    - events_df: DataFrame containing the events data
    
    Returns:
    - DataFrame with added columns: 'category', 'title', 'description', 'event_type'
    """
    
    # Create new columns
    events_df = events_df.copy()
    
    # Initialize new columns using loc
    events_df.loc[:, 'category'] = ''
    events_df.loc[:, 'title'] = ''
    events_df.loc[:, 'description'] = ''
    events_df.loc[:, 'event_type'] = ''
    
    # Process each event
    for idx, event in events_df.iterrows():
        # Determine category based on patterns
        category = determine_category(event)
        events_df.loc[idx, 'category'] = category
        
        # Generate title using LLM
        title = generate_llm_title(category, event)
        events_df.loc[idx, 'title'] = title
        
        # Generate description and determine event type using GPT
        event_type, description = generate_llm_description(category, event, title)
        events_df.loc[idx, 'event_type'] = event_type
        events_df.loc[idx, 'description'] = description
        
        # Progress update for large datasets
        if idx % 1000 == 0:
            print(f"Processed {idx} events")
    
    return events_df



# Example usage
# 1. Basic synthetic generation:
# Example usage
# 1. Basic synthetic generation:
# events_df = load_events_data('events.csv')
# enriched_events = synthesize_event_details(events_df.iloc[:10,:])
# enriched_events = add_variety(enriched_events)
# enriched_events.to_csv('enriched_events.csv', index=False)


In [4]:
events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/filtered_events_data.csv")

In [None]:
# Example usage
# 1. Basic synthetic generation:
#events_df = load_events_data('events.csv')
enriched_events = synthesize_event_details(events_df.iloc[:5000,:])
#enriched_events = add_variety(enriched_events)
# enriched_events.to_csv('enriched_events.csv', index=False)

In [48]:
enriched_events.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,event_id,start_time,city,lat,lng,c_1,c_2,c_3,...,c_96,c_97,c_98,c_99,c_100,c_other,category,title,description,event_type
0,40,40,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,0,3,1,...,0,0,0,0,0,28,Sports & Fitness,Sweat & Sizzle Sports Fiesta,"""Get ready to sweat and sizzle with the ultima...",Outdoor
1,51,51,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,5,3,3,...,1,0,0,0,0,103,Business & Networking,Connect. Collaborate. Thrive. Palo Alto.,"""Join us in Palo Alto on July 8th as we bring ...",Indoor
2,74,74,920600431,2012-07-29T19:00:00.000Z,Karachi,24.893,67.028,5,3,3,...,1,0,0,0,0,103,Seasonal & Festivals,Karachi Beats: Sonic Fiesta,"""Get ready to groove with Karachi Beats: Sonic...",'Outdoor'
3,155,156,3580637647,2012-10-22T10:00:00.003Z,Los Angeles,3.156,101.612,0,0,0,...,0,0,0,0,0,6,Business & Networking,ConnectLA: Elevate Your Network,Here's a short and engaging event description:...,Indoor
4,177,178,1924180022,2012-11-11T20:00:00.003Z,Palo Alto,37.416,-122.152,0,0,0,...,0,0,0,0,0,0,Seasonal & Festivals,Palo Alto Harvest Hoedown,"""Get ready for a foot-stompin' good time at th...",Outdoor


In [50]:
enriched_events.title

0                Sweat & Sizzle Sports Fiesta
1    Connect. Collaborate. Thrive. Palo Alto.
2                 Karachi Beats: Sonic Fiesta
3             ConnectLA: Elevate Your Network
4                   Palo Alto Harvest Hoedown
Name: title, dtype: object

In [6]:
events_df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,event_id,start_time,city,lat,lng,c_1,c_2,c_3,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,40,40,2587616435,2012-11-13T11:00:00.002Z,Sihanoukville,10.633,103.5,0,3,1,...,0,0,0,0,0,0,0,0,0,28
1,51,51,1145166049,2013-07-08T02:00:00.000Z,Palo Alto,37.442,-122.172,5,3,3,...,0,0,0,0,1,0,0,0,0,103
