In [1]:
import pandas as pd
import re

# Load the City Events Calendar CSV file
df_events = pd.read_csv('events_calendar.csv')

# Display basic information for initial inspection
print("Initial events calendar shape:", df_events.shape)
print(df_events.head())
print(df_events.info())


Initial events calendar shape: (50000, 5)
   event_id     name                 date location      type
0         1  Event 1  2023-01-01 00:00:00  Stadium  festival
1         2  Event 2  2023-01-01 01:00:00  Stadium    sports
2         3  Event 3  2023-01-01 02:00:00     Park  festival
3         4  Event 4  2023-01-01 03:00:00     Park   concert
4         5  Event 5  2023-01-01 04:00:00  Stadium    sports
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   event_id  50000 non-null  int64 
 1   name      50000 non-null  object
 2   date      50000 non-null  object
 3   location  50000 non-null  object
 4   type      50000 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.9+ MB
None


In [2]:

# Remove duplicate rows to ensure uniqueness
df_events.drop_duplicates(inplace=True)
print("Shape after removing duplicates:", df_events.shape)

# Remove rows where the event title is missing
# Adjust the column name based on your CSV (e.g., 'event_title' or 'title')
if 'event_title' in df_events.columns:
    df_events = df_events[df_events['event_title'].notnull()]
elif 'title' in df_events.columns:
    df_events = df_events[df_events['title'].notnull()]
print("Shape after dropping missing titles:", df_events.shape)



Shape after removing duplicates: (50000, 5)
Shape after dropping missing titles: (50000, 5)


In [3]:
# Convert the event date column to a datetime object
# Try a few possible column names commonly used for dates
date_column = None
for col in ['date', 'event_date', 'timestamp']:
    if col in df_events.columns:
        date_column = col
        df_events[col] = pd.to_datetime(df_events[col], errors='coerce')
        break

if date_column:
    df_events = df_events[df_events[date_column].notnull()]
    print(f"Shape after cleaning the date column '{date_column}':", df_events.shape)

# Determine which column to filter on for relevance.
# We'll check the 'event_title' column first; if not available, fall back to 'title' or 'description'
if 'event_title' in df_events.columns:
    df_events = df_events[df_events['event_title'].apply(is_relevant_event)]
elif 'title' in df_events.columns:
    df_events = df_events[df_events['title'].apply(is_relevant_event)]
elif 'description' in df_events.columns:
    df_events = df_events[df_events['description'].apply(is_relevant_event)]

print("Shape after filtering out irrelevant events:", df_events.shape)




Shape after cleaning the date column 'date': (50000, 5)
Shape after filtering out irrelevant events: (50000, 5)


In [4]:
# Optional: Clean text fields (e.g., event title and description) for uniformity.
def clean_event_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    return text.strip().lower()

if 'event_title' in df_events.columns:
    df_events['event_title_clean'] = df_events['event_title'].apply(clean_event_text)

if 'description' in df_events.columns:
    df_events['description_clean'] = df_events['description'].apply(clean_event_text)


In [6]:

# Save the cleaned DataFrame to a new CSV file
output_file = 'events_calendar_cleaned.csv'
df_events.to_csv(output_file, index=False)
print(f"Cleaned City Events Calendar saved successfully at {output_file}.")

Cleaned City Events Calendar saved successfully at events_calendar_cleaned.csv.
