In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("events_data.csv")

In [7]:
# 1. Get total number of rows
total_rows = len(df)

# 2. Count how many rows have "No organizations found" in Host organizations
rows_with_no_orgs = df['First Host Org'].eq("No organizations found").sum()

# 3. Subtract from the total to get rows that have actual organizations
rows_with_orgs = total_rows - rows_with_no_orgs

print("Total rows:", total_rows)
print("Rows with 'No organizations found':", rows_with_no_orgs)
print("Rows that have actual organizations:", rows_with_orgs)

Total rows: 2467
Rows with 'No organizations found': 977
Rows that have actual organizations: 1490


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2467 entries, 0 to 2466
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Event Name          2467 non-null   object
 1   Event Time          2467 non-null   object
 2   Location            2467 non-null   object
 3   Club                2467 non-null   object
 4   Image URL           2105 non-null   object
 5   Event URL           2467 non-null   object
 6   Description         2467 non-null   object
 7   Host Organizations  2467 non-null   object
 8   First Host Org      2467 non-null   object
 9   Perks               2467 non-null   object
 10  Categories          2467 non-null   object
 11  Start Time          2467 non-null   object
 12  End Time            2467 non-null   object
dtypes: object(13)
memory usage: 250.7+ KB


In [None]:
import pandas as pd
import re

# Load your data file (adjust filename/path as needed)
df = pd.read_csv("events_data.csv")

# 1. Remove unwanted rows
df = df[df["Club"] != "Unknown Club"]
df = df[df["Description"] != "No description available"]
df = df[df["Host Organizations"] != "No organizations found"]

# 2. Replace categories text
df["Categories"] = df["Categories"].replace("No categories found", "N/A")

# 3. Define a regex-based datetime parser with default year 2025
def parse_datetime_regex(time_str, default_year="2025"):
    """
    Parses a datetime string using regex to extract its components.
    If a year is missing, the default_year (2025) is inserted.
    Returns a pandas.Timestamp.
    """
    time_str = time_str.strip()
    # Regex pattern captures weekday, month, day, optional year, time, and timezone.
    regex = (r"^(?P<weekday>\w+),\s+"
             r"(?P<month>\w+)\s+"
             r"(?P<day>\d{1,2})"
             r"(?:\s+(?P<year>\d{4}))?\s*"
             r"(?:at\s+)?"
             r"(?P<hour>\d{1,2}):(?P<minute>\d{2})\s*"
             r"(?P<ampm>[AP]M)\s+"
             r"(?P<tz>[A-Z]+)"
             r"(?:\s+(?P<year2>\d{4}))?$")
    match = re.match(regex, time_str)
    if match:
        groups = match.groupdict()
        # Use the provided year if available; otherwise, default to 2025.
        year = groups["year"] if groups["year"] is not None else (groups["year2"] if groups["year2"] is not None else default_year)
        new_str = f"{groups['weekday']}, {groups['month']} {groups['day']} {year} {groups['hour']}:{groups['minute']}{groups['ampm']} {groups['tz']}"
        try:
            dt = pd.to_datetime(new_str, format="%A, %B %d %Y %I:%M%p %Z")
            return dt
        except Exception as e:
            # If the fixed format fails, fallback to generic parsing on the constructed string.
            return pd.to_datetime(new_str)
    else:
        # Fallback: If regex doesn't match, append the default year (if not already present)
        if default_year not in time_str:
            fallback_str = time_str + " " + default_year
        else:
            fallback_str = time_str
        try:
            return pd.to_datetime(fallback_str)
        except Exception as e:
            return pd.NaT

# 4. Define functions for Start Time and End Time that also check for "Unknown" values
def parse_start_time(time_str, default_year="2025"):
    if time_str.strip() == "Unknown Start Time":
        return "N/A"
    return parse_datetime_regex(time_str, default_year)

def parse_end_time(time_str, default_year="2025"):
    if time_str.strip() == "Unknown End Time":
        return "N/A"
    return parse_datetime_regex(time_str, default_year)

# 5. Apply the parsing functions to the relevant columns
df["Event Time"] = df["Event Time"].apply(parse_datetime_regex)
df["Start Time"] = df["Start Time"].apply(parse_start_time)
df["End Time"] = df["End Time"].apply(parse_end_time)

# (Optional) Save the cleaned DataFrame to a new CSV file
df.to_csv("data_cleaned.csv", index=False)

print("Data cleaning complete. Cleaned data has been saved to 'data_cleaned.csv'.")


  return pd.to_datetime(time_str)


OutOfBoundsDatetime: Out of bounds nanosecond timestamp: Sunday, March 9 at 3:00PM EDT, at position 0

In [18]:
df1 = pd.read_csv("data_cleaned.csv")

In [19]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1375 entries, 0 to 1374
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Event Name          1375 non-null   object 
 1   Event Time          420 non-null    object 
 2   Location            1375 non-null   object 
 3   Club                1375 non-null   object 
 4   Image URL           1214 non-null   object 
 5   Event URL           1375 non-null   object 
 6   Description         1375 non-null   object 
 7   Host Organizations  1375 non-null   object 
 8   First Host Org      1375 non-null   object 
 9   Perks               1375 non-null   object 
 10  Categories          548 non-null    object 
 11  Start Time          0 non-null      float64
 12  End Time            1375 non-null   object 
dtypes: float64(1), object(12)
memory usage: 139.8+ KB
