In [1]:
import pandas as pd

In [2]:
# Load the raw events CSV into a DataFrame; later cells inspect and clean `df`.
df = pd.read_csv("events_data.csv")

In [7]:
# Tally how many events name a real first host organization.
# "No organizations found" is the placeholder stored in 'First Host Org'
# when an event has no host organization.
total_rows = df.shape[0]
rows_with_no_orgs = (df['First Host Org'] == "No organizations found").sum()

# Every row that is not a placeholder row carries an actual organization.
rows_with_orgs = total_rows - rows_with_no_orgs

for label, count in (
    ("Total rows:", total_rows),
    ("Rows with 'No organizations found':", rows_with_no_orgs),
    ("Rows that have actual organizations:", rows_with_orgs),
):
    print(label, count)

Total rows: 2467
Rows with 'No organizations found': 977
Rows that have actual organizations: 1490


In [6]:
# Print column names, non-null counts and dtypes to spot missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2467 entries, 0 to 2466
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Event Name          2467 non-null   object
 1   Event Time          2467 non-null   object
 2   Location            2467 non-null   object
 3   Club                2467 non-null   object
 4   Image URL           2105 non-null   object
 5   Event URL           2467 non-null   object
 6   Description         2467 non-null   object
 7   Host Organizations  2467 non-null   object
 8   First Host Org      2467 non-null   object
 9   Perks               2467 non-null   object
 10  Categories          2467 non-null   object
 11  Start Time          2467 non-null   object
 12  End Time            2467 non-null   object
dtypes: object(13)
memory usage: 250.7+ KB


In [None]:
import pandas as pd
import re

# Read data
df = pd.read_csv("events_data.csv")

# Keep only rows that have a real club and a real description. The explicit
# .copy() makes `df` an independent frame, so the .loc assignments below never
# write into a filtered view (avoids SettingWithCopyWarning).
has_club = df["Club"] != "Unknown Club"
has_description = df["Description"] != "No description available"
df = df.loc[has_club & has_description].copy()

# Where no host organization is recorded, fall back to the club name.
for host_col in ("Host Organizations", "First Host Org"):
    missing_host = df[host_col] == "No organizations found"
    df.loc[missing_host, host_col] = df.loc[missing_host, "Club"]

# A missing category list becomes a proper missing value instead of a placeholder string.
df.loc[df["Categories"] == "No categories found", "Categories"] = pd.NA

# Numeric UTC offsets for the zone abbreviations that appear in the raw time
# strings. EDT must NOT be rewritten to EST: EST is a fixed UTC-5 zone, so doing
# that shifts every daylight-time event by one hour.
_TZ_OFFSETS = {"EST": "-0500", "EDT": "-0400"}


def _parse_event_time(raw):
    """Parse raw event time strings into timezone-aware UTC timestamps.

    The raw values glue the meridiem to the zone abbreviation (optionally
    followed by a trailing "to"), e.g. "... at 4:30 PMESTto". The abbreviation
    is replaced by its numeric offset so EST and EDT rows are both converted
    to the correct instant.

    Parameters
    ----------
    raw : pandas.Series of str
        Raw 'Start Time' or 'End Time' column.

    Returns
    -------
    pandas.Series
        datetime64 values in UTC.

    Raises
    ------
    ValueError
        If a value does not match the expected format (same as the
        underlying pd.to_datetime call).
    """
    normalized = raw.str.replace(
        r"(AM|PM)(EST|EDT)(?:to)?",
        lambda m: f"{m.group(1)} {_TZ_OFFSETS[m.group(2)]}",
        regex=True,
    ).str.replace(r"at ", "", regex=True)
    return pd.to_datetime(normalized, format="%A, %B %d %Y %I:%M %p %z", utc=True)


# Standardize Start Time / End Time into real timestamps
df["Start Time"] = _parse_event_time(df["Start Time"])
df["End Time"] = _parse_event_time(df["End Time"])

# Regex-based text normalization for the 'Description' column
def clean_text(text):
    """Normalize whitespace and punctuation spacing in a free-text value.

    Only spacing that is clearly wrong is repaired; URLs, clock times,
    decimals and hyphenated words are left intact (unconditional spacing
    around '.', ':' and '-' would turn "https://a.com" into
    "https: //a. com" and "day-long" into "day - long").

    Parameters
    ----------
    text : str or missing value
        Raw text. Missing values (None / NaN / pd.NA) are returned unchanged.
        Other non-string scalars are converted with str().

    Returns
    -------
    str or missing value
        The cleaned text, or the original missing value.
    """
    if pd.isna(text):
        return text
    # Collapse every whitespace run (spaces, tabs, newlines) to a single space.
    text = re.sub(r"\s+", " ", str(text)).strip()
    # Runs of dots ("...", ". . .") become a single sentence-ending period.
    text = re.sub(r"\.(?:\s*\.)+\s*", ". ", text)
    # Missing space after a comma that is directly followed by a letter
    # (digits are skipped so "1,000" survives).
    text = re.sub(r",(?=[A-Za-z])", ", ", text)
    # Missing space at a glued sentence/label boundary: lowercase, digit or ')'
    # before the mark and an uppercase letter after it ("event.Join", "Time:TBD").
    # Domains, query strings, times and decimals do not match this shape.
    text = re.sub(r"(?<=[a-z0-9)])([.!?:])(?=[A-Z])", r"\1 ", text)
    # Stray space before punctuation that ends a word ("PM !" -> "PM!").
    text = re.sub(r"\s+([.,!?:]+)(?=\s|$)", r"\1", text)
    # Dashes spaced on one side only get the missing space; hyphenated words
    # (no surrounding space) and negative numbers (" -5") are untouched.
    text = re.sub(r"(?<=\s)(-+)(?=[^\s\d-])", r"\1 ", text)
    text = re.sub(r"(?<=[^\s-])(-+)(?=\s)", r" \1", text)
    return text.strip()

# Normalize the free-text descriptions with clean_text.
df['Description'] = df['Description'].apply(clean_text)

# Drop the columns that are not exported, then rename to the output schema.
# NOTE(review): 'Perks' is not part of the rename map and keeps its
# capitalized name in the exported file — confirm that is intended.
df = df.drop(columns=["Club", "Event Time"])
df = df.rename(columns={
    'Event Name': 'title',
    'Location': 'location',
    'Image URL': 'image',
    'Event URL': 'link',
    'Description': 'details',
    'Host Organizations': 'hosts',
    'First Host Org': 'main_host',
    'Categories': 'categories',
    'Start Time': 'start',
    'End Time': 'end'
})


def _to_epoch_seconds(stamps):
    """Convert a datetime Series to whole seconds since the Unix epoch (UTC).

    Uses timestamp subtraction and floor division rather than
    ``astype('int64') // 10**9``: the integer cast exposes the raw storage
    unit, so it is only correct for nanosecond-resolution data, and it turns
    NaT into a meaningless large negative integer.

    Parameters
    ----------
    stamps : pandas.Series
        Datetime values; tz-aware values are converted to UTC and naive
        values are interpreted as UTC.

    Returns
    -------
    pandas.Series
        Nullable ``Int64`` seconds since 1970-01-01T00:00:00Z; missing
        timestamps stay missing.
    """
    as_utc = pd.to_datetime(stamps, utc=True)
    seconds = (as_utc - pd.Timestamp("1970-01-01", tz="UTC")) // pd.Timedelta(seconds=1)
    return seconds.astype("Int64")


# Convert to Unix timestamps (seconds since epoch)
df['start'] = _to_epoch_seconds(df['start'])
df['end'] = _to_epoch_seconds(df['end'])

# Save cleaned data to CSV
df.to_csv("events.csv", index=False)

# Display first 10 rows
df.head(10)


Unnamed: 0,title,location,image,link,details,hosts,main_host,Perks,categories,start,end
0,Table Tennis Ping Pong Practice Meeting,MAC Courts in Campus Rec,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,4 meetings per week of general ping pong pract...,Table Tennis Club,Table Tennis Club,Credit,Sporting/Athletic Program,1740259800,1740265200
1,CASB: China Night Audition,Union 203-11,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,💗 Get ready because ✨China Night ✨will be on S...,Chinese Association at Stony Brook,Chinese Association at Stony Brook,Credit,,1740240000,1740265200
3,Ramen & Lego,Union Ballroom,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,Join Student Engagement and Activities forRame...,Student Engagement & Activities,Student Engagement & Activities,"Free Food, Free Stuff, Credit","Arts & Crafts, Movies & Games",1740265200,1740272400
5,Wanna Canta 2025,Macmahon Student Center: Duncan Sky Room 6th F...,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,Annual Singing Competition hosted by the Saint...,Philippine United Student Organization,Philippine United Student Organization,Credit,,1740265200,1740276000
6,Smash Club Game Night,SAC 306,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,Come join the Smash club for a game night!,Smash Bros. at SBU,Smash Bros. at SBU,Credit,,1740265200,1740279600
7,HopperHacks X,SAC Ballroom A,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,"HopperHacksis a day-long, social good-focused ...",Women in Computer Science,Women in Computer Science,"Free Food, Free Stuff, Credit","Career/Networking, Movies & Games",1740240000,1740279600
8,Ice Hockey: Home Game vs. Drexel,The Rinx,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,The Ice Hockey club will host the Drexel Dragons.,Ice Hockey,Ice Hockey,Credit,Sporting/Athletic Program,1740274200,1740285000
10,Binghamton University Open Fencing Tournament,Binghamton University,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,The Fencing Club will be heading to Binghamton...,Fencing Club,Fencing Club,Credit,Sporting/Athletic Program,1740157200,1740333600
11,Fordham Dual,Fordham Prep,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,"Fordham Prep Gym (441 E Fordham Rd, Bronx, NY ...",Wrestling Team,Wrestling Team,Credit,,1740330000,1740337200
12,Table Tennis Ping Pong Practice Meeting,MAC Courts in Campus Rec,https://se-images.campuslabs.com/clink/images/...,https://stonybrook.campuslabs.com/engage/event...,4 meetings per week of general ping pong pract...,Table Tennis Club,Table Tennis Club,Credit,Sporting/Athletic Program,1740335400,1740340800


In [23]:
# Load a previously saved CSV for inspection.
# NOTE(review): the cleaning cell in this notebook writes "events.csv", not
# "data_cleaned.csv" — confirm this is the file that should be checked here.
df1 = pd.read_csv("data_cleaned.csv")

In [24]:
# Print column names, non-null counts and dtypes of the loaded file.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1375 entries, 0 to 1374
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Event Name          1375 non-null   object 
 1   Event Time          1375 non-null   object 
 2   Location            1375 non-null   object 
 3   Club                1375 non-null   object 
 4   Image URL           1214 non-null   object 
 5   Event URL           1375 non-null   object 
 6   Description         1375 non-null   object 
 7   Host Organizations  1375 non-null   object 
 8   First Host Org      1375 non-null   object 
 9   Perks               1375 non-null   object 
 10  Categories          548 non-null    object 
 11  Start Time          0 non-null      float64
 12  End Time            1375 non-null   object 
dtypes: float64(1), object(12)
memory usage: 139.8+ KB
