In [23]:
import json
import requests       # HTTP library for Python :contentReference[oaicite:4]{index=4}
from bs4 import BeautifulSoup  # HTML/XML parser :contentReference[oaicite:5]{index=5}
import pandas as pd   # Data handling and export :contentReference[oaicite:6]{index=6}
import time
import os

In [13]:
# Constants
# Main page from which the category links are extracted.
BASE_URL = "https://community.ucla.edu/studentorgs"
# Destination of the scraped records (JSON lines). Kept relative to the working
# directory -- the same location the cleaning step reads through RAW_PATH -- so
# the notebook is not tied to one user's home directory.
RAW_JSON_PATH = "data/raw/ucla_orgs_raw.json"

print("Libraries loaded. Ready to scrape.")

Libraries loaded. Ready to scrape.


In [3]:
# 3.1 Make GET request
# An explicit timeout keeps the cell from hanging forever if the server stops
# responding; requests applies no timeout unless one is given.
response = requests.get(BASE_URL, timeout=30)
response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page

# 3.2 Parse HTML
soup = BeautifulSoup(response.text, "html.parser")
print("Page fetched and parsed.")

Page fetched and parsed.


In [6]:
# Display the parsed landing page to inspect its markup (renders the whole document).
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0, viewport-fit=cover" name="viewport"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<link href="/static/favicon.png" rel="icon" type="image/png"/>
<title>Student Organizations - UCLA Community</title>
<meta content="Student Organizations" property="og:title">
<link href="/static/style.min.css?957fb8930f8f1374ae1289d73923fbab" media="all" rel="stylesheet"/>
<script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-125018472-3"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  gtag('config', 'UA-125018472-3');
</script>
</meta></head>
<body class="no-js" data-path="/studentorgs">
<script>document.body.classList.remove('no-js'); document.body.classList.add('has-js'); setTimeout(function() {document.body.classList.add('js-loaded')}, 500); try{if(Intl.DateTimeFormat()

In [4]:
# 4.1 Collect every category link found on the main page
category_tags = soup.select("ul.buttons li a")  # anchors inside the category button list
categories = [
    {
        "name": anchor.get_text(strip=True),
        # Resolve the (possibly relative) href against the main-page URL
        "url": requests.compat.urljoin(BASE_URL, anchor["href"]),
    }
    for anchor in category_tags
]

print(f"Found {len(categories)} categories.")


Found 48 categories.


In [6]:
# 4.2 Walk every category page and collect one record per listed organization
records = []
for cat in categories:
    resp = requests.get(cat["url"])  # fetch the category page
    resp.raise_for_status()
    cat_soup = BeautifulSoup(resp.text, "html.parser")

    # Organizations are list items that contain an anchor; items without one are skipped
    for li in cat_soup.select("main .container ul > li"):  # adjust selector if needed
        link = li.select_one("a")
        if link:
            full_text = link.get_text(strip=True)
            # Heuristic: the first ". " marks the boundary between name and description
            head, sep, tail = full_text.partition(". ")
            records.append({
                "category": cat["name"],
                "name": head.strip(),
                "description": tail.strip() if sep else "",
            })

    time.sleep(1)  # pause between category requests

print(f"Total records collected: {len(records)}")


Total records collected: 2832


In [15]:
# 4.2 Adjusted: read name and description from the dedicated card elements
import time


def _text_of(tag, default):
    """Return the stripped text of ``tag``, or ``default`` when the tag is missing."""
    return tag.get_text(strip=True) if tag else default


records = []
for cat in categories:
    resp = requests.get(cat["url"])
    resp.raise_for_status()
    cat_soup = BeautifulSoup(resp.text, "html.parser")

    # Each organization tile is a ".card" element inside a list item
    for card in cat_soup.select("main .container ul > li .card"):  # adjust to match actual wrapper class
        # span.card-content wraps the name and description; cards without it are skipped
        content = card.select_one("span.card-content")
        if content:
            records.append({
                "category": cat["name"],
                # <strong class="h4"> holds the name; None when absent
                "name": _text_of(content.select_one("strong.h4"), None),
                # <span class="description"> holds the description; empty string when absent
                "description": _text_of(content.select_one("span.description"), ""),
            })

    time.sleep(1)  # polite delay

print(f"Total records collected: {len(records)}")


Total records collected: 2829


In [19]:
# 4.2 Revised: Scrape each card and then fetch its detail page for the full description

REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given

records = []
# detail_url -> full description text, or None when the detail page has none.
# The same detail URL can be linked from several category pages; caching avoids
# downloading it again for each one.
detail_desc_cache = {}

# A single session reuses the underlying connection across the many requests below.
with requests.Session() as session:
    for cat in categories:
        # 1) GET the category page
        resp = session.get(cat["url"], timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        cat_soup = BeautifulSoup(resp.text, "html.parser")

        # 2) Select each card link
        for link in cat_soup.select("main .container ul > li a.card"):
            # Basic metadata. Either tag may be absent, so guard before reading text
            # (name -> None, description -> "", as in the card-based cell above).
            name_tag = link.select_one("strong.h4")
            name = name_tag.get_text(strip=True) if name_tag else None
            short_desc_tag = link.select_one("span.description")
            short_desc = short_desc_tag.get_text(strip=True) if short_desc_tag else ""

            # 3) Follow the href to get the full page (once per distinct URL)
            detail_url = requests.compat.urljoin(BASE_URL, link["href"])
            if detail_url not in detail_desc_cache:
                detail_resp = session.get(detail_url, timeout=REQUEST_TIMEOUT)
                detail_resp.raise_for_status()
                detail_soup = BeautifulSoup(detail_resp.text, "html.parser")
                full_desc_tag = detail_soup.select_one("p.org-description")
                detail_desc_cache[detail_url] = (
                    full_desc_tag.get_text(strip=True) if full_desc_tag else None
                )

            # 4) Use the full description if available, else this card's short one
            cached_desc = detail_desc_cache[detail_url]
            full_desc = cached_desc if cached_desc is not None else short_desc

            records.append({
                "category":   cat["name"],
                "name":       name,
                "description": full_desc,
                "detail_url": detail_url
            })

        time.sleep(1)   # pause between category pages

print(f"Total records collected with full descriptions: {len(records)}")


Total records collected with full descriptions: 2829


In [20]:
# 4.3 Assemble the scraped records into a DataFrame and display it
df_raw = pd.DataFrame(data=records)  # one row per record dict
df_raw

Unnamed: 0,category,name,description,detail_url
0,Academic,"Academic, Mentorship, and Professional Develop...",AMPD provides MSA UCLA members services and re...,https://community.ucla.edu/studentorg/3176
1,Academic,Academy Health Student Chapter at UCLA,In partnership with the health services resear...,https://community.ucla.edu/studentorg/997
2,Academic,AI Robotics Ethics Society,"As technology advances at an exponential rate,...",https://community.ucla.edu/studentorg/3936
3,Academic,al-Arabiyya,"At al-Arabiyya, we believe in the infinite cre...",https://community.ucla.edu/studentorg/6350
4,Academic,Aleph: Undergraduate Research Journal for the ...,Aleph (pronounced “ah-lef”) is UCLA’s only off...,https://community.ucla.edu/studentorg/3145
...,...,...,...,...
2824,Transfer Students,Non-Traditional Student Network,The Network is a support system for students r...,https://community.ucla.edu/studentorg/5585
2825,Transfer Students,Pilipino Transfer Student Partnership,Our mission is to raise awareness about cultur...,https://community.ucla.edu/studentorg/60
2826,Transfer Students,Student Transfer Outreach and Mentor Program,Student Transfer Outreach and Mentor Program (...,https://community.ucla.edu/studentorg/30
2827,Transfer Students,Transfer Research Connection,The Transfer Research Connection at UCLA is a ...,https://community.ucla.edu/studentorg/6315


In [22]:
# Persist the raw table as CSV, leaving out the index column
df_raw.to_csv(path_or_buf="df_raw.csv", index=False)

In [25]:
# Per-column count of missing values
df_raw.isnull().sum()

category       0
name           0
description    0
detail_url     0
dtype: int64

In [26]:
# Export as JSON lines (one record per line) for easy downstream loading
json_str = df_raw.to_json(orient="records", lines=True)

# Create the destination folder first so the write does not fail with
# FileNotFoundError when the folder is missing; skipped when the path has no
# directory component.
raw_dir = os.path.dirname(RAW_JSON_PATH)
if raw_dir:
    os.makedirs(raw_dir, exist_ok=True)

# Explicit encoding so the file does not depend on the platform default
with open(RAW_JSON_PATH, "w", encoding="utf-8") as f:
    f.write(json_str)

print(f"Raw JSON written to {RAW_JSON_PATH}")

Raw JSON written to /Users/davidliu/Desktop/STATS418/ucla-student-orgs-project/data/raw/ucla_orgs_raw.json


In [27]:
# 5.2 Input and output locations
RAW_PATH = "data/raw/ucla_orgs_raw.json"
CLEANED_CSV = "data/processed/ucla_orgs_cleaned.csv"

# 5.3 Create the folder for the cleaned output when it is missing
processed_dir = os.path.dirname(CLEANED_CSV)
os.makedirs(processed_dir, exist_ok=True)

# 5.4 Read the raw records (one JSON object per line)
df = pd.read_json(RAW_PATH, lines=True)
print("Raw shape:", df.shape)
df.head()

Raw shape: (2829, 4)


Unnamed: 0,category,name,description,detail_url
0,Academic,"Academic, Mentorship, and Professional Develop...",AMPD provides MSA UCLA members services and re...,https://community.ucla.edu/studentorg/3176
1,Academic,Academy Health Student Chapter at UCLA,In partnership with the health services resear...,https://community.ucla.edu/studentorg/997
2,Academic,AI Robotics Ethics Society,"As technology advances at an exponential rate,...",https://community.ucla.edu/studentorg/3936
3,Academic,al-Arabiyya,"At al-Arabiyya, we believe in the infinite cre...",https://community.ucla.edu/studentorg/6350
4,Academic,Aleph: Undergraduate Research Journal for the ...,Aleph (pronounced “ah-lef”) is UCLA’s only off...,https://community.ucla.edu/studentorg/3145


In [29]:
# 5.5 Report missing values per column
missing_per_column = df.isna().sum()
print(missing_per_column)

before = len(df)
df = (
    df.assign(
        # 5.6 Missing descriptions become empty strings; surrounding whitespace removed
        description=df["description"].fillna("").str.strip(),
        # 5.7 Normalize text: categories lowercased and trimmed, names trimmed
        category=df["category"].str.lower().str.strip(),
        name=df["name"].str.strip(),
    )
    # 5.8 Keep the first row for each (name, category) pair
    .drop_duplicates(subset=["name", "category"])
)
after = len(df)
print(f"Dropped {before-after} duplicate rows")

category       0
name           0
description    0
detail_url     0
dtype: int64
Dropped 0 duplicate rows


In [31]:
# 5.10 Write the cleaned table to disk without the index column
df.to_csv(path_or_buf=CLEANED_CSV, index=False)
print(f"Clean data written to {CLEANED_CSV}, shape={df.shape}")

Clean data written to data/processed/ucla_orgs_cleaned.csv, shape=(2829, 4)
