<a href="https://colab.research.google.com/github/FVargasData/wpgPermits/blob/DataCollectionTest/Datathon2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import necessary libraries

In [7]:
import datetime
import logging
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Set seaborn pastel palette for visualizations

In [8]:
# Use seaborn's "pastel" colour palette for the plots drawn after this cell.
sns.set_palette("pastel")
# Apply seaborn's "whitegrid" style to the plots drawn after this cell.
sns.set_style("whitegrid")

# Set logging configuration

In [9]:
# Configure the root logger: report INFO and above, and prefix each record
# with its timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Configuration and API parameters

In [10]:
# Endpoint of the JSON resource that fetch_data() pages through.
API_URL = "https://data.winnipeg.ca/resource/urbd-qygv.json"
# App token sent in the X-App-Token header. Set the SOCRATA_APP_TOKEN
# environment variable to supply your own token without editing this file.
# NOTE(review): the fallback literal is a credential committed to source
# control -- consider rotating it and dropping the default.
APP_TOKEN = os.environ.get("SOCRATA_APP_TOKEN") or "CvU5WTCBwn36S6Vxebw1QjJVd"
CHUNK_SIZE = 1000  # number of rows per request (Socrata default)
MAX_ROWS = 330000  # maximum rows expected

# Columns to be retained based on our analysis

In [11]:
# Names of the columns to retain, listed one per line so additions and
# removals show up as single-line diffs.
COLUMNS_TO_KEEP = [
    "issue_date",
    "permit_group",
    "permit_type",
    "sub_type",
    "work_type",
    "neighbourhood_name",
    "community",
    "ward",
    "application_received_date",
    "status",
    "x_coordinate_nad83",
    "y_coordinate_nad83",
]

# FUNCTIONS SECTION

## This function fetches all rows from the API using pagination. It returns a DataFrame containing all the data.

In [20]:
def fetch_data(api_url, app_token, chunk_size=1000, max_rows=330000, timeout=30):
    """Download rows from a Socrata JSON endpoint one page at a time.

    Args:
        api_url: URL of the JSON resource to query.
        app_token: Value sent in the ``X-App-Token`` request header.
        chunk_size: Maximum number of rows requested per call (``$limit``).
        max_rows: Upper bound on the total number of rows collected.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A ``pandas.DataFrame`` built from every row collected. When a request
        fails, the error is logged and the rows gathered so far are returned,
        so the result may be partial or empty.
    """
    all_data = []
    if chunk_size <= 0:
        # A non-positive page size can never advance the offset.
        logging.error(f"chunk_size must be positive, got {chunk_size}")
        return pd.DataFrame(all_data)

    headers = {"X-App-Token": app_token}
    offset = 0

    logging.info("Starting data collection from API...")
    while offset < max_rows:
        # Never ask for more rows than are still allowed by max_rows.
        limit = min(chunk_size, max_rows - offset)
        params = {
            "$limit": limit,
            "$offset": offset,
            # Without an explicit order the API does not guarantee that
            # consecutive pages are disjoint; :id gives a stable ordering.
            "$order": ":id",
        }
        try:
            response = requests.get(
                api_url, headers=headers, params=params, timeout=timeout
            )
            response.raise_for_status()
            data_chunk = response.json()
        except (requests.RequestException, ValueError) as e:
            # Network/HTTP failures and undecodable bodies end the download
            # but keep whatever was fetched so far.
            logging.error("Error during API request: " + str(e))
            break

        if not data_chunk:
            logging.info("No more data returned from API.")
            break

        all_data.extend(data_chunk)
        offset += len(data_chunk)
        logging.info(f"Fetched {len(data_chunk)} rows; total so far: {len(all_data)}")

        if len(data_chunk) < limit:
            # A short page means the dataset is exhausted.
            logging.info("No more data returned from API.")
            break
        if offset < max_rows:
            # To avoid hitting rate limits between consecutive requests
            time.sleep(0.2)

    logging.info(f"Data collection complete. Total rows fetched: {len(all_data)}")
    return pd.DataFrame(all_data)

## Visualize functionality

### 1. Data Collection

In [24]:
# Download rows from the API into a DataFrame. max_rows is passed as the
# literal 1000 here rather than MAX_ROWS.
# NOTE(review): presumably a deliberately small sample for this demo cell --
# confirm before relying on df_raw as the full dataset.
df_raw = fetch_data(API_URL, APP_TOKEN, chunk_size=CHUNK_SIZE, max_rows=1000)

# fetch_data logs request errors and returns whatever it collected, so an
# empty frame is how a failed download shows up here.
if df_raw.empty:
  # NOTE(review): the message says "Exiting", but no exit is visible in the
  # lines shown -- confirm that execution actually stops.
  logging.error("No data fetched. Exiting.")