# Data Pull Preview

This notebook fetches NYC 311 data and provides an initial preview of the dataset.

In [1]:
import subprocess
import sys
from pathlib import Path

import pandas as pd

# Run the fetch script for the last 30 days.
#
# The script is located by walking up from the current working directory until
# a folder containing scripts/fetch_311.py is found. The recorded run of this
# cell resolved the bare relative path against the notebooks/ folder and failed
# with "[Errno 2] No such file or directory", so the path must not depend on
# where the kernel happens to be started.
fetch_script_rel = Path("scripts") / "fetch_311.py"
project_root = next(
    (
        folder
        for folder in (Path.cwd(), *Path.cwd().parents)
        if (folder / fetch_script_rel).is_file()
    ),
    None,
)

if project_root is None:
    # Script not found in any parent folder: keep the original relative
    # invocation so the interpreter's own "can't open file" message is
    # reported by the error branch below.
    fetch_script = fetch_script_rel
else:
    fetch_script = project_root / fetch_script_rel

print("Fetching NYC 311 data (last 30 days)...")
result = subprocess.run(
    [sys.executable, str(fetch_script), "--days", "30"],
    capture_output=True,
    text=True,
    # Run from the folder that holds scripts/, matching the root-relative path
    # this cell was written with; None keeps the current working directory.
    # NOTE(review): assumes fetch_311.py resolves its output paths relative to
    # the project root — confirm against the script.
    cwd=project_root,
)
print(result.stdout)
if result.returncode != 0:
    # Non-zero exit: surface whatever the child process wrote to stderr.
    print(f"Error: {result.stderr}")

Fetching NYC 311 data (last 30 days)...

Error: /home/codespace/.python/current/bin/python: can't open file '/workspaces/nyc-311-ops-analysis/notebooks/scripts/fetch_311.py': [Errno 2] No such file or directory



In [3]:
# Load CSV data
#
# Find data/raw/311.csv by walking up from the current working directory, so
# the notebook does not depend on one machine's absolute checkout location.
raw_csv_rel = Path("data") / "raw" / "311.csv"
csv_path = next(
    (
        folder / raw_csv_rel
        for folder in (Path.cwd(), *Path.cwd().parents)
        if (folder / raw_csv_rel).is_file()
    ),
    # Fallback: the absolute location this cell originally hard-coded, so an
    # existing setup keeps working (and a missing file still raises from
    # read_csv with a concrete path in the message).
    Path("/workspaces/nyc-311-ops-analysis") / raw_csv_rel,
)
df = pd.read_csv(csv_path)
print(f"Loaded {len(df):,} rows and {len(df.columns)} columns")

Loaded 110,045 rows and 12 columns


## Schema Information

In [4]:
# Show each column's dtype, then a divider, then the full DataFrame.info() report
divider = "=" * 50
print("Data Types:", df.dtypes, sep="\n")
print(f"\n{divider}")
df.info()

Data Types:
unique_key          int64
created_date       object
closed_date        object
agency             object
complaint_type     object
descriptor         object
status             object
borough            object
incident_zip      float64
city               object
latitude          float64
longitude         float64
dtype: object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110045 entries, 0 to 110044
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   unique_key      110045 non-null  int64  
 1   created_date    110045 non-null  object 
 2   closed_date     86213 non-null   object 
 3   agency          110045 non-null  object 
 4   complaint_type  110045 non-null  object 
 5   descriptor      108716 non-null  object 
 6   status          110045 non-null  object 
 7   borough         110045 non-null  object 
 8   incident_zip    109124 non-null  float64
 9   city            106237 non-null  object

## Missing Values Analysis

In [5]:
# Per-column null tally and its share of all rows, largest share first
null_counts = df.isnull().sum()
null_share_pct = (null_counts / len(df) * 100).round(2)
missing_summary = pd.DataFrame(
    {
        'Column': df.columns,
        'Missing Count': null_counts,
        'Missing %': null_share_pct,
    }
).sort_values('Missing %', ascending=False)
# index=False drops the row labels, which only repeat the 'Column' values
print(missing_summary.to_string(index=False))

        Column  Missing Count  Missing %
   closed_date          23832      21.66
          city           3808       3.46
    descriptor           1329       1.21
      latitude           1281       1.16
     longitude           1281       1.16
  incident_zip            921       0.84
    unique_key              0       0.00
  created_date              0       0.00
        agency              0       0.00
       borough              0       0.00
complaint_type              0       0.00
        status              0       0.00


## Sample Data

In [6]:
# Preview the top ten records; left as a bare expression so the notebook renders it as a table
df.head(n=10)

Unnamed: 0,unique_key,created_date,closed_date,agency,complaint_type,descriptor,status,borough,incident_zip,city,latitude,longitude
0,67225897,2025-12-18T10:38:00.000,2025-12-18T13:35:00.000,DEP,Hazardous Materials,Chemical Odor (HD1),Closed,MANHATTAN,10036.0,MANHATTAN,40.761607,-73.990315
1,67225898,2025-12-18T14:29:00.000,2025-12-22T13:00:00.000,DEP,Water Quality,"Taste/Odor, Chlorine (QA1)",Closed,MANHATTAN,10029.0,NEW YORK,40.791072,-73.943475
2,67225901,2025-12-18T11:29:00.000,,DEP,Noise,Noise: air condition/ventilation equipment (NV1),Open,MANHATTAN,10027.0,NEW YORK,40.810872,-73.952525
3,67225905,2025-12-18T10:50:00.000,2025-12-23T18:00:00.000,DEP,Noise,Noise: Alarms (NR3),Closed,BROOKLYN,11221.0,BROOKLYN,40.694265,-73.919742
4,67225907,2025-12-18T19:41:00.000,,DEP,Asbestos,Asbestos Complaint (B1),Open,BROOKLYN,11214.0,BROOKLYN,40.602414,-74.001732
5,67225908,2025-12-18T23:08:00.000,2025-12-19T08:00:00.000,DEP,Hazardous Materials,Chemical Odor (HD1),Closed,QUEENS,11101.0,QUEENS,40.753857,-73.917631
6,67225909,2025-12-18T20:05:00.000,,DEP,Lead,Lead Kit Request (Residential) (L10),Open,BROOKLYN,11228.0,BROOKLYN,40.614872,-74.007578
7,67225910,2025-12-18T18:08:00.000,2025-12-19T14:26:00.000,DEP,Lead,Lead Kit Request (Residential) (L10),Closed,BROOKLYN,11218.0,BROOKLYN,40.65034,-73.973981
8,67225911,2025-12-18T16:26:00.000,,DEP,Lead,Lead Kit Request (Residential) (L10),Open,QUEENS,11356.0,COLLEGE POINT,40.78872,-73.853358
9,67225912,2025-12-18T12:32:00.000,2025-12-19T14:28:00.000,DEP,Lead,Lead Kit Request (Residential) (L10),Closed,MANHATTAN,10024.0,NEW YORK,40.783482,-73.9791
