## Data sourcing

Source data from various source systems and ingest it using Python code.

1. Parquet files
2. CSV files
3. APIs
4. RDBMS databases
5. HTML

In [None]:
# import modules
import certifi
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import urllib3
from urllib3 import request
from unicodedata import normalize

### Sourcing Parquet data

Please visit the url https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [None]:
# Load the yellow trip-data Parquet file (2022-01) into a DataFrame.
# pandas.read_parquet is used because it is both simple and fast.
df_parquet = pd.read_parquet(
    "data/yellow_tripdata_2022-01.parquet",
)
# Preview the first five rows.
df_parquet.head(5)

### Sourcing CSV data 

Please visit the url https://data.cityofnewyork.us/resource/h9gi-nx95.csv?$limit=500


In [None]:
# Load the locally saved CSV extract into a DataFrame.
# pandas.read_csv is used because it is both simple and fast.
df_csv = pd.read_csv(
    "data/h9gi-nx95.csv",
)
# Preview the first five rows.
df_csv.head(5)

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,cross_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,2022-07-20T00:00:00.000,1:25,,,40.835808,-73.89083,"\n, \n(40.835808, -73.89083)",BOSTON ROAD,,,...,Unspecified,,,,4547589,Sedan,Sedan,,,
1,2022-07-21T00:00:00.000,5:20,,,,,,FDR DRIVE,,,...,Unspecified,,,,4548075,Sedan,Sedan,,,
2,2021-04-14T00:00:00.000,5:32,,,,,,BRONX WHITESTONE BRIDGE,,,...,Unspecified,,,,4407480,Sedan,Sedan,,,
3,2021-04-13T00:00:00.000,21:35,BROOKLYN,11217.0,40.68358,-73.97617,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,...,,,,,4407147,Sedan,,,,
4,2021-04-15T00:00:00.000,16:15,,,,,,HUTCHINSON RIVER PARKWAY,,,...,,,,,4407665,Station Wagon/Sport Utility Vehicle,,,,


### Sourcing data from APIs

Please make sure the certifi library is installed, e.g. with `pipenv install certifi`.

In [None]:
# Fetch up to 500 records (see the `$limit` query parameter) from the
# data.cityofnewyork.us JSON endpoint.
url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500'

# Create a pool manager that verifies TLS certificates against the certifi
# CA bundle. If a certificate error occurs, never silence it: disabling
# verification would be a security threat.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

# Issue the request once and reuse the response for both the availability
# check and the payload, so the data is not downloaded twice.
response = http.request('GET', url)
apt_status = response.status
print(apt_status)

if apt_status == 200:
    # Decode the JSON body and flatten it into a tabular DataFrame.
    data = json.loads(response.data.decode('utf-8'))
    df_api = pd.json_normalize(data)
else:
    # API unavailable: fall back to an empty frame so the preview still runs.
    df_api = pd.DataFrame()
df_api.head(10)

### Sourcing Data from RDBMS tables

In [None]:
# Read sqlite query results into a pandas DataFrame
# Read the SQLite query results into a pandas DataFrame.
# Note: using a sqlite3 connection as a context manager only commits or rolls
# back the transaction; it does NOT close the connection. Close it explicitly
# so the database file handle is released even if the query fails.
conn = sqlite3.connect("data/movies.sqlite")
try:
    df = pd.read_sql("SELECT * from movies", conn)
finally:
    conn.close()
df.head()

### Sourcing data from web pages

Please visit the url https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)

In [None]:
# Scrape the tables from the Wikipedia page. `match` keeps only tables whose
# text contains 'by country'; read_html returns a list of DataFrames.
df_html = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)',
    match='by country',
)
# How many tables matched 'by country'? (4 when this notebook was last run.)
print(len(df_html))
# Display the first matching table.
df_html[0]

4


Unnamed: 0_level_0,Country/Territory,IMF[1][12],IMF[1][12],World Bank[13],World Bank[13],United Nations[14],United Nations[14]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,105435540,2023,100834796,2022
1,United States,30507217,2025,27360935,2023,25744100,2022
2,China,19231705,[n 1]2025,17794782,[n 3]2023,17963170,[n 1]2022
3,Germany,4744804,2025,4456081,2023,4076923,2022
4,India,4187017,2025,3549919,2023,3465541,2022
...,...,...,...,...,...,...,...
205,Palau,333,2025,263,2023,225,2022
206,Kiribati,312,2025,279,2023,223,2022
207,Marshall Islands,297,2025,284,2023,279,2022
208,Nauru,169,2025,154,2023,147,2022
