Data Extraction Using the GitHub API

This notebook extracts event data from the GitHub API, preprocesses it, and stores it in a DuckDB database for analytics.

In [1]:
#Sending a get request

import requests
url = "https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/events"
# requests waits indefinitely unless a timeout is given, so bound the call (seconds).
response = requests.get(url, timeout=10)
# Stop on a 4xx/5xx response instead of parsing and printing an error body
# as though it were the list of events.
response.raise_for_status()
data = response.json()
print(data)

[{'id': '49063897817', 'type': 'WatchEvent', 'actor': {'id': 73944641, 'login': 'yousamaa', 'display_login': 'yousamaa', 'gravatar_id': '', 'url': 'https://api.github.com/users/yousamaa', 'avatar_url': 'https://avatars.githubusercontent.com/u/73944641?'}, 'repo': {'id': 419661684, 'name': 'DataTalksClub/data-engineering-zoomcamp', 'url': 'https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp'}, 'payload': {'action': 'started'}, 'public': True, 'created_at': '2025-04-25T13:02:02Z', 'org': {'id': 72699292, 'login': 'DataTalksClub', 'gravatar_id': '', 'url': 'https://api.github.com/orgs/DataTalksClub', 'avatar_url': 'https://avatars.githubusercontent.com/u/72699292?'}}, {'id': '49055464165', 'type': 'WatchEvent', 'actor': {'id': 5618377, 'login': 'phiskus', 'display_login': 'phiskus', 'gravatar_id': '', 'url': 'https://api.github.com/users/phiskus', 'avatar_url': 'https://avatars.githubusercontent.com/u/5618377?'}, 'repo': {'id': 419661684, 'name': 'DataTalksClub/data-engin

In [None]:
#Streaming request example
#import websocket

#def on_message(ws, message):
#    print("Received event:", message)

#ws = websocket.WebSocketApp("wss://api.example.com/stream", on_message=on_message)
#ws.run_forever()

In [67]:
#checking API rate limits
# Query the rate_limit endpoint and display the 'remaining' value nested under 'rate'.
url = "https://api.github.com/rate_limit"
response = requests.get(url)
response.json()['rate']['remaining']

60

In [None]:
#using the rate limit
import time
url = "https://api.github.com/rate_limit"
response = requests.get(url)
# Parse the body once and reuse it for both fields.
rate = response.json()['rate']
remaining = rate['remaining']

if remaining == 0:
    # 'reset' is compared against time.time(), i.e. it is treated as epoch seconds.
    reset_time = rate['reset']
    # Clamp at zero: if the reset moment has already passed (clock skew, or it
    # elapsed between the request and this line) the difference is negative,
    # and time.sleep() raises ValueError for negative values.
    wait_time = max(0, reset_time - time.time())
    time.sleep(wait_time)
    response = requests.get(url)

In [None]:
#Working with Authentication
import requests
url = "https://api.github.com/user"
# Basic auth with placeholder credentials, shown only to illustrate the `auth` argument.
response = requests.get(url, auth=('username', 'password'))

#Alternatively
# Read the credentials through google.colab's userdata instead of hardcoding them.
from google.colab import userdata
username = userdata.get('github_username')
password = userdata.get('github_password')
response = requests.get(url, auth=(username, password))

#API Token
API_token = userdata.get('githubtoken')
headers = {
    # The scheme and the token must be separated by a space ("Bearer <token>");
    # without it the header value is a single unrecognised word.
    'Authorization': f'Bearer {API_token}'
}
url = "https://api.github.com/user"
response = requests.get(url, headers=headers)

response.json()

In [None]:
#Pagination
#url = "https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/issues"
url = "https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/events"
response = requests.get(url)
response.headers['Link']

'<https://api.github.com/repositories/419661684/events?page=2>; rel="next", <https://api.github.com/repositories/419661684/events?page=10>; rel="last"'

In [None]:
# response.links gives dict-style access to the same links; read the URL of the 'next' page.
response.links['next']['url']

'https://api.github.com/repositories/419661684/events?page=2'

In [2]:
# Walk every page of the events endpoint, printing each page's size and the
# URL of the page that will be requested next.
url = "https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/events"
while True:
    response = requests.get(url)
    data = response.json()
    print(len(data))
    # No 'next' link means this was the final page.
    next_link = response.links.get('next')
    if next_link is None:
        break
    url = next_link['url']
    print(url)

30
https://api.github.com/repositories/419661684/events?page=2
30
https://api.github.com/repositories/419661684/events?page=3
30
https://api.github.com/repositories/419661684/events?page=4
30
https://api.github.com/repositories/419661684/events?page=5
30
https://api.github.com/repositories/419661684/events?page=6
30
https://api.github.com/repositories/419661684/events?page=7
30
https://api.github.com/repositories/419661684/events?page=8
30
https://api.github.com/repositories/419661684/events?page=9
30
https://api.github.com/repositories/419661684/events?page=10
26


Storage

In [3]:
def events_getter(url="https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/events", timeout=10):
  """Yield the events endpoint one page at a time.

  Args:
    url: URL of the first page to request. Defaults to the events endpoint of
      the DataTalksClub/data-engineering-zoomcamp repository.
    timeout: Seconds to wait for each HTTP request before giving up.

  Yields:
    The decoded JSON body of each page, in the order the pages are linked.

  Raises:
    requests.HTTPError: If any page request returns a 4xx/5xx status.
  """
  while url:
    response = requests.get(url, timeout=timeout)
    # Without this check an error body would be yielded as if it were a page
    # of events and break the consumers that index into it.
    response.raise_for_status()
    yield response.json()
    # Follow the 'next' link; its absence ends the iteration.
    url = response.links.get('next', {}).get('url')



In [4]:
# events_getter() is a generator, so pages are requested as the loop advances.
events_pages = events_getter()

# After the loop finishes, event_page still refers to the last page yielded.
for event_page in events_pages:
  print(event_page)

[{'id': '49063897817', 'type': 'WatchEvent', 'actor': {'id': 73944641, 'login': 'yousamaa', 'display_login': 'yousamaa', 'gravatar_id': '', 'url': 'https://api.github.com/users/yousamaa', 'avatar_url': 'https://avatars.githubusercontent.com/u/73944641?'}, 'repo': {'id': 419661684, 'name': 'DataTalksClub/data-engineering-zoomcamp', 'url': 'https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp'}, 'payload': {'action': 'started'}, 'public': True, 'created_at': '2025-04-25T13:02:02Z', 'org': {'id': 72699292, 'login': 'DataTalksClub', 'gravatar_id': '', 'url': 'https://api.github.com/orgs/DataTalksClub', 'avatar_url': 'https://avatars.githubusercontent.com/u/72699292?'}}, {'id': '49055464165', 'type': 'WatchEvent', 'actor': {'id': 5618377, 'login': 'phiskus', 'display_login': 'phiskus', 'gravatar_id': '', 'url': 'https://api.github.com/users/phiskus', 'avatar_url': 'https://avatars.githubusercontent.com/u/5618377?'}, 'repo': {'id': 419661684, 'name': 'DataTalksClub/data-engin

In [5]:
# Take the first event of the page held in event_page as a sample to inspect.
event = event_page[0]
event

{'id': '48618902835',
 'type': 'ForkEvent',
 'actor': {'id': 14951461,
  'login': 'wojsamjan',
  'display_login': 'wojsamjan',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/wojsamjan',
  'avatar_url': 'https://avatars.githubusercontent.com/u/14951461?'},
 'repo': {'id': 419661684,
  'name': 'DataTalksClub/data-engineering-zoomcamp',
  'url': 'https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp'},
 'payload': {'forkee': {'id': 965303562,
   'node_id': 'R_kgDOOYldCg',
   'name': 'data-engineering-zoomcamp',
   'full_name': 'wojsamjan/data-engineering-zoomcamp',
   'private': False,
   'owner': {'login': 'wojsamjan',
    'id': 14951461,
    'node_id': 'MDQ6VXNlcjE0OTUxNDYx',
    'avatar_url': 'https://avatars.githubusercontent.com/u/14951461?v=4',
    'gravatar_id': '',
    'url': 'https://api.github.com/users/wojsamjan',
    'html_url': 'https://github.com/wojsamjan',
    'followers_url': 'https://api.github.com/users/wojsamjan/followers',
    'following_ur

In [30]:
def process_event(event):
  """Flatten one raw event dict into the fields kept for storage.

  Args:
    event: Event dict with 'id', 'actor' (containing 'login'), 'payload'
      (a dict) and 'created_at' (ISO-8601 string such as
      '2025-04-12T21:26:24Z').

  Returns:
    A dict with the keys 'id', 'login', 'description', 'timestamp' (seconds
    since the epoch, as a float) and 'created_at' (the original string).
    'description' is None when the payload has no such key.
  """
  # Imported here so the function does not rely on an import made in a later cell.
  from datetime import datetime

  created_at = event['created_at']
  # datetime.fromisoformat() accepts a trailing 'Z' only on Python 3.11+;
  # the equivalent '+00:00' offset is accepted on older versions as well.
  iso_text = created_at[:-1] + '+00:00' if created_at.endswith('Z') else created_at
  timestamp = datetime.fromisoformat(iso_text)

  return {
    'id': event['id'],
    'login': event['actor']['login'],
    'description': event['payload'].get('description'),
    'timestamp': timestamp.timestamp(),
    'created_at': created_at,
  }

In [31]:
#usage
processed_events = []

for event in event_page:
  processed_event = process_event(event)
  processed_events.append(processed_event)

processed_events

[{'id': '48618902835',
  'login': 'wojsamjan',
  'description': None,
  'timestamp': 1744493184.0,
  'created_at': '2025-04-12T21:26:24Z'},
 {'id': '48618110953',
  'login': 'jrgealexq',
  'description': None,
  'timestamp': 1744488099.0,
  'created_at': '2025-04-12T20:01:39Z'},
 {'id': '48617168749',
  'login': 'ovidiu-eremia',
  'description': None,
  'timestamp': 1744482585.0,
  'created_at': '2025-04-12T18:29:45Z'},
 {'id': '48617151331',
  'login': 'ggledis',
  'description': None,
  'timestamp': 1744482485.0,
  'created_at': '2025-04-12T18:28:05Z'},
 {'id': '48617150361',
  'login': 'ggledis',
  'description': None,
  'timestamp': 1744482479.0,
  'created_at': '2025-04-12T18:27:59Z'},
 {'id': '48616893745',
  'login': 'adaydan',
  'description': None,
  'timestamp': 1744481096.0,
  'created_at': '2025-04-12T18:04:56Z'},
 {'id': '48616840596',
  'login': 'akhileshcs005',
  'description': None,
  'timestamp': 1744480831.0,
  'created_at': '2025-04-12T18:00:31Z'},
 {'id': '486168322

In [27]:
from datetime import datetime

def process_event(event):
  """Flatten one raw event dict, parsing 'created_at' into a datetime.

  Args:
    event: Event dict with 'id', 'actor' (containing 'login'), optionally
      'payload' (a dict), and 'created_at' formatted as
      '%Y-%m-%dT%H:%M:%SZ'.

  Returns:
    A dict with the keys 'id', 'login', 'description', 'created_at' (a naive
    datetime parsed from the string) and 'timestamp' (seconds since the
    epoch, as a float). 'description' is None when the payload is missing
    or has no such key.
  """
  result = {}

  result['id'] = event['id']
  result['login'] = event['actor']['login']
  # 'payload' is a top-level key of the event, a sibling of 'actor'; looking
  # it up inside 'actor' could never find a description.
  result['description'] = event.get('payload', {}).get('description')
  created_at = event['created_at']
  result['created_at'] = datetime.strptime(created_at, '%Y-%m-%dT%H:%M:%SZ')
  # datetime.fromisoformat() accepts a trailing 'Z' only on Python 3.11+;
  # the equivalent '+00:00' offset is accepted on older versions as well.
  iso_text = created_at[:-1] + '+00:00' if created_at.endswith('Z') else created_at
  result['timestamp'] = datetime.fromisoformat(iso_text).timestamp()

  return result

# Try the function on the event currently bound to `event` and display the result.
process_event(event)

{'id': '48606658043',
 'login': 'poiu72',
 'description': None,
 'created_at': datetime.datetime(2025, 4, 12, 1, 50, 31),
 'timestamp': 1744422631.0}

Loading

In [28]:
#Database
import duckdb

# Open a DuckDB connection on the database named 'events_storage'; it is reused by the cells below.
conn = duckdb.connect('events_storage')

In [32]:
# Look at one processed record to see which fields need to be stored.
processed_events[0]

{'id': '48618902835',
 'login': 'wojsamjan',
 'description': None,
 'timestamp': 1744493184.0,
 'created_at': '2025-04-12T21:26:24Z'}

In [63]:
#create a table
# login is deliberately not a PRIMARY KEY: the same user can appear in several
# events, so a uniqueness constraint on it makes the insert fail on a fresh
# database. id is declared without a constraint so rows inserted without it
# are still accepted.
conn.execute("""
  CREATE TABLE IF NOT EXISTS events (
    id TEXT,
    login TEXT,
    description TEXT,
    timestamp DOUBLE,
    createdat STRING
  )
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7a9822143eb0>

In [64]:
# Record keys to store, in the same order as the columns named in the INSERT below.
event_fields = ('login', 'description', 'timestamp', 'created_at')

# One tuple per processed event, ready to bind to the four placeholders.
flattened_events = [
    tuple(event[field] for field in event_fields)
    for event in processed_events
]

conn.executemany("""
  INSERT INTO events (login, description, timestamp, createdat)
  VALUES (?, ?, ?, ?)
""", flattened_events)

<duckdb.duckdb.DuckDBPyConnection at 0x7a9822143eb0>

In [65]:
# Read the whole events table back into a DataFrame and preview its first rows.
df = conn.execute("SELECT * FROM events").fetchdf()
df.head()

Unnamed: 0,id,login,description,timestamp,createdat
0,,wojsamjan,,1744493000.0,2025-04-12 21:26:24
1,,jrgealexq,,1744488000.0,2025-04-12 20:01:39
2,,ovidiu-eremia,,1744483000.0,2025-04-12 18:29:45
3,,ggledis,,1744482000.0,2025-04-12 18:28:05
4,,ggledis,,1744482000.0,2025-04-12 18:27:59


In [66]:
# Close the DuckDB connection now that loading and inspection are finished.
conn.close()