<a href="https://colab.research.google.com/github/EvinMenendezVargas/CS3560-01-6-Restaurant-Food-Delivery-System/blob/main/DataNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
### Before accessing the API, let's check its rate limit:

import requests
import time
from datetime import datetime

def check_rate_limit(url, timeout=30):
    """Request ``url`` once and print the GitHub rate-limit headers it returns.

    Args:
        url: Endpoint to probe with a single GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``requests`` response object, so the caller can inspect it
        further.

    Side effects:
        Prints the status code, each known ``X-RateLimit-*`` header and, when
        the response is a 403/429, the time left until the limit resets plus
        the error message from the response body.
    """
    response = requests.get(url, timeout=timeout)

    print(f"Status Code: {response.status_code}")
    print("\n=== RATE LIMIT HEADERS ===")

    # GitHub API rate limit headers
    rate_headers = {
        'X-RateLimit-Limit': 'Total requests allowed per hour',
        'X-RateLimit-Remaining': 'Remaining requests',
        'X-RateLimit-Reset': 'Reset time (Unix timestamp)',
        'X-RateLimit-Used': 'Requests used',
        'X-RateLimit-Resource': 'Rate limit type (core, search, etc.)'
    }

    for header, description in rate_headers.items():
        value = response.headers.get(header, 'Not found')
        print(f"{header}: {value} ({description})")

    # GitHub reports an exceeded limit with either 403 or 429.
    if response.status_code in (403, 429):
        reset_time = response.headers.get('X-RateLimit-Reset')
        try:
            reset_timestamp = int(reset_time) if reset_time else None
        except ValueError:
            # A malformed header should not abort the diagnostic output.
            reset_timestamp = None

        if reset_timestamp is not None:
            # Clamp at zero: the reset moment may already have passed by the
            # time the response is processed (or the local clock is ahead).
            wait_seconds = max(0, reset_timestamp - int(time.time()))
            wait_minutes = wait_seconds / 60

            # Rendered in the machine's local timezone.
            reset_datetime = datetime.fromtimestamp(reset_timestamp)

            print("\n🚫 RATE LIMITED!")
            print(f"Wait time: {wait_seconds} seconds ({wait_minutes:.1f} minutes)")
            print(f"Rate limit resets at: {reset_datetime}")

        # Show error message
        try:
            error_data = response.json()
        except ValueError:
            # requests raises a ValueError subclass when the body is not JSON.
            print("Could not parse error message")
        else:
            if isinstance(error_data, dict):
                print(f"Error message: {error_data.get('message', 'No message')}")
            else:
                print("Could not parse error message")

    return response

# Test on GitHub API: probe the repository's events endpoint once and keep
# the response object in `response` for further inspection.
url = "https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/events"
response = check_rate_limit(url)

Status Code: 200

=== RATE LIMIT HEADERS ===
X-RateLimit-Limit: 60 (Total requests allowed per hour)
X-RateLimit-Remaining: 58 (Remaining requests)
X-RateLimit-Reset: 1752701317 (Reset time (Unix timestamp))
X-RateLimit-Used: 2 (Requests used)
X-RateLimit-Resource: core (Rate limit type (core, search, etc.))


In [14]:
import requests
import json
import pandas as pd
import urllib3

# Suppress urllib3's InsecureRequestWarning, which is emitted when a request
# is made without TLS certificate verification (verify=False).
# NOTE(review): hiding the warning also hides the fact that certificates are
# not being checked — confirm this is still wanted.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def fetch_batch_data(url, output_path="api_data.json", timeout=30, verify=True):
    """Download a JSON document from ``url`` and cache it on disk.

    Args:
        url: Endpoint to fetch with a single GET request.
        output_path: File the decoded JSON is written to. Defaults to
            ``"api_data.json"``, the file the later cells read back.
        timeout: Seconds to wait for the server before giving up.
        verify: Whether to verify the server's TLS certificate. Defaults to
            ``True``; pass ``False`` only on networks where verification is
            known to be impossible, since it exposes the request to
            tampering.

    Returns:
        The decoded JSON payload (whatever ``response.json()`` yields).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status. The
            cache file is left untouched in that case, so an error payload
            (e.g. a rate-limit message) never replaces previously saved data.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(url, verify=verify, timeout=timeout)
    # Fail loudly instead of caching and returning an error document.
    response.raise_for_status()
    data = response.json()
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(data, file)
    return data

data = fetch_batch_data("https://api.github.com/repos/DataTalksClub/data-engineering-zoomcamp/events")

# A successful call yields a list of event records. An error payload is a
# dict of scalars, which is truthy but cannot be turned into a DataFrame, so
# check the type rather than mere truthiness.
if isinstance(data, list) and data:
    df = pd.DataFrame(data)
    print(df.head())
else:
    print(f"No event records returned: {data!r}")

            id        type                                              actor  \
0  52161217416  WatchEvent  {'id': 209787373, 'login': 'PunnnYutt', 'displ...   
1  52158741336  WatchEvent  {'id': 146030836, 'login': 'Ikra11', 'display_...   
2  52156736196   ForkEvent  {'id': 78231106, 'login': 'Kachu1212', 'displa...   
3  52153479182  WatchEvent  {'id': 1277699, 'login': 'raj8844', 'display_l...   
4  52152763141  WatchEvent  {'id': 96905082, 'login': 'RijalChirayu', 'dis...   

                                                repo  \
0  {'id': 419661684, 'name': 'DataTalksClub/data-...   
1  {'id': 419661684, 'name': 'DataTalksClub/data-...   
2  {'id': 419661684, 'name': 'DataTalksClub/data-...   
3  {'id': 419661684, 'name': 'DataTalksClub/data-...   
4  {'id': 419661684, 'name': 'DataTalksClub/data-...   

                                             payload  public  \
0                              {'action': 'started'}    True   
1                              {'action': 'start

In [17]:
    with open("all_events.json", "r") as file:
        data = json.load(file)
df = pd.DataFrame(data)
print(df.head())

            id        type                                              actor  \
0  52137521270   ForkEvent  {'id': 89344406, 'login': 'itsyoutalend', 'dis...   
1  52136685716  WatchEvent  {'id': 7132448, 'login': 'rdudeja', 'display_l...   
2  52135250884  WatchEvent  {'id': 48865620, 'login': 'ritazhousmile', 'di...   
3  52131888490   ForkEvent  {'id': 150082916, 'login': 'fxu191', 'display_...   
4  52130233973  WatchEvent  {'id': 220433772, 'login': 'zeynel6199', 'disp...   

                                                repo  \
0  {'id': 419661684, 'name': 'DataTalksClub/data-...   
1  {'id': 419661684, 'name': 'DataTalksClub/data-...   
2  {'id': 419661684, 'name': 'DataTalksClub/data-...   
3  {'id': 419661684, 'name': 'DataTalksClub/data-...   
4  {'id': 419661684, 'name': 'DataTalksClub/data-...   

                                             payload  public  \
0  {'forkee': {'id': 1020543376, 'node_id': 'R_kg...    True   
1                              {'action': 'start

In [24]:
import pandas as pd
import json
from datetime import datetime

# Reload the raw payload from api_data.json (the file fetch_batch_data
# writes) so the normalization below works from disk rather than from a
# fresh API call.
with open("api_data.json", "r") as file:
    all_data = json.load(file)

def process_event(event):
    """Flatten one event dict into an event row plus its topic rows.

    Args:
        event: Decoded event record. Must contain ``id``, ``type``,
            ``public``, ``created_at`` (ISO-8601 string) and an ``actor``
            dict with ``id`` and ``login``. ``payload`` and everything
            nested under it is optional.

    Returns:
        A ``(result, processed_topics)`` tuple:

        * ``result`` - dict with keys ``id``, ``type``, ``public``,
          ``created_at`` (seconds since the epoch as a float), ``actor__id``
          and ``actor__login``.
        * ``processed_topics`` - list of ``{'event_id', 'topic_name'}``
          dicts, one per entry in
          ``payload.pull_request.base.repo.topics``; empty when any part of
          that path is missing or null.

    Raises:
        KeyError: If one of the required top-level or actor keys is missing.
        ValueError: If ``created_at`` is not a valid ISO-8601 timestamp.
    """
    created_at = event['created_at']
    # datetime.fromisoformat only accepts a trailing 'Z' from Python 3.11
    # on; spell the UTC offset out so older interpreters parse it too.
    if created_at.endswith('Z'):
        created_at = created_at[:-1] + '+00:00'

    result = {
        'id': event['id'],
        'type': event['type'],
        'public': event['public'],
        'created_at': datetime.fromisoformat(created_at).timestamp(),
        'actor__id': event['actor']['id'],
        'actor__login': event['actor']['login'],
    }

    # Walk payload -> pull_request -> base -> repo, treating both a missing
    # key and an explicit JSON null as "nothing here". A plain chained
    # .get(key, {}) would crash on null because the default is only used for
    # missing keys.
    node = event.get('payload')
    for key in ('pull_request', 'base', 'repo'):
        node = (node or {}).get(key)
    topics = (node or {}).get('topics') or []

    # One record per topic, keyed back to the event it came from.
    processed_topics = [
        {'event_id': event['id'], 'topic_name': topic}
        for topic in topics
    ]
    return result, processed_topics

### Process and normalize data

processed_events = []
processed_topics = []

# Split every raw event into one flat event row and zero or more topic rows.
for event in all_data:
    processed_event, topics = process_event(event)
    processed_events.append(processed_event)
    processed_topics.extend(topics)

# Name the columns explicitly: when no event carries topics the frame would
# otherwise have no columns at all and the groupby below would raise KeyError.
df_topics = pd.DataFrame(processed_topics, columns=['event_id', 'topic_name'])
print(df_topics)
df = pd.DataFrame(processed_events)
print(df)

# Group by event_id and aggregate the topics of each event into one list.
# Renaming by name (instead of assigning .columns positionally) keeps the
# result well-formed even when there is nothing to aggregate.
df_topics_agg = (
    df_topics.groupby('event_id')['topic_name']
    .agg(list)
    .rename('topic_lists')
    .rename_axis('id')
    .reset_index()
)

print("Summary DataFrame with unique IDs and topic lists:")
print(df_topics_agg)

### Perform full outer join to merge df and df_topics_agg
# The outer join orders the result by the join key, so rows come out sorted
# by id.
df_merged = df.merge(df_topics_agg, on='id', how='outer')

## Handle missing values - events without topics get NaN from the join;
## replace those with empty lists so every row holds a list.
df_merged['topic_lists'] = df_merged['topic_lists'].apply(lambda x: x if isinstance(x, list) else [])
df_merged.tail(20)

       event_id        topic_name
0   52146985881  data-engineering
1   52146985881               dbt
2   52146985881            docker
3   52146985881             kafka
4   52146985881            kestra
5   52146985881             spark
6   52146965061  data-engineering
7   52146965061               dbt
8   52146965061            docker
9   52146965061             kafka
10  52146965061            kestra
11  52146965061             spark
             id              type  public    created_at  actor__id  \
0   52161217416        WatchEvent    True  1.752673e+09  209787373   
1   52158741336        WatchEvent    True  1.752670e+09  146030836   
2   52156736196         ForkEvent    True  1.752667e+09   78231106   
3   52153479182        WatchEvent    True  1.752663e+09    1277699   
4   52152763141        WatchEvent    True  1.752662e+09   96905082   
5   52151408410        WatchEvent    True  1.752660e+09  189883432   
6   52149065099        WatchEvent    True  1.752657e+09  108585322  

Unnamed: 0,id,type,public,created_at,actor__id,actor__login,topic_lists
10,52123088600,WatchEvent,True,1752604000.0,220941820,RJ-DataHacks,[]
11,52123619539,WatchEvent,True,1752605000.0,117845465,Scottymichaelmillerguy,[]
12,52128184772,WatchEvent,True,1752613000.0,38223302,iagosaito,[]
13,52130233973,WatchEvent,True,1752617000.0,220433772,zeynel6199,[]
14,52131888490,ForkEvent,True,1752621000.0,150082916,fxu191,[]
15,52135250884,WatchEvent,True,1752630000.0,48865620,ritazhousmile,[]
16,52136685716,WatchEvent,True,1752634000.0,7132448,rdudeja,[]
17,52137521270,ForkEvent,True,1752637000.0,89344406,itsyoutalend,[]
18,52141622890,ForkEvent,True,1752646000.0,91791872,NirbhayS46,[]
19,52146965061,PullRequestEvent,True,1752654000.0,160558542,hnahtneyugn,"[data-engineering, dbt, docker, kafka, kestra,..."
