In [1]:
import os
import firebase_admin
import requests
from tqdm import tqdm
import time
from firebase_admin import credentials, firestore
import grpc
import json
from google.cloud.firestore_v1.services.firestore import FirestoreClient
from google.cloud.firestore_v1.services.firestore.transports import FirestoreGrpcTransport

In [2]:
# Select the target environment and build the Firestore client (`db`) used by
# the rest of the notebook. The environment is hard-coded here.
env = "development"
if env != "development":
  # Non-development run: initialize_app() is called without an explicit
  # credential object.
  firebase_app = firebase_admin.initialize_app()
  db = firestore.client(app=firebase_app)

else:
  # Development run: authenticate with the local service-account file.
  cred = credentials.Certificate('dev_firebase_config.json')
  firebase_app = firebase_admin.initialize_app(cred)
  db = firestore.client(app=firebase_app)

  # Swap the client's gRPC API object for one that talks over a plaintext
  # channel to localhost:8080.
  # NOTE(review): presumably the local Firestore emulator -- confirm. This
  # assigns to a private attribute of the client, so it may break on upgrade.
  channel = grpc.insecure_channel("localhost:8080")
  transport = FirestoreGrpcTransport(channel=channel)
  db._firestore_api_internal = FirestoreClient(transport=transport)

In [3]:
# Helper functions: fetch courses from the API, write them to Firestore, and normalize name capitalization
def fetch_all_courses(base_url, initial_page=1, sleep_time=1, timeout=30):
    """Download every page of course listings from a paginated endpoint.

    The first request both reveals the page count (``universita.totalPages``)
    and supplies the first page of results, so each page is requested exactly
    once.

    Args:
        base_url: Endpoint URL; the page number is sent as the ``page`` query
            parameter.
        initial_page: First page number to fetch.
        sleep_time: Seconds to pause before each follow-up request.
        timeout: Seconds to wait for each HTTP response; without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        A list with the concatenated ``universita.corsi`` entries of every
        page fetched. The list is empty if the first request does not return
        HTTP 200, and partial if a later page fails (fetching stops there).
    """
    courses = []

    initial_response = requests.get(
        base_url, params={"page": initial_page}, timeout=timeout
    )
    if initial_response.status_code != 200:
        print(f"Failed to fetch initial data: HTTP {initial_response.status_code}")
        return courses

    first_payload = initial_response.json()['universita']
    total_pages = first_payload['totalPages']
    if initial_page > total_pages:
        # Nothing in range: keep returning an empty list in this case.
        return courses

    # Reuse the response we already have instead of requesting this page again.
    courses.extend(first_payload['corsi'])

    for page in tqdm(range(initial_page + 1, total_pages + 1), desc="Fetching pages"):
        # Pause between consecutive requests to avoid hammering the server.
        time.sleep(sleep_time)
        response = requests.get(base_url, params={"page": page}, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch data on page {page}: HTTP {response.status_code}")
            break

        courses.extend(response.json()['universita']['corsi'])

    return courses

def save_to_firestore(courses):
    """Write each course dict to the ``courses`` Firestore collection.

    The course's ``id`` value, converted to a string, is used as the document
    ID. Courses without a usable ``id`` are reported and skipped.

    Args:
        courses: Iterable of course dicts.
    """
    collection_ref = db.collection(u'courses')
    for course in courses:
        raw_id = course.get('id')
        # Validate before converting: str(None) is the truthy string 'None',
        # which would silently create a document literally named "None".
        if raw_id is None or raw_id == '':
            print("Course data is missing 'id' field:", course)
            continue
        # Use the 'id' field from the course data as the document ID
        collection_ref.document(str(raw_id)).set(course)
    print("Data has been written to Firestore.")

def capitalize_name(name):
    """Return ``name`` in title-style case, keeping short connecting words lowercase.

    The first word is always capitalized. Every later word is capitalized
    unless it is one of the connecting words listed below, in which case it
    is lowercased. Runs of whitespace collapse to single spaces.

    Args:
        name: The string to normalize.

    Returns:
        The normalized string, or ``''`` when ``name`` is empty or contains
        only whitespace.
    """
    # Lowercase words that should not be capitalized
    lowercase_words = {'della', 'di', 'e', 'con', 'per', 'dell', 'degli', 'del', 'a', 'da', 'in', 'su', 'tra', 'fra'}
    words = name.split()
    if not words:
        # Empty / whitespace-only input: avoid IndexError on words[0].
        return ''
    capitalized_words = [words[0].capitalize()]
    for word in words[1:]:
        lowered = word.lower()
        capitalized_words.append(lowered if lowered in lowercase_words else word.capitalize())
    return ' '.join(capitalized_words)

In [4]:
# base_url = "https://universitaly-backend.cineca.it/api/offerta-formativa/cerca-corsi"
# all_courses = fetch_all_courses(base_url, sleep_time=5)
# for course in all_courses:
#     course['nomeCorso'] = capitalize_name(course['nomeCorso'])
#     course['nomeStruttura'] = capitalize_name(course['nomeStruttura'])
# # save data to json
# with open('all_courses_data.json', 'w') as f:
#     json.dump(all_courses, f)

In [4]:
# for development
# Load the local sample file. The context manager closes the file handle
# deterministically, and the explicit encoding avoids depending on the
# platform's default codec when names contain non-ASCII characters.
with open('test_courses_data.json', encoding='utf-8') as courses_file:
    all_courses = json.load(courses_file)

# Updating course names
for course in all_courses:
    course['nomeCorso'] = capitalize_name(course['nomeCorso'])
    course['nomeStruttura'] = capitalize_name(course['nomeStruttura'])

In [5]:
# Write the loaded, name-normalized courses to Firestore through the `db`
# client built in the environment setup cell.
save_to_firestore(all_courses)

Data has been written to Firestore.
