In [None]:
# Set Spark environment variables
import os
from pyspark.sql import SparkSession

os.environ["SPARK_HOME"] = "/opt/spark"  # Path where Spark is installed
os.environ["PATH"] += os.pathsep + os.path.join(os.environ["SPARK_HOME"], "bin")
os.environ["PYSPARK_PYTHON"] = "python3"  # Ensure it uses Python 3
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"  # For Jupyter Notebook

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("PySparkTest") \
    .getOrCreate()


# Get the current Spark context
sc = spark.sparkContext

# Check the master URL and number of cores
print(f"Spark is running on master: {sc.master}")
print(f"Number of cores: {sc.defaultParallelism}")



In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
import requests
import json

# Initialize Spark session (already done in your example)
spark = SparkSession.builder.appName("APIParallelization").getOrCreate()
sc = spark.sparkContext

# Your original function for API calls
def get_assessment_scores(_id):
    base_url = "your_api_endpoint_here"
    url = f"{base_url}/{_id}"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()  # Your logic to process the response
            return data
        else:
            return None
    except Exception as e:
        print(f"Error fetching data for {_id}: {str(e)}")
        return None

# List of IDs that you want to process
id_list = [1, 2, 3, 4, 5]  # Your list of IDs

# Parallelize the list of IDs and apply the API call function
rdd = sc.parallelize(id_list)

# Use `map` to apply the `get_assessment_scores` function in parallel
result = rdd.map(get_assessment_scores).collect()  # `collect()` gathers results back to the driver

# Print the result (or further process the data)
print(result)


In [None]:
%load_ext autoreload
%autoreload 2

from modules.auth import *
from modules.assessments_endpoints import *
from modules.frame_transformations import *
from modules.assessment_endpoint_spark import *
from modules.config import base_url_illuminate
import logging
import os
import sys


log_dir = "logs"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Log file path
log_file_path = os.path.join(log_dir, "illuminate_log.txt")

# Configure logging to use both StreamHandler (stdout) and FileHandler (log file)
logging.basicConfig(
    level=logging.INFO,  # Adjust as needed (e.g., DEBUG, WARNING)
    format="%(asctime)s - %(message)s",  # Log format
    datefmt="%d-%b-%y %H:%M:%S",  # Date format
    handlers=[
        logging.StreamHandler(sys.stdout),  # Direct logs to stdout
        logging.FileHandler(log_file_path, mode='a')  # Append to the log file
    ],
    force=True  # Ensures existing handlers are replaced
)


start_date = '2024-07-01'

access_token, expires_in = get_access_token()

assessments_df, assessment_id_list = get_all_assessments_metadata(access_token)
# assessment_id_list = assessment_id_list[:500]

# test_results_standard, log_results_standard = loop_through_assessment_scores(access_token, assessment_id_list, 'Standard', start_date, end_date_override=None)


# Call the parallelized version of the function, spark session initiated in assessment_endpoint_spark module
test_results_group, log_results_group = parallel_get_assessment_scores(
    access_token, assessment_id_list, 'Standard', start_date, end_date_override=None
)


#took 7 seconds with pyspark for the first 50
#took 9 seconds with pandas for the first 50

#took 47  seconds with pypark for the first 500
# too 2 mins 36 seconds with pandas for the first 500

#took 8 mins with pypark for all
#took 17 mins with pandas for all


In [24]:
#Next steps, 
#Make module into 1

#Create spark session in main script
#Merge branch with main for feauture enhancement practice. 

#Add to requirements.txt
#Re-initaite docker with new tag of spark
#Test in airflow

test_results_group


Unnamed: 0,assessment_id,title,academic_benchmark_guid,standard_code,standard_description,local_student_id,first_name,last_name,middle_name,date_taken,points,points_possible,percent_correct,performance_band_level,performance_band_label,mastered,__count,Standard_No_Standard
0,107620,1st Grade - IM Unit 2 - Section C Checkpoint,A65A4396-6F89-11DF-BAEE-EA329DFF4B22,CCSS.Math.Content.1.OA.C.5,Relate counting to addition and subtraction (e...,601867,Eva,Wairegi,Kalekye,2024-10-11,3,5,60,2,Approaching Expectations,f,222,Standard
1,107620,1st Grade - IM Unit 2 - Section C Checkpoint,A658F1E4-6F89-11DF-BAEE-EA329DFF4B22,CCSS.Math.Content.1.OA.B.4,Understand subtraction as an unknown-addend pr...,602229,De'yon,Carter Jr,Earronda,2024-10-11,3,5,60,2,Approaching Expectations,f,222,Standard
2,107620,1st Grade - IM Unit 2 - Section C Checkpoint,A658F1E4-6F89-11DF-BAEE-EA329DFF4B22,CCSS.Math.Content.1.OA.B.4,Understand subtraction as an unknown-addend pr...,602284,Denym,Cagle,Elise,2024-10-11,2,5,40,1,Does Not Meet Expectations,f,222,Standard
3,107620,1st Grade - IM Unit 2 - Section C Checkpoint,A658F1E4-6F89-11DF-BAEE-EA329DFF4B22,CCSS.Math.Content.1.OA.B.4,Understand subtraction as an unknown-addend pr...,601956,Auness,Brown,Mary Renee,2024-10-11,4,5,80,3,Meets Expectations,t,222,Standard
4,107620,1st Grade - IM Unit 2 - Section C Checkpoint,A65B50C4-6F89-11DF-BAEE-EA329DFF4B22,CCSS.Math.Content.1.OA.C.6,"Add and subtract within 20, demonstrating flue...",602142,Carden,Stewart,Chad Jeremiah,2024-10-11,5,5,100,4,Exceeds Expectations,t,222,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93084,141481,Unit 2 Assessment: Modeling One-Variable Quan...,C640DCD0-96FF-11E0-9509-C03D9DFF4B22,S-ID.2,Use statistics appropriate to the shape of the...,10367,Tristan,Shepherd,R,2024-11-04,3,4,75,3,Meets Expectations,t,34,Standard
93085,141481,Unit 2 Assessment: Modeling One-Variable Quan...,C640DCD0-96FF-11E0-9509-C03D9DFF4B22,S-ID.2,Use statistics appropriate to the shape of the...,44232,A'Nyah,Tolbert,Denise,2024-11-04,1,4,25,1,Does Not Meet Expectations,f,34,Standard
93086,141481,Unit 2 Assessment: Modeling One-Variable Quan...,C640DCD0-96FF-11E0-9509-C03D9DFF4B22,S-ID.2,Use statistics appropriate to the shape of the...,45118,Tania,Watson,N,2024-11-04,2,4,50,1,Does Not Meet Expectations,f,34,Standard
93087,141481,Unit 2 Assessment: Modeling One-Variable Quan...,C640DCD0-96FF-11E0-9509-C03D9DFF4B22,S-ID.2,Use statistics appropriate to the shape of the...,12282013,Victoria,Maskall,E,2024-11-04,4,4,100,4,Exceeds Expectations,t,34,Standard
