In [11]:
# PySpark Imports
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext, DataFrameWriter
from pyspark.sql.functions import isnan, when, count

# PySpark ML Imports
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.param import Param, Params
from pyspark.ml.feature import Bucketizer, VectorAssembler, StringIndexer

# Other Imports
import pandas as pd
import duckdb
import numpy as np
import os
import sys

# System paths
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Database path
DATABASE_PATH = "../database/DDBB_duckdb.duckdb"


def fetch_duckdb() -> list[pd.DataFrame]:
    """
    Fetches all the required data from the database and returns an array of dataframes.
    TEMP: Only data from the movies table is being fetched right now. Expand to writers
    
    :param
    """
    con = duckdb.connect(database=DATABASE_PATH, read_only=False)
    df = con.execute('''
        WITH director_avg_scores AS (
        SELECT 
            d.director_id,
            COALESCE(SUM(CASE WHEN m.label THEN 1 ELSE 0 END) / NULLIF(COUNT(*), 0), 0.5) AS director_avg_score
        FROM 
            directing d
        INNER JOIN 
            movies m ON d.movie_id = m.movie_id
        WHERE 
            m.subset = 'train'
        GROUP BY 
            d.director_id
    ),
    director_scores AS (
        SELECT 
            d.movie_id,
            COUNT(d.director_id) AS director_count,
            AVG(COALESCE(das.director_avg_score, 0.5)) AS director_avg_score
        FROM 
            directing d
        LEFT JOIN 
            director_avg_scores das ON das.director_id = d.director_id
        GROUP BY 
            d.movie_id
    ),
    writer_avg_scores AS (
        SELECT 
            w.writer_id,
            COALESCE(SUM(CASE WHEN m.label THEN 1 ELSE 0 END) / NULLIF(COUNT(*), 0), 0.5) AS writer_avg_score
        FROM 
            writing w
        INNER JOIN 
            movies m ON w.movie_id = m.movie_id
        WHERE 
            m.subset = 'train'
        GROUP BY 
            w.writer_id
    ),
    writer_scores AS (
        SELECT 
            w.movie_id,
            COUNT(w.writer_id) AS writer_count,
            AVG(COALESCE(was.writer_avg_score, 0.5)) AS writer_avg_score
        FROM 
            writing w
        LEFT JOIN 
            writer_avg_scores was ON w.writer_id = was.writer_id
        GROUP BY 
            w.movie_id
    )
    SELECT
        m.subset, 
        m.movie_id,
        m.num_votes,
        m.runtime_min,
        m.title_length,
        ds.director_avg_score,
        COALESCE(ds.director_count, 0) AS director_count,
        CASE WHEN m.label THEN 1 ELSE 0 END AS label,
        ws.writer_avg_score,
        COALESCE(ws.writer_count, 0) AS writer_count
    FROM 
        movies m
    LEFT JOIN 
        director_scores ds ON m.movie_id = ds.movie_id
    LEFT JOIN 
        writer_scores ws ON m.movie_id = ws.movie_id;
    ''').fetch_df()
    con.close()
    
    
    train = df[df['subset'] == 'train'].drop(['subset'], axis=1).dropna()
    test = df[df['subset'] == 'test'].drop(['subset', 'label'], axis=1)
    validation = df[df['subset'] == 'val'].drop(['subset', 'label'], axis=1)
    
    return train, test, validation

def generate_pipeline(features: list) -> Pipeline:
    """
    Function to generate the Spark pipeline based on the following operations:
        - Assembling (choosing) the desired features (numeric).
        - Index the selected features to be processed by the pipeline (strings).
        - Initializing the pipeline based on the indexed features.
    
    :param
    """
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    indexer = StringIndexer(inputCol="label").setOutputCol("label-index")
    pipeline = Pipeline().setStages([assembler, indexer])
    return pipeline

def generate_output_pipeline(features: list) -> Pipeline:
    """
    Function to generate the Spark pipeline based on the following operations:
        - Assembling (choosing) the desired features (numeric).
        - Index the selected features to be processed by the pipeline (strings).
        - Initializing the pipeline based on the indexed features.
    
    :param
    """
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    pipeline = Pipeline().setStages([assembler])
    return pipeline

    
def create_submission(model, validation, test, features) -> None:
    """
    Create the required submission file in .csv format
    
    :param model: PySpark generated binary classifier
    """    
    pipeline = generate_output_pipeline(features)
    pipeline_fit = pipeline.fit(validation)
    p_val = pipeline_fit.transform(validation)
    p_test = pipeline_fit.transform(test)
    
    validation = validation.toPandas()  
    test = test.toPandas()  
    
    validation["label"] = model.transform(p_val).select('prediction').collect().tolist
    test["label"] = model.transform(p_test).select('prediction').collect().tolist

    # Cast to bool and store in .csv
    validation["label"].astype(bool).to_csv("val_result.csv", index=False, header=None)
    test["label"].astype(bool).to_csv("test_result.csv", index=False, header=None)

    # Generate final submission
    for file in ["val_result.csv", "test_result.csv"]:
        with open(file, 'r+') as f:
            f.seek(0,2)                    
            size=f.tell()               
            f.truncate(size-2)

def automated_submission() -> None:
    """
    Automates the submision of files to the Azure server for the competition
    
    :param
    """
    
def main() -> None:
    """
    Main PySpark pipeline execution.
    
    :param
    """
    # Initialize PySpark Context
    conf = SparkConf().setAppName("binary-ml-classification")
    sc = SparkContext.getOrCreate(conf)
    sqlContext = SparkSession.builder.getOrCreate()
    
    # Fetch data and process features to obtain a Spark Dataframe
    train, test, validation = fetch_duckdb()
    features = ["runtime_min", "num_votes", "director_avg_score",	"director_count", "writer_avg_score","writer_count"]
    df_train = sqlContext.createDataFrame(train)
    
    # Generate the pipeline
    pipeline = generate_pipeline(features)
    
    # Fit the pipeline using the Spark Dataframe
    pipeline_fit = pipeline.fit(df_train)  
    
    # Generate and train the model
    prepared = pipeline_fit.transform(df_train)
    dt = DecisionTreeClassifier(labelCol = "label-index", featuresCol= "features")
    dt_model = dt.fit(prepared)
    
    # Read output generation files
    df_validation = sqlContext.createDataFrame(validation)
    df_test = sqlContext.createDataFrame(test)

    create_submission(dt_model, df_validation, df_test, features)
    

In [12]:
# from pyspark.sql import SparkSession
# from pyspark.ml.feature import VectorAssembler
# from pyspark.ml.classification import RandomForestClassifier
# from pyspark.ml.evaluation import BinaryClassificationEvaluator

# # Initialize PySpark Context
# spark = SparkSession.builder.appName("binary-ml-classification").getOrCreate()

# # Import and preprocess data
# train, test, validation = fetch_duckdb()

# # Define features list
# features = ["runtime_min", "num_votes", "director_avg_score", "director_count", "writer_avg_score", "writer_count"]

# # Create Spark DataFrames
# df_train = spark.createDataFrame(train)
# df_test = spark.createDataFrame(test)
# df_validation = spark.createDataFrame(validation)

# # Assemble features into a single feature vector
# assembler = VectorAssembler(inputCols=features, outputCol="features")

# # Transform the data
# df_train = assembler.transform(df_train)
# df_validation = assembler.transform(df_validation)
# df_test = assembler.transform(df_test)

# # Define and train RandomForestClassifier
# rf = RandomForestClassifier(labelCol="label", featuresCol="features")
# model = rf.fit(df_train)

# # Evaluate model on validation set
# predictions_validation = model.transform(df_validation)

In [2]:
if __name__ == '__main__':
    main()

Py4JJavaError: An error occurred while calling o307.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 15 in stage 19.0 failed 1 times, most recent failure: Lost task 15.0 in stage 19.0 (TID 263) (146.50.236.253 executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (VectorAssembler$$Lambda$1554/175834413: (struct<runtime_min_double_VectorAssembler_85b8d87e8f24:double,num_votes_double_VectorAssembler_85b8d87e8f24:double,director_avg_score:double,director_count_double_VectorAssembler_85b8d87e8f24:double,writer_avg_score:double,writer_count_double_VectorAssembler_85b8d87e8f24:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:217)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.sql.execution.SparkPlan$$Lambda$4085/1493101680.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:888)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:888)
	at org.apache.spark.rdd.RDD$$Lambda$4086/2027839795.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2657/1376208232.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Encountered NaN while assembling a row with handleInvalid = "error". Consider
removing NaNs from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:264)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler$$$Lambda$4437/101908339.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	at org.apache.spark.ml.feature.VectorAssembler$$Lambda$4408/1510866085.apply(Unknown Source)
	... 21 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler$$Lambda$4898/1609495763.apply(Unknown Source)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler$$Lambda$4896/367474938.apply(Unknown Source)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1019)
	at org.apache.spark.rdd.RDD$$Lambda$4051/1156000764.apply(Unknown Source)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1018)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3997)
	at org.apache.spark.sql.Dataset$$Lambda$4838/373286523.apply(Unknown Source)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.Dataset$$Lambda$2128/48645133.apply(Unknown Source)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset$$Lambda$1775/1934605263.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$1783/1117488226.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$1776/1616610578.apply(Unknown Source)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3994)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (VectorAssembler$$Lambda$1554/175834413: (struct<runtime_min_double_VectorAssembler_85b8d87e8f24:double,num_votes_double_VectorAssembler_85b8d87e8f24:double,director_avg_score:double,director_count_double_VectorAssembler_85b8d87e8f24:double,writer_avg_score:double,writer_count_double_VectorAssembler_85b8d87e8f24:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:217)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.sql.execution.SparkPlan$$Lambda$4085/1493101680.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:888)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:888)
	at org.apache.spark.rdd.RDD$$Lambda$4086/2027839795.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2657/1376208232.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: org.apache.spark.SparkException: Encountered NaN while assembling a row with handleInvalid = "error". Consider
removing NaNs from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:264)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler$$$Lambda$4437/101908339.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	at org.apache.spark.ml.feature.VectorAssembler$$Lambda$4408/1510866085.apply(Unknown Source)
	... 21 more


In [None]:
import requests

# Replace these with your actual file paths
# csv_file1_path = "path/to/your/first/csv/file.csv"
# csv_file2_path = "path/to/your/second/csv/file.csv"

# URL of the submission page
submit_url = "http://big-data-competitions.swedencentral.cloudapp.azure.com:8080/competitions/imdb/submit"

# Authenticate if required
# headers = {"Authorization": "Bearer YOUR_ACCESS_TOKEN"}

# Prepare files for upload
files = {
    "file1": open(csv_file1_path, "rb"),
    "file2": open(csv_file2_path, "rb")
}

# Make the POST request to upload files
response = requests.post(submit_url, files=files)

# Check the response
if response.status_code == 200:
    print("Files submitted successfully!")
else:
    print(f"Error submitting files. Status code: {response.status_code}")
    print(response.text)  # Print any error message from the server