In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, when

class DataLoader:
    """Owns a SparkSession and reads delimited files into DataFrames."""

    def __init__(self):
        # getOrCreate() returns the active session if one already exists.
        self.spark = SparkSession.builder.appName("Ejemplo").getOrCreate()

    def load_file(
        self,
        file_location,
        *,
        file_type="csv",
        infer_schema="false",
        first_row_is_header="true",
        delimiter=",",
    ):
        """Read ``file_location`` and return it as a DataFrame.

        Args:
            file_location: Path of the file to read.
            file_type: Data source format passed to ``read.format``.
            infer_schema: Value for the ``inferSchema`` reader option. With
                the default ``"false"`` the CSV reader does not infer column
                types, so columns are loaded as strings.
            first_row_is_header: Value for the ``header`` reader option.
            delimiter: Value for the ``sep`` reader option.

        Returns:
            Whatever the configured reader's ``load`` returns for the path.
        """
        reader = (
            self.spark.read.format(file_type)
            .option("inferSchema", infer_schema)
            .option("header", first_row_is_header)
            .option("sep", delimiter)
        )
        return reader.load(file_location)

class DataProcessor:
    """Ranks players within nationality/position groups and filters them."""

    def __init__(self):
        self.data_loader = DataLoader()
        self.spark = self.data_loader.spark

    def tabla_salida(self, df):
        """Return the output table with a ``player_cat`` category column.

        Rows are ranked by ``overall`` (highest first) within each
        ``nationality`` / ``team_position`` partition, then categorised:
        rank <= 3 -> 'A', <= 5 -> 'B', <= 10 -> 'C', otherwise 'D'.

        Args:
            df: DataFrame containing at least the columns selected below.

        Returns:
            DataFrame with the selected player columns plus ``player_cat``.
        """
        # The loader reads CSVs with inferSchema disabled, so ``overall`` can
        # arrive as a string. Ordering strings is lexicographic ("9" > "85"),
        # so the ordering key is cast to a number. The column itself is left
        # untouched in the output.
        overall_numeric = col('overall').cast('double')
        window = Window.partitionBy('nationality', 'team_position').orderBy(overall_numeric.desc())
        df = df.withColumn('ranking', rank().over(window))
        df = df.withColumn(
            'player_cat',
            when(col('ranking') <= 3, 'A')
            .when(col('ranking') <= 5, 'B')
            .when(col('ranking') <= 10, 'C')
            .otherwise('D')
        )
        tabla_salida = df.select(
            'short_name', 'long_name', 'age', 'height_cm', 'weight_kg', 'nationality',
            'club_name', 'overall', 'potential', 'team_position', 'player_cat'
        )
        return tabla_salida

    def filtrado(self, tabla_salida):
        """Filter the output table by category and growth potential.

        Adds ``potential_vs_overall`` (``potential / overall``) and keeps:
        every 'A' and 'B' row, 'C' rows with a ratio above 1.15, and 'D'
        rows with a ratio above 1.25.

        Args:
            tabla_salida: DataFrame with ``player_cat``, ``potential`` and
                ``overall`` columns (as produced by ``tabla_salida``).

        Returns:
            The filtered DataFrame, including ``potential_vs_overall``.
        """
        tabla_salida = tabla_salida.withColumn('potential_vs_overall', col('potential') / col('overall'))
        df_filtrado = tabla_salida.filter(
            (col('player_cat').isin('A', 'B')) |
            ((col('player_cat') == 'C') & (col('potential_vs_overall') > 1.15)) |
            ((col('player_cat') == 'D') & (col('potential_vs_overall') > 1.25))
        )
        return df_filtrado

if __name__ == "__main__":
    # Script entry point: load the CSV, build the categorised output table,
    # then apply the category/potential filter, showing each result.
    data_processor = DataProcessor()
    # Reuse the loader owned by the processor to read the input file.
    df = data_processor.data_loader.load_file("/FileStore/tables/Libro1-3.csv")

    # Rank players and attach the player_cat column.
    tabla_salida = data_processor.tabla_salida(df)
    tabla_salida.show()

    # Keep A/B rows plus C/D rows that pass the potential/overall thresholds.
    df_filtrado = data_processor.filtrado(tabla_salida)
    df_filtrado.show()
