In [12]:
import pandas as pd
# Location of the raw CSV, relative to the notebook's working directory.
direccion = "./../data/raw/prueba.csv"

# Read only the first 10,000 rows into pandas for a quick look at the data.
df = pd.read_csv(filepath_or_buffer=direccion, nrows=10000)
df.head()

# Distinct values found in the "Cancelled" column of the sample.
df["Cancelled"].unique()

array([0., 1.])

In [18]:
# Location of the raw CSV, relative to the notebook's working directory.
direccion = "./../data/raw/prueba.csv"

from pyspark.sql import SparkSession

# getOrCreate() reuses an already running session when there is one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Read the CSV using its header row and inferred column types, then cap the
# DataFrame at 20,000 rows.
csv_reader = spark.read
df = csv_reader.csv(direccion, header="true", inferSchema="true").limit(20000)

In [19]:
def clean(df):
    """Lower-case the column names, keep a fixed set of columns and add derived fields.

    Steps, in order:

    1. Rename every column to its lower-case form.
    2. Select the fixed list of columns below (the original author's note
       describes them as the non-empty ones).
    3. Add ``rangoatrasohoras``, a string bucket built from ``cancelled`` and
       ``depdelayminutes``: ``"cancelled"``, ``"0-1.5"`` (< 90 min),
       ``"1.5-3.5"`` (90 to < 210 min) or ``"3.5-"`` (everything else).
    4. Normalise ``origincityname`` and ``destcityname``: lower-case, spaces
       replaced by underscores, and only the text before the first comma kept.

    Args:
        df: Spark DataFrame whose columns, once lower-cased, include every
            name in the selection below.

    Returns:
        A new Spark DataFrame with the selected columns plus
        ``rangoatrasohoras``.
    """
    from pyspark.sql import functions as f

    # Lower-case all column names.
    for column_name in df.columns:
        df = df.withColumnRenamed(column_name, column_name.lower())

    # Columns to keep.
    kept_columns = [
        "year", "quarter", "month", "dayofmonth", "dayofweek", "flightdate",
        "reporting_airline", "dot_id_reporting_airline",
        "iata_code_reporting_airline", "tail_number",
        "flight_number_reporting_airline",
        "originairportid", "originairportseqid", "origincitymarketid",
        "origin", "origincityname", "originstate", "originstatefips",
        "originstatename", "originwac",
        "destairportid", "destairportseqid", "destcitymarketid",
        "dest", "destcityname", "deststate", "deststatefips",
        "deststatename", "destwac",
        "crsdeptime", "deptime", "depdelay", "depdelayminutes", "depdel15",
        "departuredelaygroups", "deptimeblk",
        "taxiout", "wheelsoff", "wheelson", "taxiin",
        "crsarrtime", "arrtime", "arrdelay", "arrdelayminutes", "arrdel15",
        "arrivaldelaygroups", "arrtimeblk",
        "cancelled", "diverted",
        "crselapsedtime", "actualelapsedtime", "airtime",
        "flights", "distance", "distancegroup", "divairportlandings",
    ]
    base = df.select(*kept_columns)

    # Delay bucket in hours: 0-1.5, 1.5-3.5, 3.5-, or cancelled.
    # The branches are evaluated in order, so the second threshold only needs
    # an upper bound. The previous version tested `> 90`, which sent a delay of
    # exactly 90 minutes to the "3.5-" bucket.
    # NOTE(review): a null depdelayminutes on a non-cancelled flight matches no
    # branch and therefore still lands in "3.5-" -- confirm that is intended.
    delay_minutes = f.col("depdelayminutes")
    base = base.withColumn(
        "rangoatrasohoras",
        f.when(f.col("cancelled") == 1, "cancelled")
        .when(delay_minutes < 90, "0-1.5")
        .when(delay_minutes < 210, "1.5-3.5")
        .otherwise("3.5-"),
    )

    def clean_text(c):
        """Lower-case a column, replace spaces with '_' and keep the part before the first comma."""
        c = f.lower(c)
        c = f.regexp_replace(c, " ", "_")
        # A plain "," is the same regex as the old '\,' without relying on an
        # invalid Python escape sequence.
        return f.split(c, ",")[0]

    # Apply the text clean-up to both city-name columns.
    base = base.withColumn("origincityname", clean_text(f.col("origincityname")))
    base = base.withColumn("destcityname", clean_text(f.col("destcityname")))
    return base

In [20]:
# Run the cleaning step on the Spark DataFrame: lower-cased column names, the
# fixed column selection, the delay bucket and the normalised city names.
df = clean(df)

In [21]:
from pyspark.ml import Pipeline


def ignore_list(df, data_types, max_distinct=100):
    """Return the string columns whose distinct-value count exceeds ``max_distinct``.

    Args:
        df: Spark DataFrame to inspect.
        data_types: Mapping from data-type name to column names, as produced
            by ``get_data_types``; only the ``"StringType"`` entry is read.
        max_distinct: Columns with strictly more distinct values than this are
            returned. Defaults to 100, the threshold that used to be
            hard-coded.

    Returns:
        List of column names ordered by distinct count, highest first. Empty
        when there are no string columns.
    """
    string_columns = data_types["StringType"]
    if not string_columns:
        # Nothing to count; an aggregation with no expressions is rejected by
        # Spark, so stop before building one.
        return []

    # Imported here, after the early return, so the empty case needs no Spark.
    from pyspark.sql.functions import countDistinct

    # One aggregation row holding the distinct count of every string column.
    counts_row = df.agg(
        *[countDistinct(c).alias(c) for c in string_columns]
    ).first()
    counts = counts_row.asDict()

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return [name for name, n_distinct in ranked if n_distinct > max_distinct]

def get_data_types(df):
    """Group the column names of ``df`` by the class name of their data type.

    The key is ``type(field.dataType).__name__`` (for example ``"StringType"``
    or ``"DoubleType"``) rather than ``str(field.dataType)``. The string form
    is not stable across PySpark releases (newer versions render it as
    ``"StringType()"``), which would make lookups such as
    ``data_types["StringType"]`` silently come back empty. Parameterised types
    are keyed by class name only (e.g. ``"DecimalType"``).

    Args:
        df: Object exposing ``schema.fields``, each field having ``name`` and
            ``dataType`` attributes (a Spark DataFrame).

    Returns:
        ``defaultdict(list)`` mapping type name to the list of column names of
        that type, in schema order. Missing keys yield an empty list.
    """
    from collections import defaultdict

    data_types = defaultdict(list)
    for entry in df.schema.fields:
        data_types[type(entry.dataType).__name__].append(entry.name)
    return data_types

def create_pipeline(df, label_col="label"):
    """Build the feature/model pipeline, cross-validate it, and return the pieces.

    The pipeline indexes and one-hot encodes the retained string columns,
    imputes the numeric columns, assembles and standardises the features,
    reduces them with PCA and fits a logistic regression.

    Side effects: fits a 2-fold ``CrossValidator`` on an 80% split of ``df``,
    transforms the remaining 20% and prints the resulting DataFrame plus a few
    prediction rows.

    Fixes relative to the previous version:
      * ``ParamGridBuilder``, ``CrossValidator`` and
        ``BinaryClassificationEvaluator`` are now imported.
      * the undefined names ``test`` and ``selected`` are replaced by
        ``df_test`` and an explicit selection of prediction columns.
      * the label column is kept out of the feature vector; it is numeric and
        was previously imputed and assembled like any other feature.
      * string indexers keep unseen categories instead of failing when a
        validation fold contains a value absent from the training fold.

    NOTE(review): string features derived from the label (for example a bucket
    column that marks cancelled flights) would still leak the target; this
    function cannot detect them.

    Args:
        df: Spark DataFrame containing ``label_col`` and the feature columns.
        label_col: Name of the target column. Defaults to ``"label"``.

    Returns:
        Tuple ``(pipeline, crossval, df)``: the unfitted ``Pipeline``, the
        ``CrossValidator`` wrapping it, and ``df`` with the extra
        ``*_cast_to_double`` columns that the pipeline stages expect.
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.feature import (
        Imputer,
        OneHotEncoder,
        PCA,
        StandardScaler,
        StringIndexer,
        VectorAssembler,
    )
    from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

    # Column bookkeeping lives here so the estimator/transformer inputs can be
    # adjusted in one place.
    data_types = get_data_types(df)
    ignore = ignore_list(df, data_types)
    excluded = set(ignore) | {label_col}

    # -------------- STRING --------------
    strings_used = [var for var in data_types["StringType"] if var not in excluded]

    # -------------- DOUBLE --------------
    numericals_double = [var for var in data_types["DoubleType"] if var not in excluded]
    numericals_double_imputed = [var + "_imputed" for var in numericals_double]

    # -------------- INTEGERS --------------
    # Integer columns are copied as doubles before imputation.
    cast_suffix = "_cast_to_double"
    for c in data_types["IntegerType"]:
        df = df.withColumn(c + cast_suffix, df[c].cast("double"))

    numericals_int = [
        var for var in df.columns
        if var.endswith(cast_suffix)
        and var not in excluded
        and var != label_col + cast_suffix
    ]
    numericals_int_imputed = [var + "_imputed" for var in numericals_int]

    # ============= ONE HOT ENCODING ================
    # handleInvalid="keep" routes categories unseen at fit time to an extra
    # index instead of raising during cross-validation.
    stage_string = [
        StringIndexer(inputCol=c, outputCol=c + "_string_encoded", handleInvalid="keep")
        for c in strings_used
    ]
    stage_one_hot = [
        OneHotEncoder(inputCol=c + "_string_encoded", outputCol=c + "_one_hot")
        for c in strings_used
    ]

    # =============== IMPUTERS ====================
    # An imputer is only added when it has at least one column to work on.
    stage_imputers = []
    if numericals_double:
        stage_imputers.append(
            Imputer(inputCols=numericals_double, outputCols=numericals_double_imputed)
        )
    if numericals_int:
        stage_imputers.append(
            Imputer(inputCols=numericals_int, outputCols=numericals_int_imputed)
        )

    # ============= VECTOR ASSEMBLER ================
    # NOTE(review): the imputed integer columns are produced but, as before,
    # not added to the feature vector -- confirm whether that is intended.
    features = numericals_double_imputed + [var + "_one_hot" for var in strings_used]
    stage_assembler = VectorAssembler(inputCols=features, outputCol="assem_features")

    # ==================== STANDARDISE =======================
    stage_scaler = StandardScaler(
        inputCol=stage_assembler.getOutputCol(),
        outputCol="scaled_features",
        withStd=True,
        withMean=True,
    )

    # ==================== PCA =======================
    stage_pca = PCA(k=15, inputCol=stage_scaler.getOutputCol(), outputCol="features")

    # =================== MODEL =====================
    clr = LogisticRegression(
        maxIter=10, regParam=0.01, fitIntercept=True, labelCol=label_col
    )

    # ================== PIPELINE ===================
    pipeline = Pipeline(
        stages=stage_string + stage_one_hot      # categorical data
        + stage_imputers                         # data imputation
        + [
            stage_assembler,                     # assembling data
            stage_scaler,                        # standardise data
            stage_pca,                           # dimensionality reduction
            clr,
        ]
    )

    paramGrid = (
        ParamGridBuilder()
        .addGrid(stage_pca.k, [2, 3])
        .addGrid(clr.maxIter, [2, 3])
        .build()
    )

    crossval = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator(labelCol=label_col),
        numFolds=2,  # use 3+ folds in practice
    )

    df_train, df_test = df.randomSplit([0.8, 0.2], 123)

    cvModel = crossval.fit(df_train)

    # Predict on the held-out split; cvModel applies the best model found.
    prediction = cvModel.transform(df_test)
    print(prediction)
    selected = prediction.select(label_col, "probability", "prediction")
    for row in selected.take(5):
        print(row)

    # df is returned as well because the pipeline stages reference the
    # integer-to-double columns added above.
    return pipeline, crossval, df


def imputa_categoricos(df):
    """Fill nulls in the retained string columns with the literal ``"missing"``.

    The retained columns are the ``"StringType"`` entries reported by
    ``get_data_types`` minus whatever ``ignore_list`` returns for ``df``.

    Args:
        df: DataFrame to fill.

    Returns:
        The result of ``df.fillna`` with one ``"missing"`` entry per retained
        string column.
    """
    type_groups = get_data_types(df)
    skipped = ignore_list(df, type_groups)

    fill_values = {
        column: "missing"
        for column in type_groups["StringType"]
        if column not in skipped
    }
    return df.fillna(fill_values)



In [20]:
# Prepare the modelling frame: the "cancelled" column becomes the label and
# nulls in the categorical columns are filled in.
# (Original note: split into train and test; this DataFrame should already be
# filtered.)
df2 = imputa_categoricos(df.withColumnRenamed("cancelled", "label"))

# create_pipeline also hands back the DataFrame because it adds the
# integer-to-double columns that the pipeline stages read.
# NOTE(review): the second returned value is the CrossValidator built inside
# create_pipeline, even though it is bound to the name `paramGrid` here.
pipeline, paramGrid, df2 = create_pipeline(df2)

print(pipeline.getStages())

# Fit the pipeline on the whole frame and score that same frame.
model = pipeline.fit(df2)
df2 = model.transform(df2)

In [None]:
# Build, tune and evaluate the cancellation classifier at cell level, so the
# estimator/transformer settings can be tweaked directly.
#
# Changes from the previous version of this cell:
#   * every class used here is imported up front (ParamGridBuilder and
#     CrossValidator were never imported, and BinaryClassificationEvaluator
#     was imported after its first use);
#   * the label column is kept out of the feature vector;
#   * string indexers keep unseen categories instead of failing on them;
#   * the duplicated train/test split and the commented-out code are removed.

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import (
    Imputer,
    OneHotEncoder,
    PCA,
    StandardScaler,
    StringIndexer,
    VectorAssembler,
)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

df = df.withColumnRenamed("cancelled", "label")
df = imputa_categoricos(df)

data_types = get_data_types(df)
ignore = ignore_list(df, data_types)

# -------------- STRING --------------
strings_used = [
    var for var in data_types["StringType"]
    if var not in ignore and var != "label"
]

# -------------- DOUBLE --------------
# The label is numeric; leaving it here would feed the target to the model.
numericals_double = [
    var for var in data_types["DoubleType"]
    if var not in ignore and var != "label"
]
numericals_double_imputed = [var + "_imputed" for var in numericals_double]

# -------------- INTEGERS --------------
# Integer columns are copied as doubles before imputation.
for c in data_types["IntegerType"]:
    df = df.withColumn(c + "_cast_to_double", df[c].cast("double"))

numericals_int = [
    var for var in df.columns
    if var.endswith("_cast_to_double")
    and var not in ignore
    and var != "label_cast_to_double"
]
numericals_int_imputed = [var + "_imputed" for var in numericals_int]

# ============= ONE HOT ENCODING ================
# handleInvalid="keep" routes categories unseen at fit time to an extra index
# instead of raising during cross-validation.
stage_string = [
    StringIndexer(inputCol=c, outputCol=c + "_string_encoded", handleInvalid="keep")
    for c in strings_used
]
stage_one_hot = [
    OneHotEncoder(inputCol=c + "_string_encoded", outputCol=c + "_one_hot")
    for c in strings_used
]

# =============== IMPUTERS ====================
stage_imputer_double = Imputer(inputCols=numericals_double,
                               outputCols=numericals_double_imputed)
stage_imputer_int = Imputer(inputCols=numericals_int,
                            outputCols=numericals_int_imputed)

# ============= VECTOR ASSEMBLER ================
# NOTE(review): the imputed integer columns are produced but, as before, not
# added to the feature vector -- confirm whether that is intended.
features = numericals_double_imputed \
          + [var + "_one_hot" for var in strings_used]
stage_assembler = VectorAssembler(inputCols=features, outputCol="assem_features")

# ==================== STANDARDISE =======================
stage_scaler = StandardScaler(inputCol=stage_assembler.getOutputCol(),
                              outputCol="scaled_features", withStd=True, withMean=True)

# ==================== PCA =======================
stage_pca = PCA(k=15, inputCol=stage_scaler.getOutputCol(),
                outputCol="features")

# =================== MODEL =====================
clr = LogisticRegression(maxIter=10, regParam=0.01,
                         fitIntercept=True)

# ================== PIPELINE ===================
pipeline = Pipeline(stages=stage_string + stage_one_hot +          # categorical data
                          [stage_imputer_double,
                           stage_imputer_int,                       # data imputation
                           stage_assembler,                         # assembling data
                           stage_scaler,                            # standardise data
                           stage_pca,                               # dimensionality reduction
                           clr
                           ])

# Hyper-parameter grid explored by the cross-validator.
paramGrid = (
    ParamGridBuilder()
    .addGrid(stage_pca.k, [2, 3])
    .addGrid(clr.maxIter, [2, 3])
    .build()
)

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# 80/20 split with a fixed seed.
df_train, df_test = df.randomSplit([0.8, 0.2], 123)

cvModel = crossval.fit(df_train)

# Predict on the held-out split; cvModel applies the best model found.
prediction = cvModel.transform(df_test)

evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(prediction))