In [1]:
// Import necessary libraries

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.QuantileDiscretizer
import org.apache.spark.ml.Pipeline

// Read the Kickstarter projects CSV into a DataFrame.
// The kernel already provides a SparkSession as `spark`, so its SQLContext is reused here
// rather than built with the SQLContext constructor, which is deprecated since Spark 2.0.
val sqlContext = spark.sqlContext
// NOTE(review): `df` is never reassigned in the visible cells; it could become a `val`
// once it is confirmed that no later cell rebinds it.
var df = sqlContext
    .read
    .format("csv")
    .option("header","true")       // the first line of the file holds the column names
    .option("inferSchema","true")  // let Spark derive the column types from the data
    .option("quote", "\"")         // fields may be wrapped in double quotes
    .option("escape","\"")         // a double quote is also the escape character inside quoted fields
    .load("ks-projects-201801.csv")

Intitializing Scala interpreter ...

Spark Web UI available at http://2b282a8d538e:4040
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1590338068681)
SparkSession available as 'spark'


2020-05-24 16:34:24,391 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(62)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.QuantileDiscretizer
import org.apache.spark.ml.Pipeline
sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@6f0d304a
df: org.apache.spark.sql.DataFrame = [ID: int, name: string ... 13 more fields]


In [2]:
// Adds the feature "namelengthBinned": the number of characters in the project name,
// sorted into discrete quantile bins. The helper column "namelength" is not kept.
def discretizeNameLength(df:org.apache.spark.sql.DataFrame):org.apache.spark.sql.DataFrame={
    // Number of bins used to discretize the name length
    val numBuckets = 5
    // Number of characters of each name
    val withLength = df.withColumn("namelength", length(col("name")))
    // Built-in QuantileDiscretizer configured to split the lengths into the bins
    val lengthBinner = new QuantileDiscretizer()
      .setInputCol("namelength")
      .setOutputCol("namelengthBinned")
      .setNumBuckets(numBuckets)
    // Fit the discretizer to the data, then assign every row to a bin
    val binned = lengthBinner.fit(withLength).transform(withLength)
    // Any NULL bin is replaced with -1 to show that the name was empty
    val flagged = binned.na.fill(-1, Array("namelengthBinned"))
    // The raw length is no longer required once it has been binned
    flagged.drop("namelength")
}

// Drops rows with invalid dates.
// Kickstarter started in 2009, so a row is kept only when both its "launched" and its
// "deadline" value are later than the start of 2009.
def dropBadDates(df:org.apache.spark.sql.DataFrame):org.apache.spark.sql.DataFrame = {
    // Earliest plausible timestamp; values must be strictly greater than this
    val cutoff = "2009-01-01 00:00:00"
    val launchedOk = col("launched").gt(cutoff)
    val deadlineOk = col("deadline").gt(cutoff)
    df.where(launchedOk.and(deadlineOk))
}

// Formats the dates into a consistent format.
// Also extracts the month and year of each date and the project duration as features.
def formatDates(df:org.apache.spark.sql.DataFrame):org.apache.spark.sql.DataFrame = {
    // Columns are added in exactly this order: the later expressions read the
    // "launched"/"deadline" values produced by the first two entries.
    val derivedColumns = Seq(
        // The time of day a project is uploaded is not relevant to the model, so only the date is kept
        "launched" -> date_format(col("launched"), "yyyy-MM-dd"),
        "deadline" -> date_format(col("deadline"), "yyyy-MM-dd"),
        // Month of each date as an additional feature
        "launched_month" -> date_format(col("launched"), "MM"),
        "deadline_month" -> date_format(col("deadline"), "MM"),
        // Year of each date as an additional feature
        "launched_year" -> date_format(col("launched"), "yyyy"),
        "deadline_year" -> date_format(col("deadline"), "yyyy"),
        // Number of days the project was open for
        "Duration(Days)" -> datediff(col("deadline"), col("launched"))
    )
    derivedColumns.foldLeft(df) { case (acc, (name, expr)) => acc.withColumn(name, expr) }
}

// Adds the feature "stateBinary", which converts the project state column to a binary outcome.
// Rows whose state is "live" or "undefined" are dropped because their outcome is not known yet.
// Rows with a NULL state are dropped too, since the comparisons then evaluate to NULL.
// A 1 is assigned for "successful" and a 0 for every other remaining state.
def binaryState(df:org.apache.spark.sql.DataFrame):org.apache.spark.sql.DataFrame={
    // `=!=` is the supported inequality operator on Column; `!==` is deprecated because
    // Scala parses it with assignment-operator precedence.
    df.filter((col("state") =!= "live") && (col("state") =!= "undefined"))
      .withColumn("stateBinary", (col("state") === "successful").cast(IntegerType))
}

// Discretizes every DoubleType column of the DataFrame into quantile bins.
// Each binned column is added next to its source column as "<column>_discrete".
def discretizeContinousVar(df:org.apache.spark.sql.DataFrame):org.apache.spark.sql.DataFrame = {
    // The below code is modified from an example posted by user jarias
    // on stackoverflow:
    // https://stackoverflow.com/questions/43639252/how-to-use-spark-quantilediscretizer-on-multiple-columns

    // Number of discrete bins per column
    val numBuckets = 10

    // Columns typed as DoubleType are treated as continuous
    val doubleCols = df.dtypes.collect { case (name, "DoubleType") => name }
    // Name of the binned output column for each continuous column
    val binnedCols = doubleCols.map(name => s"${name}_discrete")

    // One QuantileDiscretizer handles all continuous columns at once
    val discretizer = new QuantileDiscretizer()
      .setInputCols(doubleCols)
      .setOutputCols(binnedCols)
      .setNumBuckets(numBuckets)

    // Fit a single-stage pipeline on the data and apply it to the same DataFrame
    val fitted = new Pipeline().setStages(Array(discretizer)).fit(df)
    fitted.transform(df)
}

// Creates dummy (0/1) variables for selected categorical columns.
// For every unique value of "country", "currency" and "main_category" a column named
// "<column>_<value>" is added that holds 1 where the row has that value and 0 otherwise.
def dummyVariables(df:org.apache.spark.sql.DataFrame):org.apache.spark.sql.DataFrame={
    // Categorical columns to create dummy variables from.
    // One categorical feature was deliberately left out because it has too many distinct values
    // and is already encapsulated by main_category (presumably the finer-grained category
    // column; confirm against the dataset).
    val categoricalCols = Array("country","currency","main_category")
    categoricalCols.foldLeft(df) { (acc, name) =>
        // distinct() de-duplicates before collect(), so only the unique values are brought
        // to the driver rather than one entry per row.
        val levels = acc.select(name).distinct().collect().map(_(0)).toSet
        // NOTE(review): a NULL value in the column yields a "<column>_null" dummy whose
        // comparison is always NULL; filter NULLs upstream if that column is unwanted.
        levels.foldLeft(acc) { (withDummies, level) =>
            withDummies.withColumn(name + "_" + level, (col(name) === level).cast(IntegerType))
        }
    }
}

// Keeps only the rows whose currency and country are both valid for Kickstarter;
// every other row is dropped.
def checkCurrencyCountry(df:org.apache.spark.sql.DataFrame):org.apache.spark.sql.DataFrame={
    // Valid Kickstarter currencies according to:
    // https://www.kickstarter.com/blog/new-view-kickstarter-in-your-currency
    val acceptedCurrencies = List(
        "AUD","GBP","CAD","DKK","EUR","HKD",
        "JPY","MXN","NZD","NOK","SGD","SEK",
        "CHF","USD")
    // Valid country codes for Kickstarter based on:
    // https://help.kickstarter.com/hc/en-us/articles/115005128594-Who-can-use-Kickstarter-
    // and
    // https://www.realifewebdesigns.com/web-marketing/abbreviations-countries.asp
    val acceptedCountries = List(
        "NL","MX","AT","HK","AU","CA","GB",
        "DE","ES","US","FR","CH","SG","IT",
        "SE","JP","NZ","IE","BE","NO","LU",
        "DK")

    val hasValidCurrency = col("currency").isin(acceptedCurrencies:_*)
    val hasValidCountry = col("country").isin(acceptedCountries:_*)
    // A row survives only when both checks pass
    df.where(hasValidCurrency && hasValidCountry)
}

// This function applys a series of functions to a dataframe and returns the transformed dataframe
def pipeline(df:org.apache.spark.sql.DataFrame, fns:org.apache.spark.sql.DataFrame=>org.apache.spark.sql.DataFrame*) = {
    var tmpdf = df
    for (fn<-fns) {
        tmpdf = fn(tmpdf)
    }
    tmpdf
}

discretizeNameLength: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
dropBadDates: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
formatDates: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
binaryState: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
discretizeContinousVar: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
dummyVariables: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
checkCurrencyCountry: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
pipeline: (df: org.apache.spark.sql.DataFrame, fns: org.apache.spark.sql.DataFrame => org.apache.spark.sql.DataFrame*)org.apache.spark.sql.DataFrame


In [3]:
// Apply the pipeline function to get a transformed and cleaned DataFrame.
// The stages run in the order listed.
val cleaneddf = pipeline(
    df,
    discretizeContinousVar _,  // bin every DoubleType column
    discretizeNameLength _,    // bin the length of the project name
    dropBadDates _,            // drop rows dated before Kickstarter existed
    checkCurrencyCountry _,    // drop rows with an invalid currency or country
    formatDates _,             // normalize dates and derive month, year and duration
    binaryState _,             // derive the binary outcome column
    dummyVariables _           // add dummy variables for the categorical columns
)



cleaneddf: org.apache.spark.sql.DataFrame = [ID: int, name: string ... 76 more fields]


In [4]:
// Write the cleaned data out as CSV.
// coalesce(1) reduces the data to a single partition before writing, and "overwrite"
// replaces any output already present at the target path.
cleaneddf
    .coalesce(1)
    .write
    .mode("overwrite")
    .format("csv")
    .option("header","true")
    .option("quote", "\"")
    .option("escape","\"")
    .save("ks-projects-201801_cleaned.csv")