In [None]:
#!/bin/bash
#$ -l h_rt=4:00:00  #time needed
#$ -pe smp 20 #number of cores
#$ -l rmem=15G #amount of memory
#$ -o COM6012_Assignment2.output #This is where your output and errors are logged.
#$ -j y # normal and error outputs into a single file (the file above)
#$ -m ea #Email you when it finished or aborted
#$ -cwd # Run job from current directory
#$ -P rse-com6012
#$ -q rse-com6012.q

import os
import subprocess
def module(*args):
    """Run a ``modulecmd`` command and execute the Python code it prints.

    Accepts either separate string arguments (``module('load', 'name')``) or a
    single list (``module(['load', 'name'])``). The arguments are appended to
    ``/usr/bin/modulecmd python`` and whatever that command writes to stdout
    is passed to ``exec``.
    """
    first = args[0]
    command_args = first if isinstance(first, list) else list(args)
    process = subprocess.Popen(['/usr/bin/modulecmd', 'python'] + command_args, stdout=subprocess.PIPE)
    # Only stdout is piped, so the second element of the result is always None.
    generated_code, _ = process.communicate()
    # NOTE(review): exec() runs whatever modulecmd printed; this is only safe
    # while that binary and the module files it reads are trusted.
    exec(generated_code)
# Load the Java 8 module through the module() helper so a JVM is available to Spark.
module('load', 'apps/java/jdk1.8.0_102/binary')    
# Interpreter PySpark should use: the python binary of the 'jupyter-spark'
# conda environment under $HOME.
os.environ['PYSPARK_PYTHON'] = os.environ['HOME'] + '/.conda/envs/jupyter-spark/bin/python'


import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session with 20 worker threads for Question 1.
spark = (
    SparkSession.builder
    .master("local[20]")
    .appName("Question 1")
    .config("spark.local.dir", "/fastdata/acp18dck")
    .config("spark.executor.memory", "15g")
    .config("spark.executor.cores", "20")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

sc = spark.sparkContext

# Put the conda environment's bin directory at the front of PATH rather than
# replacing PATH outright: overwriting it removes every system directory, so
# any later lookup of a program by name from this process would fail.
# The membership check keeps the cell safe to re-run.
condaBin = os.environ['HOME'] + '/.conda/envs/jupyter-spark/bin/'
currentPath = os.environ.get('PATH', '')
if condaBin not in currentPath.split(os.pathsep):
    os.environ['PATH'] = condaBin + (os.pathsep + currentPath if currentPath else '')

### Assignment Question 1.1

In [None]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType

# Column names in file order: 'label' first, followed by the 28 measurement columns.
higgsColumnNames = [
    'label',
    'lepton_pT', 'lepton_eta', 'lepton_phi',
    'missing_energy_mag', 'missing_energy_phi',
    'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag',
    'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag',
    'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag',
    'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag',
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
]

# Every column is declared as a double.
schema = StructType([StructField(columnName, DoubleType()) for columnName in higgsColumnNames])

## Load the HIGGS dataset with the schema defined above and cache it

higgsData = spark.read.load(
    'Data/HIGGS.csv.gz',
    format = 'csv',
    sep = ',',
    schema = schema,
    header = 'false',
).cache()

## Random 25% / 75% split (seed 9); the 25% part is the working subset
higgsSubset, remainder = higgsData.randomSplit([0.25, 0.75], 9)

import time
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, Binarizer, HashingTF
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

schemaNames = higgsSubset.schema.names
ncolumns = len(higgsSubset.columns)

## Vectorise the features and split the subset into training and test sets.
## Column 0 is the label and every remaining column is a feature, so the slice
## has to run to the end of the list. A slice end of ncolumns - 1 is exclusive
## and would leave the last feature column ('m_wwbb') out of the vector.

featureColumns = schemaNames[1:]
assembler = VectorAssembler(inputCols = featureColumns, outputCol = 'features')
subsetVector = assembler.transform(higgsSubset)
higgsSubsetData = subsetVector.select('label', 'features')
# 70% training / 30% test, seed 11.
trainingData, testData = higgsSubsetData.randomSplit([0.7, 0.3], 11)

In [None]:
dtcStart = time.time()

##### Decision Tree Classifier #####

decisionTreeClassifier = DecisionTreeClassifier(labelCol = 'label', featuresCol = 'features', maxDepth = 10, impurity = 'entropy')
dtcPipe = Pipeline(stages = [decisionTreeClassifier])
dtcEvaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction', metricName = 'accuracy')
# The grid key must be a Param of the estimator; a bare number such as 32 is
# not a valid key (newer PySpark raises TypeError, older versions do not map
# it to any parameter, so nothing is actually tuned).
# NOTE(review): 32 is the default value of maxBins, so maxBins is presumably
# the parameter that was meant to be searched -- confirm (maxDepth is the
# other plausible candidate).
dtcGrid = ParamGridBuilder().addGrid(decisionTreeClassifier.maxBins, [5, 10, 20]).build()
# 5-fold cross-validation over the grid, scored by accuracy.
dtcValidator = CrossValidator(estimator = dtcPipe, estimatorParamMaps = dtcGrid, evaluator = dtcEvaluator, numFolds = 5)

dtcModel = dtcValidator.fit(trainingData)
print('List of Features and their Importance for Decision Tree Classifier')
# The pipeline has a single stage, so stages[-1] is the fitted tree model.
print(dtcModel.bestModel.stages[-1].featureImportances)

dtcEnd = time.time()
# Elapsed wall-clock seconds for building and cross-validating the model.
print(dtcEnd - dtcStart)

In [None]:
dtrStart = time.time()

##### Decision Tree Regressor #####

# The regressor writes a continuous prediction to 'predNonBinary'; the
# Binarizer then thresholds it at 0.5 into the 'prediction' column.
decisionTreeRegressor = DecisionTreeRegressor(labelCol = 'label', featuresCol = 'features', predictionCol = 'predNonBinary', maxDepth = 10)
binarizer = Binarizer(threshold = 0.5, inputCol = 'predNonBinary', outputCol = 'prediction')
dtrPipe = Pipeline(stages = [decisionTreeRegressor, binarizer])
# Scored by area under the ROC curve computed on the thresholded prediction.
dtrEvaluator = BinaryClassificationEvaluator(labelCol = 'label', rawPredictionCol = 'prediction', metricName = 'areaUnderROC')
# The grid key must be a Param of the estimator; a bare number such as 32 is
# not a valid key (newer PySpark raises TypeError, older versions do not map
# it to any parameter, so nothing is actually tuned).
# NOTE(review): 32 is the default value of maxBins, so maxBins is presumably
# the parameter that was meant to be searched -- confirm (maxDepth is the
# other plausible candidate).
dtrGrid = ParamGridBuilder().addGrid(decisionTreeRegressor.maxBins, [5, 10, 20]).build()
dtrValidator = CrossValidator(estimator = dtrPipe, estimatorParamMaps = dtrGrid, evaluator = dtrEvaluator, numFolds = 5)

dtrModel = dtrValidator.fit(trainingData)
print('List of Features abd their Importance for Decision Tree Regessor')
# stages[0] is the fitted regression tree (stages[1] is the Binarizer).
print(dtrModel.bestModel.stages[0].featureImportances)

dtrEnd = time.time()
# Elapsed wall-clock seconds for building and cross-validating the model.
print(dtrEnd - dtrStart)

In [None]:
import numpy as np
lrStart = time.time()

##### Logistic Regression Classifier #####

# Binomial logistic regression; regParam is the value searched by the grid below.
lr = LogisticRegression(
    labelCol = 'label',
    featuresCol = 'features',
    maxIter = 10,
    regParam = 0.01,
    family = 'binomial',
)
lrPipe = Pipeline(stages = [lr])
lrEvaluator = MulticlassClassificationEvaluator(
    labelCol = 'label',
    predictionCol = 'prediction',
    metricName = 'accuracy',
)
lrGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01, 0.001]).build()
# 5-fold cross-validation over the three regularisation strengths.
lrValidator = CrossValidator(
    estimator = lrPipe,
    estimatorParamMaps = lrGrid,
    evaluator = lrEvaluator,
    numFolds = 5,
)

lrModel = lrValidator.fit(trainingData)
# Despite its name, lrBestModel holds the coefficient vector of the best model.
lrBestModel = lrModel.bestModel.stages[-1].coefficients
print('List of Features and their absolute Importance for Logistic Regression')
print(np.abs(lrBestModel))

lrEnd = time.time()
# Elapsed wall-clock seconds for building and cross-validating the model.
print(lrEnd - lrStart)

In [None]:
# Score the test split of the subset with the cross-validated decision tree
# classifier; dtcEvaluator is configured with metricName = 'accuracy'.
dtcPreds = dtcModel.transform(testData)
dtcAccuracy = dtcEvaluator.evaluate(dtcPreds) 
print('Decision Tree Classifier')
print("Accuracy = %g " % dtcAccuracy)

In [None]:
# Score the test split of the subset with the cross-validated regressor pipeline.
dtrPreds = dtrModel.transform(testData)
# dtrEvaluator is a BinaryClassificationEvaluator with metricName =
# 'areaUnderROC', so the value below is an AUC, not an accuracy; the printed
# label says so (the variable keeps its historical name).
dtrAccuracy = dtrEvaluator.evaluate(dtrPreds)
print('Decision Tree Regressor')
print("Area under ROC = %g " % dtrAccuracy)

In [None]:
# Score the test split of the subset with the cross-validated logistic
# regression; lrEvaluator is configured with metricName = 'accuracy'.
lrPreds = lrModel.transform(testData)
lrAccuracy = lrEvaluator.evaluate(lrPreds)
print('Logistic Regression')
print("Accuracy = %g" % lrAccuracy)

### Assignment Question 1.2

In [None]:
# Vectorise the full dataset once. After the first run higgsData only holds
# 'label' and 'features', so running the assembler on it again would fail
# (its input columns are gone); the guard makes this cell safe to re-run.
if 'features' not in higgsData.columns:
    fullDatasetVector = assembler.transform(higgsData)
    higgsData = fullDatasetVector.select('label', 'features')
# 70% training / 30% test, seed 11.
fullTrainingData, fullTestData = higgsData.randomSplit([0.7, 0.3], 11)

In [None]:
##### Decision Tree Classifier ##### 
# Re-fit the decision tree classifier cross-validator on the full training
# split and time the fit.
dtcFullstart = time.time()
dtcModelFull = dtcValidator.fit(fullTrainingData)
print('List of Features and their Importance for Decision Tree Classifier for full Dataset')
# stages[-1] is the last stage of the best pipeline model.
print(dtcModelFull.bestModel.stages[-1].featureImportances)
dtcFullEnd = time.time()

# Elapsed wall-clock seconds.
print(dtcFullEnd - dtcFullstart)

In [None]:
##### Decision Tree Regressor ##### 
# Re-fit the decision tree regressor cross-validator on the full training
# split and time the fit.
dtrFullStart = time.time()
dtrModelFull = dtrValidator.fit(fullTrainingData)
print('List of Features and their Importance for Decision Tree Regessor for full Dataset')
# stages[0] is the first stage of the best pipeline model.
print(dtrModelFull.bestModel.stages[0].featureImportances)
dtrFullEnd = time.time()

# Elapsed wall-clock seconds.
print(dtrFullEnd - dtrFullStart)

In [None]:
##### Logistic Regression Classifier #####
# Re-fit the logistic regression cross-validator on the full training split
# and time the fit.
lrFullStart = time.time()
lrModelFull = lrValidator.fit(fullTrainingData)
print('List of Features and their absolute Importance for Logistic Regression')
# Coefficient vector of the last stage of the best pipeline model.
lrBestModelFull = lrModelFull.bestModel.stages[-1].coefficients
print(np.abs(lrBestModelFull))
lrFullEnd = time.time()

# Elapsed wall-clock seconds.
print(lrFullEnd - lrFullStart)

In [None]:
# Score the full-data test split with the classifier fitted on the full
# training split; dtcEvaluator is configured with metricName = 'accuracy'.
dtcFullPreds = dtcModelFull.transform(fullTestData)
dtcAccuracyFull = dtcEvaluator.evaluate(dtcFullPreds)
print('Decision Tree Classifier')
print("Accuracy = %g " % dtcAccuracyFull)

In [None]:
# Score the full-data test split with the regressor pipeline fitted on the
# full training split.
dtrFullPreds = dtrModelFull.transform(fullTestData)
# dtrEvaluator is a BinaryClassificationEvaluator with metricName =
# 'areaUnderROC', so the value below is an AUC, not an accuracy; the printed
# label says so (the variable keeps its historical name).
dtrAccuracyFull = dtrEvaluator.evaluate(dtrFullPreds)
print('Decision Tree Regressor')
print("Area under ROC = %g " % dtrAccuracyFull)

In [None]:
# Score the full-data test split with the logistic regression fitted on the
# full training split; lrEvaluator is configured with metricName = 'accuracy'.
lrFullPreds = lrModelFull.transform(fullTestData)
lrAccuracyFull = lrEvaluator.evaluate(lrFullPreds)
print('Logistic Regression')
print("Accuracy = %g" % lrAccuracyFull)
# Shut the Question 1 Spark session down; a new one is built for Question 2.
spark.stop()

### Assignment Question 2.1a

In [None]:
#!/bin/bash
#$ -l h_rt=4:00:00  #time needed
#$ -pe smp 20 #number of cores
#$ -l rmem=15G #amount of memory
#$ -o COM6012_Assignment2.output #This is where your output and errors are logged.
#$ -j y # normal and error outputs into a single file (the file above)
#$ -M  #Notify you by email, remove this line if you don't like
#$ -m ea #Email you when it finished or aborted
#$ -cwd # Run job from current directory
#$ -P rse-com6012
#$ -q rse-com6012.q

import os
import subprocess
def module(*args):
    """Run a ``modulecmd`` command and execute the Python code it prints.

    Accepts either separate string arguments (``module('load', 'name')``) or a
    single list (``module(['load', 'name'])``). The arguments are appended to
    ``/usr/bin/modulecmd python`` and whatever that command writes to stdout
    is passed to ``exec``.
    """
    first = args[0]
    command_args = first if isinstance(first, list) else list(args)
    process = subprocess.Popen(['/usr/bin/modulecmd', 'python'] + command_args, stdout=subprocess.PIPE)
    # Only stdout is piped, so the second element of the result is always None.
    generated_code, _ = process.communicate()
    # NOTE(review): exec() runs whatever modulecmd printed; this is only safe
    # while that binary and the module files it reads are trusted.
    exec(generated_code)
# Load the Java 8 module through the module() helper so a JVM is available to Spark.
module('load', 'apps/java/jdk1.8.0_102/binary')    
# Interpreter PySpark should use: the python binary of the 'jupyter-spark'
# conda environment under $HOME.
os.environ['PYSPARK_PYTHON'] = os.environ['HOME'] + '/.conda/envs/jupyter-spark/bin/python'


import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session with 20 worker threads for Question 2.
spark = (
    SparkSession.builder
    .master("local[20]")
    .appName("Question 2")
    .config("spark.local.dir", "/fastdata/acp18dck")
    .config("spark.executor.memory", "15g")
    .config("spark.executor.cores", "20")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

sc = spark.sparkContext

# Put the conda environment's bin directory at the front of PATH rather than
# replacing PATH outright: overwriting it removes every system directory, so
# any later lookup of a program by name from this process would fail.
# The membership check keeps the cell safe to re-run.
condaBin = os.environ['HOME'] + '/.conda/envs/jupyter-spark/bin/'
currentPath = os.environ.get('PATH', '')
if condaBin not in currentPath.split(os.pathsep):
    os.environ['PATH'] = condaBin + (os.pathsep + currentPath if currentPath else '')

In [None]:
# Read the training CSV (header row present, column types inferred) and cache it.
data = spark.read.load(
    'Data/train_set.csv',
    format = 'csv',
    sep = ',',
    inferSchema = 'true',
    header = 'true',
).cache()

In [None]:
import numpy as np
import time

schemaNames = data.schema.names
dataCount = data.count()

# One inner list per record, one slot per column; filled column by column below.
matrix = [[None for _ in range(len(schemaNames))] for _ in range(dataCount)]

## Forward fill: where a value is missing (marked '?'), reuse the most recent
## non-missing value seen earlier in the same column.
## NOTE(review): each column is collected by a separate Spark job, so this
## relies on every job returning the rows in the same order.
for x, column in enumerate(schemaNames):
    lastValue = None
    for rowCounter, row in enumerate(data.select(column).collect()):
        value = row[column]
        # Compare by value. `is '?'` tests object identity, which only matches
        # when the interpreter happens to reuse one object for equal strings.
        if value == '?':
            matrix[rowCounter][x] = lastValue
        else:
            lastValue = value
            matrix[rowCounter][x] = value

# Drop the first record.
# NOTE(review): presumably because it has no earlier row to fill from --
# confirm this is intended; the record is discarded even if it has no '?'.
matrix = matrix[1:]

### Assignment Question 2.1b

In [None]:
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, Binarizer

## Rebuild a Spark DataFrame (via pandas) from the matrix whose missing values were filled

dataframe = spark.createDataFrame(pd.DataFrame(matrix, columns = schemaNames))

## Positions within schemaNames of the categorical columns and of all the others
catIndexes = list(range(5, 20)) + [29]
others = list(range(0, 5)) + list(range(20, 29)) + list(range(30, 35))
categoricalIndexes = np.array(schemaNames)[catIndexes]
otherIndexes = np.array(schemaNames)[others]
outputsIndexer = [str(name + '_ind') for name in categoricalIndexes]
outputsEncoder = [str(name + '_enc') for name in categoricalIndexes]

## Encode all categorical variables with a one-hot encoder; a StringIndexer is
## needed first to turn the string categories into numeric indices

indexers = [
    StringIndexer(
        inputCol = column,
        outputCol = column + '_ind',
        handleInvalid =  'skip',
    ).fit(dataframe)
    for column in categoricalIndexes
]
encoder = OneHotEncoderEstimator(inputCols = outputsIndexer, outputCols = outputsEncoder)
# The encoder runs as the last stage, after every fitted indexer.
indexers.append(encoder)
encPipe = Pipeline(stages = indexers)
encodedData = encPipe.fit(dataframe).transform(dataframe)

# Keep the non-categorical columns plus the one-hot encoded vectors.
requiredColumns = np.concatenate([otherIndexes, np.array(outputsEncoder)]).tolist()

encodedDataFrame = encodedData.select(requiredColumns)

### Assignment Question 2.1c

In [None]:
from pyspark.sql import functions as func

## Add a 'weights' column to counter the heavily unbalanced data: rows with a
## claim are rare, so they are given the larger weight.
## weightRatio is the share of rows whose Claim_Amount is non-zero.

hasClaimCount = encodedDataFrame.filter(encodedDataFrame['Claim_Amount'] != 0.0).count()
weightRatio = hasClaimCount / encodedDataFrame.count()

# Rows without a claim get weightRatio, all other rows get 1 - weightRatio.
encodedDataFrame = encodedDataFrame.withColumn(
    'weights',
    func.when(encodedDataFrame['Claim_Amount'] == 0.0, weightRatio)
        .otherwise(1.0 - weightRatio),
)

### Assignment Question 2.2a

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import DoubleType

# The assembler below needs numeric inputs, so OrdCat is cast to double.
typeChangeDF = encodedDataFrame.withColumn('OrdCat', encodedDataFrame['OrdCat'].cast(DoubleType()))
## Need to re-run above two cells before this one
## Feature columns: every selected column except the target and the two
## identifier columns. 'weights' is deliberately NOT a feature: it is computed
## from whether Claim_Amount is zero, so feeding it to a model would leak the
## target. It stays in the DataFrame so it can still be used as a weightCol.
excludedColumns = ['Claim_Amount', 'Row_ID', 'Household_ID']
inputColumns = [column for column in requiredColumns if column not in excludedColumns]
assembler = VectorAssembler(inputCols = inputColumns, outputCol = 'features')
transformedDataFrame = assembler.transform(typeChangeDF)
# label = 1.0 when Claim_Amount is greater than 0, else 0.0. The threshold is
# expressed in claim-amount units; weightRatio is a proportion of rows and is
# unrelated to the size of a claim, so it is not used here.
binariserP2 = Binarizer(threshold = 0.0, inputCol = 'Claim_Amount', outputCol = 'label')
binarizedDF = binariserP2.transform(transformedDataFrame)

# 75% training / 25% test, seed 9.
trainingData, testData = binarizedDF.randomSplit([0.75, 0.25], 9)

In [None]:
from pyspark.ml.regression import GeneralizedLinearRegression
lmFullStart = time.time()

# Poisson regression with a log link on the raw claim amount, using the
# per-row 'weights' column as instance weights.
lm = GeneralizedLinearRegression(
    featuresCol = 'features',
    labelCol = 'Claim_Amount',
    weightCol = 'weights',
    maxIter = 10,
    regParam = 0.01,
    family = 'poisson',
    link = 'log',
)
lmmodel = lm.fit(trainingData)
lmPreds = lmmodel.transform(testData)

lmFullStop = time.time()
# Elapsed wall-clock seconds for defining the model, fit() and transform().
print(lmFullStop - lmFullStart)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-squared error of the predicted against the actual claim amount.
lmEvaluator = RegressionEvaluator(
    labelCol = 'Claim_Amount',
    predictionCol = 'prediction',
    metricName = 'rmse',
)
lmRmse = lmEvaluator.evaluate(lmPreds)
print("RMSE = %g " % lmRmse)

### Assignment Question 2.2b

In [None]:
from pyspark.ml.classification import LogisticRegression

# Classifier for "claimed or not"; it reads the 'features' and 'label'
# columns, which are the estimator's default column names.
lr = LogisticRegression(maxIter=100, regParam=0.9)
pipelineLR = Pipeline(stages=[lr])
# The pipeline variable is pipelineLR; the misspelling `pipelinelR` raised NameError.
modelLR = pipelineLR.fit(trainingData)
print('Logistic those who claimed:')
print('')
# Show the first 100 scored test rows.
modelLR.transform(testData).show(100)

In [None]:
from pyspark.ml.regression import GeneralizedLinearRegression

# The gamma family only accepts strictly positive responses. With the default
# labelCol ('label', the 0/1 claim flag) the fit is rejected because of the
# zeros, so the model is fitted on the claim amount of the rows that actually
# have a claim.
claimsTrainingData = trainingData.where(trainingData['Claim_Amount'] > 0)
glr = GeneralizedLinearRegression(featuresCol="features", labelCol="Claim_Amount", family="gamma", link="identity", maxIter=100, regParam=0.001)
modelGLR = glr.fit(claimsTrainingData)