### Nombre: Eder Vilcacure
### Examen Final BDA - Bloque 2 - Pregunta2

Una empresa del sector bancario se encuentra realizando un estudio a sus datos para dirigir correctamente sus nuevas campañas de marketing. Para una muestra de clientes la empresa conoce distintos datos/variables y se desea predecir si un cliente se suscribirá a un depósito específico. (dataset en adjunto en la plataforma).

Usando las herramientas de pyspark se desea procesar toda la data y construir por lo menos 2 clasificadores distintos. Su código debe contener todas las etapas vistas en clase: Lectura del dataset, Feature extraction, Pipeline, Entrenamiento de los modelos y sus respectivas medidas de AUC.

In [13]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [2]:
spark = SparkSession.builder.appName('PySpark - Machine Learning').getOrCreate()
spark

### Lectura del dataset

In [3]:
file = "bank.csv"

In [4]:
df = spark.read.csv(file,sep=",",header=True,inferSchema=True)

In [5]:
type(df)

pyspark.sql.dataframe.DataFrame

In [24]:
cols_ini = df.columns
cols_ini

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'deposit',
 'jobIndex',
 'jobclassVec',
 'maritalIndex',
 'maritalclassVec',
 'educationIndex',
 'educationclassVec',
 'defaultIndex',
 'defaultclassVec',
 'housingIndex',
 'housingclassVec',
 'loanIndex',
 'loanclassVec',
 'contactIndex',
 'contactclassVec',
 'monthIndex',
 'monthclassVec',
 'poutcomeIndex',
 'poutcomeclassVec',
 'label',
 'features']

### Feature Extraction

In [10]:
df.limit(5).toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [7]:
var_categ = [c for c,i in df.dtypes if i in ['string','bool']]

In [8]:
df.select(var_categ).describe().toPandas()

Unnamed: 0,summary,job,marital,education,default,housing,loan,contact,month,poutcome,deposit
0,count,11162,11162,11162,11162,11162,11162,11162,11162,11162,11162
1,mean,,,,,,,,,,
2,stddev,,,,,,,,,,
3,min,admin.,divorced,primary,no,no,no,cellular,apr,failure,no
4,max,unknown,single,unknown,yes,yes,yes,unknown,sep,unknown,yes


In [11]:
var_numeric = [c for c,i in df.dtypes if i in ['int','double']]

In [12]:
df.select(var_numeric).describe().toPandas()

Unnamed: 0,summary,age,balance,day,duration,campaign,pdays,previous
0,count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
1,mean,41.2319476796273,1528.5385235620856,15.658036194230425,371.9938183121304,2.508421429851281,51.33040673714388,0.8325568894463358
2,stddev,11.913369192215518,3225.413325946149,8.420739541006462,347.12838571630687,2.7220771816614824,108.75828197197715,2.292007218670508
3,min,18.0,-6847.0,1.0,2.0,1.0,-1.0,0.0
4,max,95.0,81204.0,31.0,3881.0,63.0,854.0,58.0


In [15]:
#Para variables categóricas
etapas = []

var_categ.remove('deposit')
for categoricalCol in var_categ:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    etapas += [stringIndexer, encoder]

In [16]:
label_stringIdx = StringIndexer(inputCol = 'deposit', outputCol = 'label')
etapas += [label_stringIdx]

In [17]:
etapas[-1].getOutputCol()

'label'

In [18]:
assemblerInputs = [c + "classVec" for c in var_categ] + var_numeric
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
etapas += [assembler]

### Pipeline

In [19]:
from pyspark.ml import Pipeline

In [21]:
pipeline = Pipeline(stages = etapas)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)

In [22]:
df.limit(5).toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,loanIndex,loanclassVec,contactIndex,contactclassVec,monthIndex,monthclassVec,poutcomeIndex,poutcomeclassVec,label,features
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,...,0.0,(1.0),1.0,"(0.0, 1.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0)",1.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,56,admin.,married,secondary,no,45,no,no,unknown,5,...,0.0,(1.0),1.0,"(0.0, 1.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0)",1.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,...,0.0,(1.0),1.0,"(0.0, 1.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0)",1.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,55,services,married,secondary,no,2476,yes,no,unknown,5,...,0.0,(1.0),1.0,"(0.0, 1.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0)",1.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,...,0.0,(1.0),1.0,"(0.0, 1.0)",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0)",1.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [25]:
selectedCols = ['label', 'features'] +cols_ini
df = df.select(selectedCols)
df.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)
 |-- jobIndex: double (nullable = false)
 |-- jobclassVec: vector (nullable = true)
 |-- maritalIndex: double (nullable = false)
 |-- maritalclassVec: vector (nullable = true)
 |-- educationIndex: double (nullable = false)
 |-- educationclassVec: vector (nulla

In [27]:
df.limit(5).toPandas().transpose()

Unnamed: 0,0,1,2,3,4
label,1,1,1,1,1
features,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
age,59,56,41,55,54
job,admin.,admin.,technician,services,admin.
marital,married,married,married,married,married
education,secondary,secondary,secondary,secondary,tertiary
default,no,no,no,no,no
balance,2343,45,1270,2476,184
housing,yes,no,yes,yes,no
loan,no,no,no,no,no


### Entrenamiento de los modelos

In [28]:
train, test = df.randomSplit([0.8, 0.2], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 8911
Test Dataset Count: 2251


In [31]:
from pyspark.ml.classification import LogisticRegression

In [33]:
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label')

In [34]:
lrModel = lr.fit(train)

AnalysisException: Reference 'label' is ambiguous, could be: label, label.

In [37]:
predictions = lrModel.transform(test)

NameError: name 'lrModel' is not defined

In [38]:
predictions.toPandas()

NameError: name 'predictions' is not defined

### MEdidas AUC

In [35]:
from sklearn.metrics import roc_auc_score

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))