In [14]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

In [2]:
spark = SparkSession.builder.appName('Bank').getOrCreate()

## Importación de datos

In [3]:
df = spark.read.csv('bank.csv',header=True,inferSchema=True)

In [4]:
df.limit(5).toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [5]:
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('deposit', 'string')]

In [6]:
df.describe().toPandas()

Unnamed: 0,summary,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,count,11162.0,11162,11162,11162,11162,11162.0,11162,11162,11162,11162.0,11162,11162.0,11162.0,11162.0,11162.0,11162,11162
1,mean,41.2319476796273,,,,,1528.5385235620856,,,,15.658036194230425,,371.9938183121304,2.508421429851281,51.33040673714388,0.8325568894463358,,
2,stddev,11.913369192215518,,,,,3225.413325946149,,,,8.420739541006462,,347.12838571630687,2.7220771816614824,108.75828197197715,2.292007218670508,,
3,min,18.0,admin.,divorced,primary,no,-6847.0,no,no,cellular,1.0,apr,2.0,1.0,-1.0,0.0,failure,no
4,max,95.0,unknown,single,unknown,yes,81204.0,yes,yes,unknown,31.0,sep,3881.0,63.0,854.0,58.0,unknown,yes


In [7]:
df.groupBy("deposit").count().toPandas()            # Balanceo de data

Unnamed: 0,deposit,count
0,no,5873
1,yes,5289


In [8]:
# Verificar Nulls
for col in df.columns:
    cant = df.filter(df[col].isNull()).count()
    print(col,cant)

age 0
job 0
marital 0
education 0
default 0
balance 0
housing 0
loan 0
contact 0
day 0
month 0
duration 0
campaign 0
pdays 0
previous 0
poutcome 0
deposit 0


## Feature Extraction - PySpark

In [9]:
var_cat = [nC for nC,dt in df.dtypes if dt =='string']
var_num = [nC for nC,dt in df.dtypes if dt in ['int','double']]

In [10]:
df.select(var_cat).toPandas()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,deposit
0,admin.,married,secondary,no,yes,no,unknown,may,unknown,yes
1,admin.,married,secondary,no,no,no,unknown,may,unknown,yes
2,technician,married,secondary,no,yes,no,unknown,may,unknown,yes
3,services,married,secondary,no,yes,no,unknown,may,unknown,yes
4,admin.,married,tertiary,no,no,no,unknown,may,unknown,yes
...,...,...,...,...,...,...,...,...,...,...
11157,blue-collar,single,primary,no,yes,no,cellular,apr,unknown,no
11158,services,married,secondary,no,no,no,unknown,jun,unknown,no
11159,technician,single,secondary,no,no,no,cellular,aug,unknown,no
11160,technician,married,secondary,no,no,yes,cellular,may,failure,no


In [11]:
df.select(var_num).toPandas()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,59,2343,5,1042,1,-1,0
1,56,45,5,1467,1,-1,0
2,41,1270,5,1389,1,-1,0
3,55,2476,5,579,1,-1,0
4,54,184,5,673,2,-1,0
...,...,...,...,...,...,...,...
11157,33,1,20,257,1,-1,0
11158,39,733,16,83,4,-1,0
11159,32,29,19,156,2,-1,0
11160,43,0,8,9,2,172,5


In [13]:
var_cat.remove('deposit')

## Tratamiento a las variables categóricas

In [15]:
lista_etapas = []

for cat in var_cat:
    
    strIdx = StringIndexer(inputCol=cat, outputCol=cat+'_index')
    encoder = OneHotEncoder(inputCol=cat+'_index',outputCol=cat+'_oneHot')
    lista_etapas += [strIdx,encoder]

## Tratamiento a la variable cat Y (deposit)

In [16]:
strIdx2 = StringIndexer(inputCol='deposit',outputCol='Y')
lista_etapas.append(strIdx2)

## Uniendo los vectores one-hot de las variables cat + variables num

In [17]:
columnasVectores = [c+'_oneHot' for c in var_cat] + var_num

ensamblador = VectorAssembler(inputCols=columnasVectores,outputCol='X')

lista_etapas.append(ensamblador)

## Normalizando todas las variables del vector assembler

In [30]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='X',outputCol='X_scaled')
lista_etapas.append(scaler)

In [31]:
lista_etapas

[StringIndexer_400caeb9a825,
 OneHotEncoder_5d481e71524a,
 StringIndexer_380b2a3ff179,
 OneHotEncoder_847dd34c2f6d,
 StringIndexer_5cf5c7b823f4,
 OneHotEncoder_3d5091cf65e2,
 StringIndexer_b1cdbd306352,
 OneHotEncoder_6087c94093f5,
 StringIndexer_c23d36ecac60,
 OneHotEncoder_77bfc054c712,
 StringIndexer_c63d925211da,
 OneHotEncoder_fd2e9b7d232a,
 StringIndexer_7a8432fd94ce,
 OneHotEncoder_91fd94e3f5fe,
 StringIndexer_02fd00739bba,
 OneHotEncoder_260237e119b2,
 StringIndexer_c2350cb399cc,
 OneHotEncoder_c3e5ffca1640,
 StringIndexer_af69eccd8209,
 VectorAssembler_a90a4d70871b,
 StandardScaler_498ce705a718]

## Aplicando las etapas al dataset - Pipeline

In [32]:
procesadorEtapas = Pipeline(stages=lista_etapas)

In [33]:
modelo = procesadorEtapas.fit(df)

In [34]:
df2 = modelo.transform(df)

In [36]:
df2.select(['X','X_scaled','Y']).limit(5).toPandas()

Unnamed: 0,X,X_scaled,Y
0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 3.082563803439365, 0.0, 0.0, 0...",1.0
1,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 3.082563803439365, 0.0, 0.0, 0...",1.0
2,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.7050733208556466, 0.0, 0.0, 0.0, ...",1.0
3,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 3.630721534877045, 0.0, 0...",1.0
4,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 3.082563803439365, 0.0, 0.0, 0...",1.0


In [23]:
df2.select(['X_scaled']).toPandas().values

array([[SparseVector(30, {2: 2.1852, 7: 2.1056, 8: 1.9999, 11: 2.1037, 12: 2.0144, 14: 2.0, 16: 2.0416, 18: 2.045, 20: 2.0104, 22: 2.0352, 23: 2.1165, 27: 0.0407, 28: 0.9922, 29: 0.0132})],
       [SparseVector(30, {0: 1.9999, 1: 2.0011, 2: 2.1852, 3: 3.3833, 4: 2.0012, 7: 2.1056, 9: 2.2115, 10: 2.0149, 13: 2.1051, 14: 2.0, 16: 2.0416, 18: 2.045, 24: 2.383, 27: 1.3852, 28: 1.8929, 29: 0.8336})],
       [SparseVector(30, {0: 1.9999, 1: 2.0011, 2: 2.1852, 3: 3.3833, 4: 2.0012, 7: 2.1056, 9: 2.2115, 11: 2.1037, 12: 2.0144, 14: 2.0, 16: 2.0416, 18: 2.045, 20: 2.0104, 22: 2.0352, 24: 2.383, 27: 0.0815, 28: 1.7899, 29: 0.0477})],
       ...,
       [SparseVector(30, {7: 2.1056, 9: 2.2115, 10: 2.0149, 12: 2.0144, 14: 2.0, 16: 2.0416, 18: 2.045, 20: 2.0104, 22: 2.0352, 23: 2.1165, 27: 0.4482, 28: 0.9838, 29: 0.1528})],
       [SparseVector(30, {0: 1.9999, 2: 2.1852, 3: 3.3833, 5: 2.0247, 6: 2.0143, 8: 1.9999, 10: 2.0149, 12: 2.0144, 14: 2.0, 16: 2.0416, 18: 2.045, 20: 2.0104, 22: 2.0352, 24: 2

# <center> Machine Learning

# ML (Logistic Regression)

#### Train-Test-Split

In [37]:
train, test = df2.randomSplit([0.8, 0.2], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 8911
Test Dataset Count: 2251


In [24]:
from pyspark.ml.classification import LogisticRegression

In [38]:
lr = LogisticRegression(featuresCol = 'X_scaled', labelCol = 'Y')
lrModel = lr.fit(train)

In [39]:
predictions = lrModel.transform(test)

In [43]:
predictions.select(['X_scaled','rawPrediction','probability','Y','prediction']).limit(5).toPandas()

Unnamed: 0,X_scaled,rawPrediction,probability,Y,prediction
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.66003642...","[-2.221610736165099, 2.221610736165099]","[0.09782655421160143, 0.9021734457883985]",1.0,1.0
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.66003642...","[0.39299943930154685, -0.39299943930154685]","[0.5970045442397476, 0.4029954557602524]",1.0,0.0
2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.66003642...","[0.11816605608443831, -0.11816605608443831]","[0.5295071874042133, 0.4704928125957867]",1.0,0.0
3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.66003642...","[-2.657320835838746, 2.657320835838746]","[0.06553922484671562, 0.9344607751532844]",1.0,1.0
4,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.66003642...","[-2.1183873400961253, 2.1183873400961253]","[0.10732247252620353, 0.8926775274737965]",1.0,1.0


In [41]:
#print(lrModel.explainParams())

In [44]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.900453889494815
