In [1]:





from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

In [2]:
# Create the Spark session used by every later cell (getOrCreate reuses one
# if it already exists).
spark = (
    SparkSession.builder
    .appName('BigData')
    .getOrCreate()
)

In [3]:
# Echo the session object so the notebook renders it as the cell output.
spark

# <center> Resumiendo y normalizando con StandardScaler

## Importando Datos con PySpark

In [4]:
# Load the Telco churn CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    path='telco_customer_churn.csv',
    inferSchema=True,
    header=True,
)

In [5]:
# Split the column names by Spark dtype string: 'string' columns are treated
# as categorical, 'int' / 'double' columns as numeric. Any other dtype is
# left out of both lists.
var_cat, var_num = [], []
for nombre_columna, tipo in df.dtypes:
    if tipo == 'string':
        var_cat.append(nombre_columna)
    elif tipo in ('int', 'double'):
        var_num.append(nombre_columna)

In [6]:
# Names that must not be one-hot encoded as categorical features:
#   customerID   - row identifier
#   Churn        - the label, indexed separately into 'Y'
#   TotalCharges - read as string, but cast to double and used as numeric
# Filtering (instead of list.remove) keeps this cell idempotent: re-running it
# no longer raises ValueError once the names are already gone.
columnas_excluidas = ('customerID', 'Churn', 'TotalCharges')
var_cat[:] = [nombre for nombre in var_cat if nombre not in columnas_excluidas]

In [7]:
# Convert TotalCharges from string to double so it can be used as a numeric
# feature. The next cell counts the nulls present in the column after the cast.
total_charges_double = df['TotalCharges'].cast('double')
df = df.withColumn('TotalCharges', total_charges_double)

In [8]:
# How many rows have a null TotalCharges?
df.where(df['TotalCharges'].isNull()).count()

11

In [None]:
# Check for nulls: print the number of null values found in every column.
# The filter must use the loop's current column; filtering on a fixed column
# would report the same count for every name.
for nombre_columna in df.columns:
    cant = df.filter(df[nombre_columna].isNull()).count()
    print(nombre_columna, cant)

In [9]:
# Row count before the rows containing nulls are dropped (next cell).
df.count()

7043

In [9]:
# Remove every row that contains at least one null value.
df = df.dropna()

In [11]:
# Row count after the rows containing nulls were dropped.
df.count()

7032

In [10]:
# TotalCharges was a string column when var_num was built, so it is added to
# the numeric features by hand. The membership guard keeps the cell
# idempotent: re-running it must not add a duplicate, which would feed the
# column to the VectorAssembler twice.
if 'TotalCharges' not in var_num:
    var_num.append('TotalCharges')
var_num

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [11]:
# Show the categorical feature names that remain after the exclusions.
var_cat

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

### Tratamiento a las variables categóricas

In [12]:
# Build the preprocessing stages for the categorical features. Each column
# gets two consecutive stages:
#   1. StringIndexer: <col>       -> <col>_index
#   2. OneHotEncoder: <col>_index -> <col>_oneHot
lista_etapas = []
for nombre in var_cat:
    columna_indice = nombre + '_index'
    lista_etapas.extend([
        StringIndexer(inputCol=nombre, outputCol=columna_indice),
        OneHotEncoder(inputCol=columna_indice, outputCol=nombre + '_oneHot'),
    ])

### Tratamiento a la variable cat Y (Churn)

In [13]:
# Index the label: the string column 'Churn' becomes the numeric column 'Y'.
# NOTE: this appends to lista_etapas, so running the cell twice adds the stage twice.
strIdx2 = StringIndexer(outputCol='Y', inputCol='Churn')
lista_etapas += [strIdx2]

### Uniendo los vectores one-hot de las variables cat + variables num

In [14]:
# Assemble a single feature vector 'X' from the one-hot columns (in var_cat
# order) followed by the numeric columns (in var_num order).
# NOTE: this appends to lista_etapas, so running the cell twice adds the stage twice.
columnas_one_hot = [nombre + '_oneHot' for nombre in var_cat]
columnasVectores = columnas_one_hot + var_num
ensamblador = VectorAssembler(outputCol='X', inputCols=columnasVectores)
lista_etapas += [ensamblador]

### Normalizando todas las variables del vector assembler

In [15]:
from pyspark.ml.feature import StandardScaler

# Scale the assembled vector 'X' into 'X_scaled' using StandardScaler's
# default settings (no withMean / withStd overrides are passed here).
# NOTE: this appends to lista_etapas, so running the cell twice adds the stage twice.
scaler = StandardScaler(outputCol='X_scaled', inputCol='X')
lista_etapas += [scaler]

In [16]:
# Inspect the ordered list of pipeline stages assembled so far.
lista_etapas

[StringIndexer_c40d17582262,
 OneHotEncoder_77e841302c47,
 StringIndexer_260464fdc83f,
 OneHotEncoder_3274f4dfd1c1,
 StringIndexer_63b8a9cee218,
 OneHotEncoder_94191060b518,
 StringIndexer_c058ff025585,
 OneHotEncoder_48d34774c573,
 StringIndexer_226b8c88dbc2,
 OneHotEncoder_c6bcd2c7d0c0,
 StringIndexer_8f22d3867922,
 OneHotEncoder_abae2342cb48,
 StringIndexer_19717e2cbc49,
 OneHotEncoder_9cdff7c1268d,
 StringIndexer_5ac21f087cbb,
 OneHotEncoder_46b4ecb5b2b1,
 StringIndexer_b17e566c9b7f,
 OneHotEncoder_e5065dbdc5d7,
 StringIndexer_dee85308c8e0,
 OneHotEncoder_962d6b3c2fdc,
 StringIndexer_0c0c207081c6,
 OneHotEncoder_dfad41c16e83,
 StringIndexer_3e7bfb1fec97,
 OneHotEncoder_cd942f7b39b1,
 StringIndexer_7a44eff67c29,
 OneHotEncoder_cfedafb2ef61,
 StringIndexer_a698b7ec40fd,
 OneHotEncoder_4d66281f4e67,
 StringIndexer_fd1dae86f8fa,
 OneHotEncoder_74a0c01e4b96,
 StringIndexer_9fab1281f99e,
 VectorAssembler_b977f23cf6fd,
 StandardScaler_cbeeb05e6a04]

### Aplicando las etapas al dataset - Pipeline

In [17]:
# Wrap the ordered stage list in a single Pipeline estimator.
procesadorEtapas = Pipeline(stages=lista_etapas)

In [18]:
# Fit every stage (indexers, encoders, scaler) on the cleaned dataset.
# NOTE(review): the fit uses all of df, while the train/test split happens
# later on the transformed df2 -- so the scaler and indexers have also seen
# the rows that end up in the test set. Confirm this is acceptable.
modelo = procesadorEtapas.fit(df)

In [19]:
# Apply the fitted stages to df; df2 gains the derived columns
# (<col>_index, <col>_oneHot, 'Y', 'X' and 'X_scaled').
df2 = modelo.transform(df)

In [20]:
# Preview the scaled features and the label for the first five rows.
df2.select('X_scaled', 'Y').limit(5).toPandas()

Unnamed: 0,X_scaled,Y
0,"(0.0, 0.0, 2.1851748981300347, 0.0, 0.0, 0.0, ...",0.0
1,"(1.9999458781640924, 2.001082630964643, 2.1851...",0.0
2,"(1.9999458781640924, 2.001082630964643, 2.1851...",1.0
3,"(1.9999458781640924, 2.001082630964643, 2.1851...",0.0
4,"(0.0, 2.001082630964643, 2.1851748981300347, 3...",1.0


In [23]:
# Collect the whole X_scaled column to the driver and show it as a NumPy
# object array (one vector per row). NOTE: there is no limit(), so every row
# is pulled into local memory.
df2.select('X_scaled').toPandas().values

array([[SparseVector(30, {2: 2.1852, 7: 2.1056, 8: 1.9999, 11: 2.1037, 12: 2.0144, 14: 2.0, 16: 2.0416, 18: 2.045, 20: 2.0104, 22: 2.0352, 23: 2.1165, 27: 0.0407, 28: 0.9922, 29: 0.0132})],
       [SparseVector(30, {0: 1.9999, 1: 2.0011, 2: 2.1852, 3: 3.3833, 4: 2.0012, 7: 2.1056, 9: 2.2115, 10: 2.0149, 13: 2.1051, 14: 2.0, 16: 2.0416, 18: 2.045, 24: 2.383, 27: 1.3852, 28: 1.8929, 29: 0.8336})],
       [SparseVector(30, {0: 1.9999, 1: 2.0011, 2: 2.1852, 3: 3.3833, 4: 2.0012, 7: 2.1056, 9: 2.2115, 11: 2.1037, 12: 2.0144, 14: 2.0, 16: 2.0416, 18: 2.045, 20: 2.0104, 22: 2.0352, 24: 2.383, 27: 0.0815, 28: 1.7899, 29: 0.0477})],
       ...,
       [SparseVector(30, {7: 2.1056, 9: 2.2115, 10: 2.0149, 12: 2.0144, 14: 2.0, 16: 2.0416, 18: 2.045, 20: 2.0104, 22: 2.0352, 23: 2.1165, 27: 0.4482, 28: 0.9838, 29: 0.1528})],
       [SparseVector(30, {0: 1.9999, 2: 2.1852, 3: 3.3833, 5: 2.0247, 6: 2.0143, 8: 1.9999, 10: 2.0149, 12: 2.0144, 14: 2.0, 16: 2.0416, 18: 2.045, 20: 2.0104, 22: 2.0352, 24: 2

# <center> Machine Learning

# ML (Logistic Regression)

#### Train-Test-Split

In [67]:
# 80/20 random split of the transformed data; the fixed seed makes the split
# repeatable. The partition sizes are printed as a sanity check.
train, test = df2.randomSplit(weights=[0.8, 0.2], seed=10)
print("Training Dataset Count: ", train.count(), sep='')
print("Test Dataset Count: ", test.count(), sep='')

Training Dataset Count: 5628
Test Dataset Count: 1404


In [68]:
from pyspark.ml.classification import LogisticRegression

In [69]:
# Logistic regression on the scaled feature vector against the indexed label,
# fitted on the training partition only.
lr = LogisticRegression(labelCol='Y', featuresCol='X_scaled')
lrModel = lr.fit(train)

In [70]:
# Score the held-out test partition with the fitted logistic regression model.
predictions = lrModel.transform(test)

In [71]:
# First five scored rows: id, raw scores, class probabilities, true label and
# predicted label.
(
    predictions
    .select('customerID', 'rawPrediction', 'probability', 'Y', 'prediction')
    .limit(5)
    .toPandas()
)

Unnamed: 0,customerID,rawPrediction,probability,Y,prediction
0,0003-MKNFE,"[0.6846107139763482, -0.6846107139763482]","[0.6647669828639995, 0.3352330171360005]",0.0,0.0
1,0011-IGKFF,"[-0.9121565436709589, 0.9121565436709589]","[0.28655874449363083, 0.7134412555063692]",1.0,1.0
2,0013-SMEOE,"[2.8875957665894108, -2.8875957665894108]","[0.9472298339525708, 0.052770166047429234]",0.0,0.0
3,0017-DINOC,"[4.607000247694261, -4.607000247694261]","[0.990116933842407, 0.009883066157593046]",0.0,0.0
4,0019-EFAEP,"[3.011427647321358, -3.011427647321358]","[0.9530877278961264, 0.04691227210387361]",0.0,0.0


In [72]:
# Debug aid, left disabled: uncomment to print the fitted model's parameters.
#print(lrModel.explainParams())

In [73]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate the test-set predictions against label column 'Y'. No metric is
# set explicitly, so the evaluator's default is used (reported here as area
# under the ROC curve).
evaluator = BinaryClassificationEvaluator(labelCol='Y')
test_auc = evaluator.evaluate(predictions)
print('Test Area Under ROC', test_auc)

Test Area Under ROC 0.8428829717291296


## KNN - tarea

## Kmeans

In [35]:
from pyspark.ml.clustering import KMeans

In [36]:
# K-means estimator with two clusters on the scaled features; the cluster id
# of each row is written to the 'Outputs' column.
# The seed is fixed (same value as the train/test split) so the centroid
# initialisation, and therefore the clustering, is repeatable across kernel
# restarts instead of depending on the per-process default seed.
# NOTE(review): this rebinds `modelo`, which until now held the fitted
# preprocessing pipeline -- re-running the earlier `df2 = modelo.transform(df)`
# cell after this point will not use the pipeline any more.
modelo = KMeans(k=2, featuresCol='X_scaled', predictionCol='Outputs', seed=10)

In [None]:
# Number of rows that will be clustered.
df2.count()

In [37]:
# Fit K-means on the full transformed dataset.
# NOTE(review): the name is rebound from the estimator to its fitted model, so
# this cell is not re-runnable on its own -- run the KMeans(...) cell again
# first, otherwise `modelo` is no longer the estimator.
modelo = modelo.fit(df2)

In [38]:
# Assign every row to a cluster; the assignment lands in the 'Outputs' column
# (the predictionCol chosen when the estimator was created).
preds = modelo.transform(df2)

In [39]:
# Features, true label and cluster assignment side by side.
# NOTE: no limit() here -- the full result is collected to the driver and
# pandas truncates the display.
preds.select('X_scaled', 'Y', 'Outputs').toPandas()

Unnamed: 0,X_scaled,Y,Outputs
0,"(0.0, 0.0, 2.1851748981300347, 0.0, 0.0, 0.0, ...",0.0,1
1,"(1.9999458781640924, 2.001082630964643, 2.1851...",0.0,1
2,"(1.9999458781640924, 2.001082630964643, 2.1851...",1.0,1
3,"(1.9999458781640924, 2.001082630964643, 2.1851...",0.0,1
4,"(0.0, 2.001082630964643, 2.1851748981300347, 3...",1.0,1
...,...,...,...
7027,"(1.9999458781640924, 0.0, 0.0, 3.3832826290967...",0.0,0
7028,"(0.0, 0.0, 0.0, 3.3832826290967626, 0.0, 2.024...",0.0,0
7029,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.10555511...",0.0,1
7030,"(1.9999458781640924, 0.0, 2.1851748981300347, ...",1.0,1


In [40]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [41]:
# Evaluator for the clustering; it is used in the next cell to compute the
# Silhouette score from the scaled features and the 'Outputs' assignments.
# NOTE: this rebinds `evaluator`, which previously held the
# BinaryClassificationEvaluator used for the logistic regression.
evaluator = ClusteringEvaluator(featuresCol='X_scaled', predictionCol='Outputs')

In [42]:
# Compute and report the Silhouette score of the cluster assignments.
silhouette = evaluator.evaluate(preds)
print("Silhouette with squared euclidean distance = ", silhouette, sep='')

Silhouette with squared euclidean distance = 0.24929932328548837


In [43]:
# Print the centroid of each cluster, one per line, in the scaled feature space.
centers = modelo.clusterCenters()
print("Cluster Centers: ")
for centroide in centers:
    print(centroide)

Cluster Centers: 
[1.00392228 0.65833406 1.45160513 3.11604072 0.42285439 1.43695546
 1.3022817  0.74426218 0.98019737 1.1275893  0.68357414 1.39000141
 0.57600312 1.50319652 0.89811073 1.21396486 0.46281749 1.5895718
 0.4579534  1.59221335 0.63915621 0.88910412 1.43393269 0.69462567
 0.1694083  0.75211134 0.58784919 2.10415046 3.0170633  2.06304207]
[1.01241705 1.24778619 1.57866747 3.02239915 1.26745663 0.52641988
 0.65307452 0.71167763 1.00261001 0.35580697 0.9976055  0.35155099
 1.06092773 0.28536346 1.03778525 0.31582231 1.01397345 0.34031017
 1.00615997 0.34923573 1.37151543 0.3768938  1.07818021 0.72148342
 0.7540903  0.4049286  0.35727239 0.88018806 1.66802053 0.41325648]


## Ejercicios:

    1. A partir de los sparse vectors, regenerar el vector denso con todos los datos

In [61]:
# Collect the scaled feature column to the driver as a NumPy object array;
# each row is a length-1 array holding that row's vector.
listSpareVec = df2.select('X_scaled').toPandas().values

In [62]:
import numpy as np

In [63]:
# Expand every sparse vector into its dense NumPy array (zeros included).
# Each row of listSpareVec is a length-1 array, so vector[0] is the vector
# itself. toArray() is the vector's own dense conversion and replaces the
# element-by-element np.array(...) call and the manual append loop.
listOriginVec = [vector[0].toArray() for vector in listSpareVec]

In [65]:
# Peek at the first four dense vectors.
listOriginVec[:4]

[array([0.        , 0.        , 2.1851749 , 0.        , 0.        ,
        0.        , 0.        , 2.10555512, 1.99988699, 0.        ,
        0.        , 2.1036961 , 2.01441974, 0.        , 2.0000144 ,
        0.        , 2.04155728, 0.        , 2.04504059, 0.        ,
        2.01036463, 0.        , 2.03515559, 2.11647677, 0.        ,
        0.        , 0.        , 0.04074106, 0.99215668, 0.01316851]),
 array([1.99994588, 2.00108263, 2.1851749 , 3.38328263, 2.00124731,
        0.        , 0.        , 2.10555512, 0.        , 2.21150744,
        2.01491236, 0.        , 0.        , 2.10514026, 2.0000144 ,
        0.        , 2.04155728, 0.        , 2.04504059, 0.        ,
        0.        , 0.        , 0.        , 0.        , 2.38301008,
        0.        , 0.        , 1.38519618, 1.89290864, 0.83356444]),
 array([1.99994588, 2.00108263, 2.1851749 , 3.38328263, 2.00124731,
        0.        , 0.        , 2.10555512, 0.        , 2.21150744,
        0.        , 2.1036961 , 2.01441974, 

    2. Calcular accuracy en el clasificador logistic regression (sklearn.metrics.accuracy_score)

In [93]:
# Collect the true label and the predicted label in ONE Spark action so both
# arrays are guaranteed to refer to the same rows in the same order (two
# separate collects run the job twice and need not return rows in matching
# order). The double brackets keep the original (n, 1) array shapes.
pred_pdf = predictions.select('Y', 'prediction').toPandas()
Y_real = pred_pdf[['Y']].values
Y_predicha = pred_pdf[['prediction']].values

In [94]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [95]:
# Accuracy of the logistic regression on the test partition: share of rows
# whose predicted label equals the true label.
accuracy_score(Y_real,Y_predicha)

0.7955840455840456