In [33]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import classification_report

In [2]:
spark = SparkSession.builder.appName('Diabetes').getOrCreate()

In [3]:
spark

## 1.- Leer Datos

In [4]:
df = spark.read.csv('diabetes.csv',header=True, inferSchema=True)

In [5]:
df.dtypes          # Tipo de datos correcto

[('Pregnancies', 'int'),
 ('Glucose', 'int'),
 ('BloodPressure', 'int'),
 ('SkinThickness', 'int'),
 ('Insulin', 'int'),
 ('BMI', 'double'),
 ('DiabetesPedigreeFunction', 'double'),
 ('Age', 'int'),
 ('Outcome', 'int')]

In [6]:
df.toPandas()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## 2.- Preprocesamiento

In [7]:
var_num = [nC for nC,dt in df.dtypes if dt in ['int','double']]

In [8]:
var_num

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [9]:
var_num.remove('Outcome')

In [10]:
# Remover Nulls
df = df.na.drop()

In [11]:
df.count()

768

### Tratamiento de variables

In [12]:
lista_etapas = []
strIdx = StringIndexer(inputCol='Outcome',outputCol='Y')
lista_etapas.append(strIdx)

In [13]:
columnasVectores =  var_num

ensamblador = VectorAssembler(inputCols=columnasVectores,outputCol='X')

lista_etapas.append(ensamblador)

### Normalización

In [14]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='X',outputCol='X_scaled')
lista_etapas.append(scaler)

### Aplicando las etapas al dataset - Pipeline

In [15]:
procesadorEtapas = Pipeline(stages=lista_etapas)

In [16]:
modelo = procesadorEtapas.fit(df)

In [17]:
df2 = modelo.transform(df)

In [18]:
df2.select(['X_scaled','Y']).limit(5).toPandas()

Unnamed: 0,X_scaled,Y
0,"[1.7806383732194306, 4.628960915766174, 3.7198...",1.0
1,"[0.29677306220323846, 2.658524850271114, 3.409...",0.0
2,"[2.3741844976259077, 5.723647618818986, 3.3065...",1.0
3,"[0.29677306220323846, 2.783631902048578, 3.409...",0.0
4,"[0.0, 4.284916523378148, 2.0665632617307947, 2...",1.0


In [19]:
df2.select(['X_scaled']).toPandas().values[0][0]

DenseVector([1.7806, 4.629, 3.7198, 2.1941, 0.0, 4.2617, 1.8924, 4.2516])

## 3.- Machine Learning

### 3.1. Logistic Regression

In [20]:
from pyspark.ml.classification import LogisticRegression

### a) 80 y 20

In [22]:
train, test = df2.randomSplit([0.8, 0.2], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

lr = LogisticRegression(featuresCol = 'X_scaled', labelCol = 'Y')
lrModel = lr.fit(train)

predictions = lrModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 604
Test Dataset Count: 164


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [23]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.8902418682235199


In [31]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.83      0.94      0.88       109
         1.0       0.83      0.62      0.71        55

    accuracy                           0.83       164
   macro avg       0.83      0.78      0.79       164
weighted avg       0.83      0.83      0.82       164



### b) 70 y 30

In [34]:
train, test = df2.randomSplit([0.7, 0.3], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

lr = LogisticRegression(featuresCol = 'X_scaled', labelCol = 'Y')
lrModel = lr.fit(train)

predictions = lrModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 524
Test Dataset Count: 244


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [35]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.876638114063987


In [36]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.80      0.89      0.84       157
         1.0       0.75      0.59      0.66        87

    accuracy                           0.78       244
   macro avg       0.77      0.74      0.75       244
weighted avg       0.78      0.78      0.78       244



### 3.2. Support Vector Machine (SVM)

In [53]:
from pyspark.ml.classification import LinearSVC

### a) 80 y 20

In [54]:
train, test = df2.randomSplit([0.8, 0.2], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

svm = LinearSVC(featuresCol = 'X_scaled', labelCol = 'Y')
svmModel = svm.fit(train)

predictions = svmModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 604
Test Dataset Count: 164


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [55]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.8845704753961636


In [56]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.81      0.93      0.87       109
         1.0       0.80      0.58      0.67        55

    accuracy                           0.81       164
   macro avg       0.81      0.75      0.77       164
weighted avg       0.81      0.81      0.80       164



### b) 70 y 30

In [57]:
train, test = df2.randomSplit([0.7, 0.3], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

svm = LinearSVC(featuresCol = 'X_scaled', labelCol = 'Y')
svmModel = svm.fit(train)

predictions = svmModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 524
Test Dataset Count: 244


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [58]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.8672670034409543


In [59]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.80      0.90      0.84       157
         1.0       0.76      0.59      0.66        87

    accuracy                           0.79       244
   macro avg       0.78      0.74      0.75       244
weighted avg       0.78      0.79      0.78       244



### 3.3. Naive Bayes

In [60]:
from pyspark.ml.classification import NaiveBayes

### a) 80 y 20

In [61]:
train, test = df2.randomSplit([0.8, 0.2], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

nb = NaiveBayes(featuresCol = 'X_scaled', labelCol = 'Y')
nbModel = nb.fit(train)

predictions = nbModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 604
Test Dataset Count: 164


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [62]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.21968306922435363


In [63]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.68      0.99      0.81       109
         1.0       0.83      0.09      0.16        55

    accuracy                           0.69       164
   macro avg       0.76      0.54      0.49       164
weighted avg       0.73      0.69      0.59       164



### b) 70 y 30

In [64]:
train, test = df2.randomSplit([0.7, 0.3], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

nb = NaiveBayes(featuresCol = 'X_scaled', labelCol = 'Y')
nbModel = nb.fit(train)

predictions = nbModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 524
Test Dataset Count: 244


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [65]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.23047075188520394


In [66]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.65      0.99      0.78       157
         1.0       0.67      0.02      0.04        87

    accuracy                           0.65       244
   macro avg       0.66      0.51      0.41       244
weighted avg       0.65      0.65      0.52       244



### 3.4. Random Forest

In [37]:
from pyspark.ml.classification import RandomForestClassifier

### a) 80 y 20  -  10 árboles

In [38]:
train, test = df2.randomSplit([0.8, 0.2], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

rf = RandomForestClassifier(featuresCol = 'X_scaled', labelCol = 'Y',numTrees=10)
rfModel = rf.fit(train)

predictions = rfModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 604
Test Dataset Count: 164


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [39]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.8627189324437032


In [40]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.82      0.85      0.84       109
         1.0       0.69      0.64      0.66        55

    accuracy                           0.78       164
   macro avg       0.75      0.74      0.75       164
weighted avg       0.78      0.78      0.78       164



### b) 80 y 20  -  20 árboles

In [41]:
train, test = df2.randomSplit([0.8, 0.2], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

rf = RandomForestClassifier(featuresCol = 'X_scaled', labelCol = 'Y',numTrees=20)
rfModel = rf.fit(train)

predictions = rfModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 604
Test Dataset Count: 164


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [42]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.8830692243536281


In [43]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.84      0.86      0.85       109
         1.0       0.71      0.67      0.69        55

    accuracy                           0.80       164
   macro avg       0.78      0.77      0.77       164
weighted avg       0.80      0.80      0.80       164



### c) 70 y 30  -  10 árboles

In [47]:
train, test = df2.randomSplit([0.7, 0.3], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

rf = RandomForestClassifier(featuresCol = 'X_scaled', labelCol = 'Y',numTrees=10)
rfModel = rf.fit(train)

predictions = rfModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 524
Test Dataset Count: 244


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [48]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.8592869170510286


In [49]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.81      0.85      0.83       157
         1.0       0.70      0.63      0.66        87

    accuracy                           0.77       244
   macro avg       0.75      0.74      0.74       244
weighted avg       0.77      0.77      0.77       244



### d) 70 y 30  -  20 árboles

In [50]:
train, test = df2.randomSplit([0.7, 0.3], seed = 10)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

rf = RandomForestClassifier(featuresCol = 'X_scaled', labelCol = 'Y',numTrees=20)
rfModel = rf.fit(train)

predictions = rfModel.transform(test)

predictions.select(['Y','prediction']).limit(5).toPandas()

Training Dataset Count: 524
Test Dataset Count: 244


Unnamed: 0,Y,prediction
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [51]:
evaluator = BinaryClassificationEvaluator(labelCol='Y')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.8652170729921659


In [52]:
print(classification_report(predictions.select(['Y']).toPandas().values,predictions.select(['prediction']).toPandas().values))

              precision    recall  f1-score   support

         0.0       0.80      0.87      0.83       157
         1.0       0.71      0.60      0.65        87

    accuracy                           0.77       244
   macro avg       0.75      0.73      0.74       244
weighted avg       0.77      0.77      0.77       244

