### **Predicción:** ¿Cuántas veces será compartido un artículo del Blog?

### Paso 1: Cargar Dataset

In [0]:
import pandas as pd
from pyspark.sql.functions import *

In [0]:
# Source file: where the CSV lives and which reader format to use
file_location = "/FileStore/tables/articulos_ml.csv"
file_type = "csv"

# CSV reader settings
infer_schema = "true"
first_row_is_header = "true"  # treat the first row as the header
delimiter = ","

# Configure the reader with all options at once, then load the file
csv_reader = spark.read.format(file_type).options(
    inferSchema=infer_schema,
    header=first_row_is_header,
    sep=delimiter,
)
articulos = csv_reader.load(file_location)

display(articulos)

Title,url,Word count,# of Links,# of comments,# Images video,Elapsed days,# Shares
What is Machine Learning and how do we use it in Signals?,https://blog.signals.network/what-is-machine-learning-and-how-do-we-use-it-in-signals-6797e720d636,1888,1,2.0,2,34,200000
10 Companies Using Machine Learning in Cool Ways,,1742,9,,9,5,25000
How Artificial Intelligence Is Revolutionizing the Sector,,962,6,0.0,1,10,42000
Dbrain and the Blockchain of Artificial Intelligence,,1221,3,,2,68,200000
Nasa finds entire solar system filled with eight planets like our own,,2039,1,104.0,4,131,200000
5 ways Data Science and Machine Learning impact business,,761,0,,1,14,21000
200 universities just launched 600 free online courses. Here’s the full list.,https://qz.com/1120344/200-universities-just-launched-600-free-online-courses-heres-the-full-list/,6462,600,28.0,2,170,200000
How Machine Learning can help Cryptocurrency Traders Maximize their Gains,https://cryptovest.com/news/how-machine-learning-can-help-cryptocurrency-traders-maximize-their-gains/?utm_source=SocialAnimal&utm_medium=referral,753,3,0.0,1,78,77000
Tech companies should stop pretending AI won’t destroy jobs,https://www.technologyreview.com/s/610298/tech-companies-should-stop-pretending-ai-wont-destroy-jobs/,1118,2,,1,62,59400
Artificial intelligence is going to completely change your life,,1581,4,,2,60,35000


#### Data Cleaning

In [0]:
# Inspect the data type of every column of the loaded DataFrame
articulos.dtypes

Out[48]: [('Title', 'string'),
 ('url', 'string'),
 ('Word count', 'string'),
 ('# of Links', 'string'),
 ('# of comments', 'int'),
 ('# Images video', 'int'),
 ('Elapsed days', 'int'),
 ('# Shares', 'int')]

In [0]:
# "Word count" and "# of Links" are cast to int, one column at a time
for column_name in ("Word count", "# of Links"):
    articulos = articulos.withColumn(column_name, articulos[column_name].cast("int"))

# Show the resulting column types
articulos.dtypes

Out[49]: [('Title', 'string'),
 ('url', 'string'),
 ('Word count', 'int'),
 ('# of Links', 'int'),
 ('# of comments', 'int'),
 ('# Images video', 'int'),
 ('Elapsed days', 'int'),
 ('# Shares', 'int')]

In [0]:
# General statistical description of the selected numeric columns
summary_columns = ["Word count", "# of comments", "# Images video", "Elapsed days", "# Shares"]
display(articulos.select(*summary_columns).summary())

summary,Word count,# of comments,# Images video,Elapsed days,# Shares
count,160.0,129.0,161.0,161.0,161.0
mean,1812.40625,17.573643410852714,3.695652173913044,97.8944099378882,27916.35403726708
stddev,1144.2888445270803,100.89929812641464,3.4206057180360423,114.47601290687888,43426.80731031956
min,250.0,0.0,1.0,1.0,0.0
25%,988.0,2.0,1.0,30.0,2800.0
50%,1674.0,6.0,3.0,62.0,16458.0
75%,2369.0,12.0,5.0,124.0,35691.0
max,8401.0,1145.0,22.0,1002.0,350000.0


- La media de "Word count" es de 1812 palabras.
- Existe un artículo con un máximo de palabras de 8401 y un mínimo de 250.
- La variable objetivo ("# Shares") tiene un mínimo de 0 y un máximo de 350000.

In [0]:
# For every column, count the rows whose value is NaN or NULL
null_count_exprs = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in articulos.columns
]
articulos.select(null_count_exprs).show()

+-----+---+----------+----------+-------------+--------------+------------+--------+
|Title|url|Word count|# of Links|# of comments|# Images video|Elapsed days|# Shares|
+-----+---+----------+----------+-------------+--------------+------------+--------+
|    0| 39|         1|         1|           32|             0|           0|       0|
+-----+---+----------+----------+-------------+--------------+------------+--------+



#### Una manera de reemplazar los valores nulos es mediante el promedio en columnas numéricas

In [0]:
# The null replacement is done with pandas in this notebook
articulos_pd = articulos.toPandas() # Convert the Spark DataFrame into a pandas DataFrame

In [0]:
# Replace the NULL values of each numeric column with that column's mean
for column_name in ('# of comments', '# of Links', 'Word count'):
    column_mean = articulos_pd[column_name].mean()
    articulos_pd[column_name] = articulos_pd[column_name].fillna(column_mean)

In [0]:
# Per column: does it still contain any NULL? The numeric columns should all report False
articulos_pd.isnull().any()

Out[54]: Title             False
url                True
Word count        False
# of Links        False
# of comments     False
# Images video    False
Elapsed days      False
# Shares          False
dtype: bool

In [0]:
# Cast "# of comments" to int64.
# NOTE: values filled with the column mean are fractional, so this cast drops their decimal part.
articulos_pd['# of comments']=articulos_pd['# of comments'].astype('int64')

### Visualización General

#### Analizamos la cantidad de palabras (Word count)

In [0]:
import plotly.express as px

# Histogram of the word counts, with the count printed on each bar
px.histogram(
    articulos_pd['Word count'],
    x="Word count",
    title="Cantidad de Palabras",
    color_discrete_sequence=['indianred'],
    text_auto=True,
)

Podemos observar que la mayoría de los registros de **Word count** se concentra por debajo de 3500 palabras.

#### Analizamos la cantidad de link (# of Links)

In [0]:
# Histogram of the number of links, with the count printed on each bar
px.histogram(
    articulos_pd['# of Links'],
    x="# of Links",
    title="Cantidad de links",
    color_discrete_sequence=['blue'],
    text_auto=True,
)

Podemos observar que la mayoría de los registros de **# of Links** se concentra por debajo de 50.

#### Analizamos la cantidad de comentarios (# of comments)

In [0]:
# Histogram of the number of comments, with the count printed on each bar
px.histogram(
    articulos_pd['# of comments'],
    x="# of comments",
    title="Cantidad de Comentarios",
    color_discrete_sequence=['green'],
    text_auto=True,
)

Podemos observar que la mayoría de los registros de **# of comments** se concentra por debajo de 50.

#### Analizamos la cantidad de compartidos (# Shares)

In [0]:
# Histogram of the number of shares, with the count printed on each bar
px.histogram(
    articulos_pd['# Shares'],
    x="# Shares",
    title="Cantidad de Compartidos",
    color_discrete_sequence=['goldenrod'],
    text_auto=True,
)

Podemos observar que la mayoría de los registros de **# Shares** se concentra por debajo de 80000.

##### Filtramos en donde mayormente se centran los datos

In [0]:
# Keep only the rows with at most 3500 words and at most 80000 shares
within_word_count = articulos_pd['Word count'] <= 3500
within_shares = articulos_pd['# Shares'] <= 80000
articulos_pd = articulos_pd[within_word_count & within_shares]

display(articulos_pd)

Title,url,Word count,# of Links,# of comments,# Images video,Elapsed days,# Shares
10 Companies Using Machine Learning in Cool Ways,,1742.0,9.0,17,9,5,25000
How Artificial Intelligence Is Revolutionizing the Sector,,962.0,6.0,0,1,10,42000
5 ways Data Science and Machine Learning impact business,,761.0,0.0,17,1,14,21000
How Machine Learning can help Cryptocurrency Traders Maximize their Gains,https://cryptovest.com/news/how-machine-learning-can-help-cryptocurrency-traders-maximize-their-gains/?utm_source=SocialAnimal&utm_medium=referral,753.0,3.0,0,1,78,77000
Tech companies should stop pretending AI won’t destroy jobs,https://www.technologyreview.com/s/610298/tech-companies-should-stop-pretending-ai-wont-destroy-jobs/,1118.0,2.0,17,1,62,59400
Artificial intelligence is going to completely change your life,,1581.0,4.0,17,2,60,35000
Google CEO Sundar Pichai says AI is more profound than electricity or fire,,256.0,0.0,27,1,95,29000
How machine learning is changing the financial industry,,1267.0,2.0,17,4,124,37000
New AI can work out whether you're gay or straight from a photograph,https://www.theguardian.com/technology/2017/sep/07/new-artificial-intelligence-can-tell-whether-youre-gay-or-straight-from-a-photograph,971.0,10.0,17,1,228,67300
Researchers have linked a human brain to the Internet for the first time ever,https://futurism.com/researchers-have-linked-a-human-brain-to-the-internet-for-the-first-time-ever/,369.0,6.0,17,1,222,60100


##### Pasamos lo que hicimos en Pandas a Spark

In [0]:
# Enable Arrow for the pandas -> Spark conversion.
# "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0; the
# PySpark-specific key below is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Bring the cleaned and filtered pandas data back into a Spark DataFrame
articulos = spark.createDataFrame(articulos_pd)
display(articulos)

Title,url,Word count,# of Links,# of comments,# Images video,Elapsed days,# Shares
10 Companies Using Machine Learning in Cool Ways,,1742.0,9.0,17,9,5,25000
How Artificial Intelligence Is Revolutionizing the Sector,,962.0,6.0,0,1,10,42000
5 ways Data Science and Machine Learning impact business,,761.0,0.0,17,1,14,21000
How Machine Learning can help Cryptocurrency Traders Maximize their Gains,https://cryptovest.com/news/how-machine-learning-can-help-cryptocurrency-traders-maximize-their-gains/?utm_source=SocialAnimal&utm_medium=referral,753.0,3.0,0,1,78,77000
Tech companies should stop pretending AI won’t destroy jobs,https://www.technologyreview.com/s/610298/tech-companies-should-stop-pretending-ai-wont-destroy-jobs/,1118.0,2.0,17,1,62,59400
Artificial intelligence is going to completely change your life,,1581.0,4.0,17,2,60,35000
Google CEO Sundar Pichai says AI is more profound than electricity or fire,,256.0,0.0,27,1,95,29000
How machine learning is changing the financial industry,,1267.0,2.0,17,4,124,37000
New AI can work out whether you're gay or straight from a photograph,https://www.theguardian.com/technology/2017/sep/07/new-artificial-intelligence-can-tell-whether-youre-gay-or-straight-from-a-photograph,971.0,10.0,17,1,228,67300
Researchers have linked a human brain to the Internet for the first time ever,https://futurism.com/researchers-have-linked-a-human-brain-to-the-internet-for-the-first-time-ever/,369.0,6.0,17,1,222,60100


#### Dividir aleatoriamente los datos en conjuntos de entrenamiento(train) y prueba(test).

In [0]:
# Most of the rows (80%) go to the training set and the rest (20%) to the test set.
# A fixed seed makes the split more reproducible between runs.
split_weights = [0.8, 0.2]
trainDF, testDF = articulos.randomSplit(split_weights, seed=42)

# Cache the training data so that later uses are faster
trainDF.cache()
print(trainDF.count())  # number of rows assigned to training (train)
print(testDF.count())   # number of rows assigned to testing (test)

115
33


### Paso 2: Preprocesamiento de características
- Utilizaremos solo las columnas numéricas como características; en este caso no consideraremos las de cadena, ya que no son relevantes para la predicción.

In [0]:
# Transformer that merges several columns into a single vector column
from pyspark.ml.feature import VectorAssembler

# Numeric columns used as predictors
numCols = ["Word count", "# of Links", "# of comments", "# Images video", "Elapsed days"]

# Only numeric columns are used, so the assembler inputs are exactly numCols
assemblerInputs = numCols

# Name of the target column (a plain string, not a transformer)
labelToIndex = "# Shares"

# Single input column of features for the model
vecAssembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)

### TODO: averiguar cómo definir el label para una columna numérica

### Paso 3: Definir el modelo

Utilizaremos un modelo de **regresión logística**, ya que, al convertir la variable objetivo ("# Shares") a 0 y 1, se trata de un caso de clasificación binaria

In [0]:
# Classifier used by the pipeline
from pyspark.ml.classification import LogisticRegression

# Logistic regression model:
#   featuresCol -> predictor variables
#   labelCol    -> target variable
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    regParam=1.0,
)

### Paso 4: Construir la canalización

In [0]:
# Pipeline plus the transformer used to derive the binary label
from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer

# `labelToIndex` holds the NAME of the target column, not a pipeline stage, and
# the logistic regression expects a 0/1 column called "label". Build that column
# here: 1.0 when the article has more shares than the training median, else 0.0.
# NOTE(review): the median threshold is an assumption — adjust it to the business
# definition of a "highly shared" article.
shares_threshold = trainDF.approxQuantile(labelToIndex, [0.5], 0.0)[0]
labelStage = SQLTransformer(
    statement=(
        f"SELECT *, CAST(`{labelToIndex}` > {shares_threshold} AS DOUBLE) AS label "
        "FROM __THIS__"
    )
)

# Pipeline stages: derive the label, assemble the features, fit the classifier
pipeline = Pipeline(stages=[labelStage, vecAssembler, lr])

# Fit the whole pipeline on the training set
pipelineModel = pipeline.fit(trainDF)

# Apply the fitted pipeline to the test set to obtain the predictions
predDF = pipelineModel.transform(testDF)

Mostramos los resultados de la predicción

In [0]:
# features : the set of predictor variables merged into one column
# label : target variable, converted to 0 and 1
# prediction: result returned by the model
# probability : the model's certainty, i.e. the probability it assigns to a class

# Left disabled: enable once `predDF` is available from the pipeline cell.
#display(predDF.select("features","label","prediction","probability"))