* Dropping column
* Dropping row
* Various parameters of the dropping functions
* Handling missing values by imputing the mean

In [1]:
import pyspark
import pandas as pd

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('tercero')
    .getOrCreate()
)

In [4]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [74]:
# Load the semicolon-separated CSV; the first line holds the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .option('sep', ';')
    .option('header', True)
    .option('inferSchema', True)
    .csv('data.csv')
)

In [75]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salarie: integer (nullable = true)



In [76]:
# Preview the raw data; every column contains some null entries.
df.show()

+-------+----+-------+
|   name| age|salarie|
+-------+----+-------+
|  maria|  29| 112312|
|  jorge|  35| 123423|
|jessica|  32|  25343|
|  erick|  20|  35464|
|   null|  22|   null|
| lucero|null|  43211|
|   null|null|   null|
+-------+----+-------+



In [77]:
# Summary statistics per column; `count` only counts non-null values,
# which is why it is lower than the number of rows.
df.describe().show()

+-------+-----+------------------+-----------------+
|summary| name|               age|          salarie|
+-------+-----+------------------+-----------------+
|  count|    5|                 5|                5|
|   mean| null|              27.6|          67950.6|
| stddev| null|6.4265076052238514|46173.46515370056|
|    min|erick|                20|            25343|
|    max|maria|                35|           123423|
+-------+-----+------------------+-----------------+



In [78]:
# List the column names.
df.columns

['name', 'age', 'salarie']

In [79]:
# Add a derived column `age_2` holding age + 2 (used below to demonstrate dropping a column).
df = df.withColumn('age_2', df.age + 2)

In [80]:
# Show the new column; `age_2` is null wherever `age` is null.
df.show()

+-------+----+-------+-----+
|   name| age|salarie|age_2|
+-------+----+-------+-----+
|  maria|  29| 112312|   31|
|  jorge|  35| 123423|   37|
|jessica|  32|  25343|   34|
|  erick|  20|  35464|   22|
|   null|  22|   null|   24|
| lucero|null|  43211| null|
|   null|null|   null| null|
+-------+----+-------+-----+



In [81]:
# Dropping a column: `drop` returns a new DataFrame without `age_2`.
df = df.drop('age_2')

In [82]:
# Confirm the DataFrame is back to its original three columns.
df.show()

+-------+----+-------+
|   name| age|salarie|
+-------+----+-------+
|  maria|  29| 112312|
|  jorge|  35| 123423|
|jessica|  32|  25343|
|  erick|  20|  35464|
|   null|  22|   null|
| lucero|null|  43211|
|   null|null|   null|
+-------+----+-------+



In [83]:
# Dropping null values

In [84]:
# how='all' drops a row only when every one of its columns is null.
df.dropna(how='all').show()

+-------+----+-------+
|   name| age|salarie|
+-------+----+-------+
|  maria|  29| 112312|
|  jorge|  35| 123423|
|jessica|  32|  25343|
|  erick|  20|  35464|
|   null|  22|   null|
| lucero|null|  43211|
+-------+----+-------+



In [87]:
# `thresh=2` keeps only the rows that have at least 2 NON-null values, i.e. it
# drops rows with fewer than 2 non-null values (it does not count nulls).
# When `thresh` is given it overrides `how`, so the 'any' argument is omitted.
# In this dataset the row with only `age` set and the all-null row are dropped.
df.dropna(thresh=2).show()

+-------+----+-------+
|   name| age|salarie|
+-------+----+-------+
|  maria|  29| 112312|
|  jorge|  35| 123423|
|jessica|  32|  25343|
|  erick|  20|  35464|
| lucero|null|  43211|
+-------+----+-------+



In [90]:
# Restrict the null check to the `salarie` column: only rows whose salary is null
# are dropped, while nulls in other columns are ignored.
df.dropna(how='any', subset=['salarie']).show()

+-------+----+-------+
|   name| age|salarie|
+-------+----+-------+
|  maria|  29| 112312|
|  jorge|  35| 123423|
|jessica|  32|  25343|
|  erick|  20|  35464|
| lucero|null|  43211|
+-------+----+-------+



**Filling in missing values**

In [98]:
# Per-column replacement: each key is a column name and each value is what
# replaces the nulls in that column.
df.fillna(
    {
        'name': 'NONE',
        'age': 0,
        'salarie': 0,
    }
).show()

+-------+---+-------+
|   name|age|salarie|
+-------+---+-------+
|  maria| 29| 112312|
|  jorge| 35| 123423|
|jessica| 32|  25343|
|  erick| 20|  35464|
|   NONE| 22|      0|
| lucero|  0|  43211|
|   NONE|  0|      0|
+-------+---+-------+



In [97]:
# A single replacement value is applied only to columns whose type matches it:
# the integer 0 fills the numeric columns, while the string column `name` keeps its nulls.
df.fillna(value=0).show()

+-------+---+-------+
|   name|age|salarie|
+-------+---+-------+
|  maria| 29| 112312|
|  jorge| 35| 123423|
|jessica| 32|  25343|
|  erick| 20|  35464|
|   null| 22|      0|
| lucero|  0|  43211|
|   null|  0|      0|
+-------+---+-------+



In [99]:
from pyspark.ml.feature import Imputer



In [100]:
# Mean imputer for the numeric columns; each result is written to a new
# `<column>_imputed` column so the original columns stay untouched.
imputer = Imputer(
    strategy="mean",
    inputCols=['age', 'salarie'],
    outputCols=[f"{col}_imputed" for col in ('age', 'salarie')],
)

In [101]:
# Fit the imputer on df, then append the *_imputed columns and display the result.
# NOTE: the input columns are integers, so the imputed means appear truncated
# in the output (e.g. mean age 27.6 is shown as 27).
(
    imputer
    .fit(df)
    .transform(df)
    .show()
)

+-------+----+-------+-----------+---------------+
|   name| age|salarie|age_imputed|salarie_imputed|
+-------+----+-------+-----------+---------------+
|  maria|  29| 112312|         29|         112312|
|  jorge|  35| 123423|         35|         123423|
|jessica|  32|  25343|         32|          25343|
|  erick|  20|  35464|         20|          35464|
|   null|  22|   null|         22|          67950|
| lucero|null|  43211|         27|          43211|
|   null|null|   null|         27|          67950|
+-------+----+-------+-----------+---------------+

