In [19]:
from pyspark.sql import SparkSession as ss
from pyspark.sql.functions import col, count, isnan, mean, when

In [5]:
# Reuse the running session if there is one; otherwise start "myApp"
# with spark.sql.shuffle.partitions set to 20.
spark = (
    ss.builder
    .appName("myApp")
    .config('spark.sql.shuffle.partitions', '20')
    .getOrCreate()
)

###### Handling Missing Values

In [6]:
# Toy frame with nulls scattered over weight, age, gender and income.
df_miss = spark.createDataFrame(
    data=[
        (1, 143.5, 5.6, 28, 'M', 100000),
        (2, 167.2, 5.4, 45, 'M', None),
        (3, None, 5.2, None, None, None),
        (4, 144.5, 5.9, 33, 'M', None),
        (5, 133.2, 5.7, 54, 'F', None),
        (6, 124.1, 5.2, None, 'F', None),
        (7, 129.2, 5.3, 42, 'M', 76000),
    ],
    schema=['id', 'weight', 'height', 'age', 'gender', 'income'],
)

In [230]:
# Keep males taller than 5.5, then project id / weight / height.
(
    df_miss
    .filter("height>5.5 and gender='M'")
    .select('id', 'weight', 'height')
    .show()
)

+---+------+------+
| id|weight|height|
+---+------+------+
|  1| 143.5|   5.6|
|  4| 144.5|   5.9|
+---+------+------+



In [32]:
# Print the whole frame so the null cells are visible.
df_miss.show()

+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
|  1| 143.5|   5.6|  28|     M|100000|
|  2| 167.2|   5.4|  45|     M|  null|
|  3|  null|   5.2|null|  null|  null|
|  4| 144.5|   5.9|  33|     M|  null|
|  5| 133.2|   5.7|  54|     F|  null|
|  6| 124.1|   5.2|null|     F|  null|
|  7| 129.2|   5.3|  42|     M| 76000|
+---+------+------+----+------+------+



In [None]:
# Keep only rows whose id is present. isNotNull is a Column *method*: it must
# be called so that where() receives a Column condition, not a bound method.
df_miss.where(df_miss.id.isNotNull()).show()

In [31]:
# Per-column tally of rows holding a null, a NaN, an empty string, or the
# literal text 'NULL' / 'None'. when(...) yields the column name for matching
# rows and null otherwise, and count() ignores nulls.
df_miss.select([
    count(
        when(
            col(c).contains('NULL')
            | col(c).contains('None')
            | (col(c) == '')
            | col(c).isNull()
            | isnan(c),
            c,
        )
    ).alias(c)
    for c in df_miss.columns
]).show()

+---+------+------+---+------+------+
| id|weight|height|age|gender|income|
+---+------+------+---+------+------+
|  0|     1|     0|  2|     1|     5|
+---+------+------+---+------+------+



In [231]:
# Fraction of missing values per column: count(c) skips nulls while
# count('*') counts every row.
df_miss.select([
    (1 - count(c) / count('*')).alias(c + "_p")
    for c in df_miss.columns
]).show()

+----+------------------+--------+------------------+------------------+
|id_p|          weight_p|height_p|             age_p|          gender_p|
+----+------------------+--------+------------------+------------------+
| 0.0|0.1428571428571429|     0.0|0.2857142857142857|0.1428571428571429|
+----+------------------+--------+------------------+------------------+



In [50]:
# Rows that have an income recorded and an age above 28.
df_miss.filter("income is not null and age>28").show()

+---+------+------+---+------+------+
| id|weight|height|age|gender|income|
+---+------+------+---+------+------+
|  7| 129.2|   5.3| 42|     M| 76000|
+---+------+------+---+------+------+



In [62]:
# Drop rows that have a null in any column other than income.
df_miss.dropna(
    how='any',
    subset=[name for name in df_miss.columns if name != 'income'],
).show()

+---+------+------+---+------+------+
| id|weight|height|age|gender|income|
+---+------+------+---+------+------+
|  1| 143.5|   5.6| 28|     M|100000|
|  2| 167.2|   5.4| 45|     M|  null|
|  4| 144.5|   5.9| 33|     M|  null|
|  5| 133.2|   5.7| 54|     F|  null|
|  7| 129.2|   5.3| 42|     M| 76000|
+---+------+------+---+------+------+



In [63]:
# Rebind df_miss to a copy without the income column; every later cell
# (and any earlier cell that is re-run) sees the narrower frame.
df_miss = df_miss.select([name for name in df_miss.columns if name != 'income'])

In [64]:
# Confirm that the income column is gone.
df_miss.show()

+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
|  1| 143.5|   5.6|  28|     M|
|  2| 167.2|   5.4|  45|     M|
|  3|  null|   5.2|null|  null|
|  4| 144.5|   5.9|  33|     M|
|  5| 133.2|   5.7|  54|     F|
|  6| 124.1|   5.2|null|     F|
|  7| 129.2|   5.3|  42|     M|
+---+------+------+----+------+



In [65]:
# Number of partitions of the RDD backing df_miss.
df_miss.rdd.getNumPartitions()

8

In [233]:
# Build (but do not run) one avg(...) expression per column, aliased back to
# the column's own name.
[mean(name).alias(name) for name in df_miss.columns]

[Column<b'avg(id) AS `id`'>,
 Column<b'avg(weight) AS `weight`'>,
 Column<b'avg(height) AS `height`'>,
 Column<b'avg(age) AS `age`'>,
 Column<b'avg(gender) AS `gender`'>]

In [235]:
from pyspark.sql.functions import mean
# One-row pandas frame holding the average of every column of df_miss.
means = df_miss.select(
    *[mean(name).alias(name) for name in df_miss.columns]
).toPandas()

In [237]:
# Overwrite the gender entry of the one-row means frame with the placeholder
# string 'missing', to be used as the fill value for that column.
means.at[0,'gender']='missing'

In [238]:
# Inspect the one-row frame of fill values.
means

Unnamed: 0,id,weight,height,age,gender
0,4.0,140.283333,5.471429,40.4,missing


In [80]:
# Collapse the one-row pandas frame into a plain {column: value} dict, the
# form passed to fillna() further down. The isinstance guard makes the cell
# safe to re-run: `means` is rebound from a DataFrame to a dict here, and a
# second run would otherwise fail because a dict has no .to_dict().
if not isinstance(means, dict):
    means = means.to_dict('records')[0]

In [81]:
# Inspect the resulting column -> fill value mapping.
means

{'age': 40.4,
 'gender': 'missing',
 'height': 5.471428571428571,
 'id': 4.0,
 'weight': 140.28333333333333}

In [82]:
# Preview the frame with each null replaced by its column's entry in `means`.
df_miss.na.fill(means).show()

+---+------------------+------+---+-------+
| id|            weight|height|age| gender|
+---+------------------+------+---+-------+
|  1|             143.5|   5.6| 28|      M|
|  2|             167.2|   5.4| 45|      M|
|  3|140.28333333333333|   5.2| 40|missing|
|  4|             144.5|   5.9| 33|      M|
|  5|             133.2|   5.7| 54|      F|
|  6|             124.1|   5.2| 40|      F|
|  7|             129.2|   5.3| 42|      M|
+---+------------------+------+---+-------+



In [111]:
# Keep the null-free version of df_miss under its own name.
df_nonull = df_miss.na.fill(means)

In [239]:
# Display the filled frame.
# NOTE(review): the saved output under this cell has no gender column and its
# rows match df_out rather than df_miss with fills applied -- it looks stale;
# confirm by re-running after a kernel restart.
df_nonull.show()

+---+------+------+---+
| id|weight|height|age|
+---+------+------+---+
|  1| 143.5|   5.3| 28|
|  2| 154.2|   5.5| 45|
|  3| 342.3|   5.1| 99|
|  4| 144.5|   5.5| 33|
|  5| 133.2|   5.4| 54|
|  6| 124.1|   5.1| 21|
|  7| 129.2|   5.3| 42|
+---+------+------+---+



In [83]:
# Expose df_miss to Spark SQL under the view name 'miss'.
df_miss.createOrReplaceTempView(name='miss')

In [95]:
# One "mean(<col>) as <col>" SQL fragment per column of df_miss.
cols = ['mean({0}) as {0}'.format(name) for name in df_miss.columns]

In [101]:
# NOTE(review): str() on a list yields its Python repr (brackets and quotes),
# not a comma-separated SQL projection; the query cell further down rebuilds
# the fragments and joins them with "," instead of using this value.
cols  = str(cols)

In [102]:
# Inspect the stringified list.
cols

"['mean(id) as id', 'mean(weight) as weight', 'mean(height) as height', 'mean(age) as age', 'mean(gender) as gender']"

In [107]:
# Same per-column averages as before, this time through Spark SQL on the
# 'miss' view.
projection = ",".join('mean({0}) as {0}'.format(name) for name in df_miss.columns)
spark.sql('select ' + projection + ' from miss').show()

+---+------------------+-----------------+----+------+
| id|            weight|           height| age|gender|
+---+------------------+-----------------+----+------+
|4.0|140.28333333333333|5.471428571428571|40.4|  null|
+---+------------------+-----------------+----+------+



##### Handling Outliers

In [213]:
# Toy frame for the outlier section; row 3 carries the extreme weight and age.
df_out = spark.createDataFrame(
    data=[
        (1, 143.5, 5.3, 28),
        (2, 154.2, 5.5, 45),
        (3, 342.3, 5.1, 99),
        (4, 144.5, 5.5, 33),
        (5, 133.2, 5.4, 54),
        (6, 124.1, 5.1, 21),
        (7, 129.2, 5.3, 42),
    ],
    schema=['id', 'weight', 'height', 'age'],
)

In [215]:
# Print the raw outlier-demo frame.
df_out.show()

+---+------+------+---+
| id|weight|height|age|
+---+------+------+---+
|  1| 143.5|   5.3| 28|
|  2| 154.2|   5.5| 45|
|  3| 342.3|   5.1| 99|
|  4| 144.5|   5.5| 33|
|  5| 133.2|   5.4| 54|
|  6| 124.1|   5.1| 21|
|  7| 129.2|   5.3| 42|
+---+------+------+---+



In [216]:
# Every column of df_out except the id key.
cols = list(filter(lambda name: name != 'id', df_out.columns))

In [217]:
# Columns that will be checked for outliers.
cols

['weight', 'height', 'age']

In [218]:
# Outlier fences per column: [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The loop variable is deliberately NOT called `col`: that name is imported
# from pyspark.sql.functions at the top of the notebook, and rebinding it to
# a string here breaks any cell that calls col(...) when it is re-run.
bounds = {}
for col_name in cols:
    if col_name == 'id':
        # Identifier column gets a marker instead of fences; only reached
        # when `cols` still contains 'id'.
        bounds[col_name] = ['StringColumn']
        continue
    # Approximate first and third quartiles (relative error 0.05).
    q1, q3 = df_out.approxQuantile(col_name, [0.25, 0.75], 0.05)
    iqr = q3 - q1
    bounds[col_name] = [q1 - 1.5 * iqr, q3 + 1.5 * iqr]
        
    

In [219]:
# Inspect the [lower, upper] fence computed for each column.
bounds

{'age': [-11.0, 93.0],
 'height': [4.499999999999999, 6.1000000000000005],
 'weight': [91.69999999999999, 191.7]}

In [222]:
# For each checked column add a boolean "<col>_o" flag that is true when the
# value lies outside that column's [lower, upper] fence; id is kept as key.
df_out1 = df_out.select(
    'id',
    *[
        ((df_out[c] < bounds[c][0]) | (df_out[c] > bounds[c][1])).alias(c + '_o')
        for c in cols
        if c != 'id'
    ]
)

In [223]:
# Show the per-row outlier flags.
df_out1.show()

+---+--------+--------+-----+
| id|weight_o|height_o|age_o|
+---+--------+--------+-----+
|  1|   false|   false|false|
|  2|   false|   false|false|
|  3|    true|   false| true|
|  4|   false|   false|false|
|  5|   false|   false|false|
|  6|   false|   false|false|
|  7|   false|   false|false|
+---+--------+--------+-----+



In [224]:
# Attach the outlier flags to the original measurements, matching on id.
joinedDF = df_out.join(df_out1, on='id', how='inner')

In [225]:
# Show measurements side by side with their outlier flags.
joinedDF.show()

+---+------+------+---+--------+--------+-----+
| id|weight|height|age|weight_o|height_o|age_o|
+---+------+------+---+--------+--------+-----+
|  4| 144.5|   5.5| 33|   false|   false|false|
|  6| 124.1|   5.1| 21|   false|   false|false|
|  3| 342.3|   5.1| 99|    true|   false| true|
|  2| 154.2|   5.5| 45|   false|   false|false|
|  5| 133.2|   5.4| 54|   false|   false|false|
|  1| 143.5|   5.3| 28|   false|   false|false|
|  7| 129.2|   5.3| 42|   false|   false|false|
+---+------+------+---+--------+--------+-----+



In [226]:
# Expose the joined frame to Spark SQL under the view name 'out'.
joinedDF.createOrReplaceTempView(name='out')

In [227]:
# Read the 'out' view back through SQL to check that it is registered.
out_rows = spark.sql('select * from out')
out_rows.show()

+---+------+------+---+--------+--------+-----+
| id|weight|height|age|weight_o|height_o|age_o|
+---+------+------+---+--------+--------+-----+
|  4| 144.5|   5.5| 33|   false|   false|false|
|  6| 124.1|   5.1| 21|   false|   false|false|
|  3| 342.3|   5.1| 99|    true|   false| true|
|  2| 154.2|   5.5| 45|   false|   false|false|
|  5| 133.2|   5.4| 54|   false|   false|false|
|  1| 143.5|   5.3| 28|   false|   false|false|
|  7| 129.2|   5.3| 42|   false|   false|false|
+---+------+------+---+--------+--------+-----+



In [228]:
# Original measurement columns of every row flagged as an outlier in at
# least one checked column.
measure_cols = ",".join(df_out.columns)
any_flag_set = " or ".join([c + '_o ' for c in cols])
spark.sql('select ' + measure_cols + ' from out').where(any_flag_set).show()

+---+------+------+---+
| id|weight|height|age|
+---+------+------+---+
|  3| 342.3|   5.1| 99|
+---+------+------+---+

