In [None]:
## Last amended: 15th Sep, 2021
## Myfolder: /home/ashok/Documents/spark/ml/adult
## Datasource: Adult dataset from UCI
##             https://archive.ics.uci.edu/ml/machine-learning-databases/adult/       => Data source

## Problem: Data Exploration of census data
#     Ref: https://docs.databricks.com/spark/latest/mllib/binary-classification-mllib-pipelines.html

## Objectives:

##		1. Learn importing data files in spark
##		2. Learn data preprocessing on spark


In [None]:

##******=IMPT=**********
## Before you do it
## Attempt Quick-demo
## in folder '1.demo'
##**********************


### Start hadoop

In [None]:
## A, Start hadoop

    $ ./allstart.sh

# OR

    $./quick_allstart.sh

#### Transfer files to hadoop

In [None]:
# B. Transfer requisite files to hdfs after making a new folder
#    The block below is a plain string holding shell commands (cd, hdfs dfs ...);
#    it is kept in a triple-quoted string so the cell stays valid Python.
#    The string must be closed with exactly three quotes: six quotes would
#    close it and immediately open a second, unterminated string (SyntaxError).

"""


cd ~
hdfs dfs -rm -r  /user/ashok/data_files/adult
hdfs dfs -mkdir -p  /user/ashok/data_files/adult
hdfs dfs -put /cdata/adult/adult.data  /user/ashok/data_files/adult/
hdfs dfs -ls /user/ashok/data_files/adult
hdfs dfs -cat /user/ashok/data_files/adult/adult.data | more



"""



In [None]:
## C. And then open pyspark, as:
#   $ pysparknb

In [None]:
# D. pyspark configuration UI is available at the following URL:

"""
	 http://localhost:4040
            
            
	 Check for parameters (under 'Environment'  tab) such as:
	 	spark.master
	 	spark.driver.host
	 	spark.driver.memory

	In local mode there is one jvm for both excutors and drivers
    
"""    

## Call libraries

In [12]:
## 1.0 Call libraries
# Ref : 
# See Left Panel of this page for available Classes:
#              http://spark.apache.org/docs/3.0.0/api/python/pyspark.ml.html#pyspark-ml-package
#     Right Panel of this page:
#             https://spark.apache.org/docs/latest/api/python/reference/pyspark.ml.html#vector-and-matrix

# 1.0

import numpy as np
import pandas as pd
import time

# 1.1 pyspark sql functions:

from pyspark.sql.functions import col


In [2]:
# 1.2 Display multiple command outputs from a cell:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# 1.3 Increase display width of notebook:
#     `display` and `HTML` are taken from the public IPython.display module;
#     importing `display` from IPython.core.display is deprecated.
#     The injected CSS rule widens elements with class "container" to 100%.

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Read data

In [5]:
## 2.0 Read data
#  2.0 Read the 'adult' CSV file that was copied to hdfs in step B
#      and return the data as a spark DataFrame

# Minimum: only the file path and the marker used for missing values.
# No header option is given here, so (as the output below shows) the columns
# are named _c0, _c1, ... and the header line appears as the first data row.
df = spark.read.csv(
    path="/user/ashok/data_files/adult/adult.data",
    nullValue="?",   # Null values in data have '?'
)

# 2.0.1 Peek at the first two rows
df.show(2)


+---+----------+------+----------+-------------+--------------+-------------+--------------+------+-----+------------+------------+--------------+--------------+------+
|_c0|       _c1|   _c2|       _c3|          _c4|           _c5|          _c6|           _c7|   _c8|  _c9|        _c10|        _c11|          _c12|          _c13|  _c14|
+---+----------+------+----------+-------------+--------------+-------------+--------------+------+-----+------------+------------+--------------+--------------+------+
|age| workclass|fnlwgt| education|education_num|marital_status|   occupation|  relationship|  race|  sex|capital_gain|capital_loss|hours_per_week|native-country|income|
| 39| State-gov| 77516| Bachelors|           13| Never-married| Adm-clerical| Not-in-family| White| Male|        2174|           0|            40| United-States| <=50K|
+---+----------+------+----------+-------------+--------------+-------------+--------------+------+-----+------------+------------+--------------+---------

In [7]:
# 2.0.2 Slightly modify:
df = spark.read.csv(
                      path = "/user/ashok/data_files/adult/adult.data",
                      inferSchema = True,
                      nullValue = "?"            # Null values in data have '?'
                      )


# 2.0.3
df.show(2)


+---+----------+------+----------+-------------+--------------+-------------+--------------+------+-----+------------+------------+--------------+--------------+------+
|_c0|       _c1|   _c2|       _c3|          _c4|           _c5|          _c6|           _c7|   _c8|  _c9|        _c10|        _c11|          _c12|          _c13|  _c14|
+---+----------+------+----------+-------------+--------------+-------------+--------------+------+-----+------------+------------+--------------+--------------+------+
|age| workclass|fnlwgt| education|education_num|marital_status|   occupation|  relationship|  race|  sex|capital_gain|capital_loss|hours_per_week|native-country|income|
| 39| State-gov| 77516| Bachelors|           13| Never-married| Adm-clerical| Not-in-family| White| Male|        2174|           0|            40| United-States| <=50K|
+---+----------+------+----------+-------------+--------------+-------------+--------------+------+-----+------------+------------+--------------+---------

In [9]:
# 2.0.4 Slightly modify again (also treat the first line as the header):
df = spark.read.csv(
                      path = "/user/ashok/data_files/adult/adult.data",
                      inferSchema = True,
                      header = True,
                      nullValue = "?"            # Null values in data have '?'
                      )


# 2.0.5
df.show(2)

+---+-----------------+-------+----------+-------------+-------------------+----------------+--------------+------+-----+------------+------------+--------------+--------------+------+
|age|        workclass| fnlwgt| education|education_num|     marital_status|      occupation|  relationship|  race|  sex|capital_gain|capital_loss|hours_per_week|native-country|income|
+---+-----------------+-------+----------+-------------+-------------------+----------------+--------------+------+-----+------------+------------+--------------+--------------+------+
| 39|        State-gov|77516.0| Bachelors|         13.0|      Never-married|    Adm-clerical| Not-in-family| White| Male|      2174.0|         0.0|          40.0| United-States| <=50K|
| 50| Self-emp-not-inc|83311.0| Bachelors|         13.0| Married-civ-spouse| Exec-managerial|       Husband| White| Male|         0.0|         0.0|          13.0| United-States| <=50K|
+---+-----------------+-------+----------+-------------+-------------------

In [13]:
# 2.0.6 More elaborate but desirable:
#       Compared with 2.0.4 this also strips the blanks around each field;
#       compare the two recorded outputs: the values no longer carry a
#       leading space.

df = spark.read.csv(
    path="/user/ashok/data_files/adult/adult.data",
    inferSchema=True,                # default False
    header=True,                     # default False
    nullValue="?",                   # Null values in data have '?'
    ignoreLeadingWhiteSpace=True,    # default False
    ignoreTrailingWhiteSpace=True,   # default False
)

# 2.0.7 Peek at the first two rows
df.show(2)

+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+----+------------+------------+--------------+--------------+------+
|age|       workclass|fnlwgt|education|education_num|    marital_status|     occupation| relationship| race| sex|capital_gain|capital_loss|hours_per_week|native-country|income|
+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+----+------------+------------+--------------+--------------+------+
| 39|       State-gov| 77516|Bachelors|           13|     Never-married|   Adm-clerical|Not-in-family|White|Male|        2174|           0|            40| United-States| <=50K|
| 50|Self-emp-not-inc| 83311|Bachelors|           13|Married-civ-spouse|Exec-managerial|      Husband|White|Male|           0|           0|            13| United-States| <=50K|
+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+----+--

In [None]:
## 2.0.8 Spark DataFrame manipulation
#        Ref: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame

print("\n-----Schema----")



print("\n-----Data types----")



In [None]:
# 2.1 take() is same as head()
#     Both return a list of Row objects when given a row count.
#     Try:




In [None]:
# 2.2 Target columns values:




In [None]:
# 2.3 How many rows are there?
#     Output is a scalar and NOT DataFrame



In [None]:
# 3.1 Print value-counts of target:


In [None]:
# 3.1.1 Display % value-counts



In [22]:
# 3.2 Describe a summary of complete dataframe
#     or of specific columns (say, 'income' column)




+-------+------+
|summary|income|
+-------+------+
|  count| 32561|
|   mean|  null|
| stddev|  null|
|    min| <=50K|
|    max|  >50K|
+-------+------+



In [None]:
# 3.3 summary() gives more control over statistics 
#       than 'describe()'. Just display 10% point:



### select() method

In [None]:
# 4.1 Show one column, say age



# OR



# OR



# OR



In [None]:
# 4.2 Show multiple columns, say age and workclass



# OR



# OR



# OR



### filter() method

In [None]:
# 4.4 Filter DataFrame with filter command
#     Many ways to filter
#       'where' is an alias for filter
#     Filter for age > 21, for age = 21



# OR



# OR



# OR



# OR



# OR



In [None]:
# 4.5: Filter for marital_status of "Never-married" and age > 21




### groupBy()

In [None]:
# 4.6 GroupBy age and find mean capital_gain



In [None]:
# 4.6.3 Group by age. Find mean capital_gain and maximum capital_loss


### to_pandas()

In [None]:
# 6.0 Transforming filtered data to numpy array
#     Maybe for plotting.
#     Get data for age = 21 as numpy array:






In [None]:
# 6.1 Return a pandas dataframe



======================================================================

In [5]:
## 2.0.8 Spark DataFrame manipulation
#        Ref: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame

print("\n-----Schema----")
df.printSchema()
print("\n-----Data types----")
print(df.dtypes)


-----Schema----
root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: integer (nullable = true)
 |-- capital_loss: integer (nullable = true)
 |-- hours_per_week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)


-----Data types----
[('age', 'int'), ('workclass', 'string'), ('fnlwgt', 'int'), ('education', 'string'), ('education_num', 'int'), ('marital_status', 'string'), ('occupation', 'string'), ('relationship', 'string'), ('race', 'string'), ('sex', 'string'), ('capital_gain', 'int'), ('capital_loss', 'int'), ('hours_per_week', 'int'), ('native-country'

In [None]:
# 2.1 take() is same as head()
#     Both return a list of Row objects when given a row count

df.head(3)
df.take(3)

In [6]:
# 2.2 Target columns values:

df.select('income').show(4)

+------+
|income|
+------+
| <=50K|
| <=50K|
| <=50K|
| <=50K|
+------+
only showing top 4 rows



In [7]:
# 2.3 How many rows are there?
#     Output is a scalar and NOT DataFrame

df.count()      # 32561

32561

In [16]:
# 3.1 Value counts of target

dx = df.groupby('income').count()
dx.show()



+------+-----+
|income|count|
+------+-----+
| <=50K|24720|
|  >50K| 7841|
+------+-----+



In [17]:
# 3.1.1 Display % value-counts
#       `dx` holds the per-class counts computed in 3.1.
#       show() only prints the table and returns None, so its result is NOT
#       assigned back to `dx`; doing so would replace the DataFrame with None
#       and make this cell fail when run a second time.

total_rows = df.count()      # scalar row count, evaluated once
dx.select("income", col("count")/total_rows).show()

+------+------------------+
|income|   (count / 32561)|
+------+------------------+
| <=50K|0.7591904425539756|
|  >50K|0.2408095574460244|
+------+------------------+



In [10]:
# 3.2 Describe a summary of dataframe
#     or specific columns
#     Similar to pandas method

df.describe().show()    # This is a mess
print()
df.describe(['age', 'workclass']).show()   # This is better

+-------+------------------+-----------+------------------+------------+-----------------+--------------+----------------+------------+------------------+------+------------------+----------------+------------------+--------------+------+
|summary|               age|  workclass|            fnlwgt|   education|    education_num|marital_status|      occupation|relationship|              race|   sex|      capital_gain|    capital_loss|    hours_per_week|native-country|income|
+-------+------------------+-----------+------------------+------------+-----------------+--------------+----------------+------------+------------------+------+------------------+----------------+------------------+--------------+------+
|  count|             32561|      30725|             32561|       32561|            32561|         32561|           30718|       32561|             32561| 32561|             32561|           32561|             32561|         31978| 32561|
|   mean| 38.58164675532078|       null|1897

In [11]:
# 3.3   summary() gives more control over statistics 
#       than 'describe()'. Just display 10% point:


df.select("age").summary("10%").show()

+-------+---+
|summary|age|
+-------+---+
|    10%| 22|
+-------+---+



### select()

In [12]:
# 4.0 import col function:

from pyspark.sql.functions import col

In [12]:
# 4.1 Show one column

df.select("age").show(2)     # Not as: df["age"]

# OR

df.select(df["age"]).show(2)

# OR

df.select(col("age")).show(2)

# OR

df.select(df.age).show(2)


+---+
|age|
+---+
| 39|
| 50|
+---+
only showing top 2 rows

+---+
|age|
+---+
| 39|
| 50|
+---+
only showing top 2 rows

+---+
|age|
+---+
| 39|
| 50|
+---+
only showing top 2 rows

+---+
|age|
+---+
| 39|
| 50|
+---+
only showing top 2 rows



In [13]:
# 4.2 Show multiple columns

df.select("age", "workclass").show(2)

# OR

df.select(*["age", "workclass"]).show(2)   # Note the '*'

# OR

df.select(["age", "workclass"]).show(2)

# OR

df.select(df["age"], df["workclass"]).show(2)


+---+----------------+
|age|       workclass|
+---+----------------+
| 39|       State-gov|
| 50|Self-emp-not-inc|
+---+----------------+
only showing top 2 rows

+---+----------------+
|age|       workclass|
+---+----------------+
| 39|       State-gov|
| 50|Self-emp-not-inc|
+---+----------------+
only showing top 2 rows

+---+----------------+
|age|       workclass|
+---+----------------+
| 39|       State-gov|
| 50|Self-emp-not-inc|
+---+----------------+
only showing top 2 rows

+---+----------------+
|age|       workclass|
+---+----------------+
| 39|       State-gov|
| 50|Self-emp-not-inc|
+---+----------------+
only showing top 2 rows



In [15]:
# 4.3 But these do not work:
# df.select(df[,["age","workclass"]]).show()
# df.select(df[["age","workclass"]]).show()

### filter()

In [14]:
# 4.4 Filter DataFrame with filter command
#     Many ways to filter
#       'where' is an alias for filter
#     Note the two different predicates used below:
#       - the first two calls keep rows with age > 21
#       - the remaining four calls keep rows with age == 21
#     Each call ends with take(1), i.e. fetches one matching Row.

# age > 21, condition written as a SQL expression string
df.filter("age  > 21" ).take(1)

# OR: the same condition (age > 21) built from a Column via col()

df.filter(col("age") > 21).take(1)

# A different condition from here on (age == 21), bracket-style column access

df.filter(df["age"] == 21).take(1)   

# OR: attribute-style column access

df.filter(df.age == 21).take(1)

# OR: select all columns, then where() with a Column condition

df.select("*").where(df.age == 21).take(1)

# OR: select all columns, then where() with a SQL expression string

df.select("*").where("age == 21").take(1)


[Row(age=39, workclass='State-gov', fnlwgt=77516, education='Bachelors', education_num=13, marital_status='Never-married', occupation='Adm-clerical', relationship='Not-in-family', race='White', sex='Male', capital_gain=2174, capital_loss=0, hours_per_week=40, native-country='United-States', income='<=50K')]

[Row(age=39, workclass='State-gov', fnlwgt=77516, education='Bachelors', education_num=13, marital_status='Never-married', occupation='Adm-clerical', relationship='Not-in-family', race='White', sex='Male', capital_gain=2174, capital_loss=0, hours_per_week=40, native-country='United-States', income='<=50K')]

[Row(age=21, workclass='Private', fnlwgt=197200, education='Some-college', education_num=10, marital_status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='White', sex='Male', capital_gain=0, capital_loss=0, hours_per_week=40, native-country='United-States', income='<=50K')]

[Row(age=21, workclass='Private', fnlwgt=197200, education='Some-college', education_num=10, marital_status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='White', sex='Male', capital_gain=0, capital_loss=0, hours_per_week=40, native-country='United-States', income='<=50K')]

[Row(age=21, workclass='Private', fnlwgt=197200, education='Some-college', education_num=10, marital_status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='White', sex='Male', capital_gain=0, capital_loss=0, hours_per_week=40, native-country='United-States', income='<=50K')]

[Row(age=21, workclass='Private', fnlwgt=197200, education='Some-college', education_num=10, marital_status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='White', sex='Male', capital_gain=0, capital_loss=0, hours_per_week=40, native-country='United-States', income='<=50K')]

In [15]:
# 4.5: Filter for marital_status of "Never-married" and age > 21
#      Column conditions are combined with '&' (and), '|' (or) and '~' (not).
#      When comparisons are written inline, each needs its own parentheses
#      because '&' and '|' bind more tightly than '==' and '>'.

is_never_married = df["marital_status"] == "Never-married"
is_over_21 = df["age"] > 21

df.filter(is_never_married).take(1)
df.filter(is_never_married & is_over_21).take(1)

[Row(age=39, workclass='State-gov', fnlwgt=77516, education='Bachelors', education_num=13, marital_status='Never-married', occupation='Adm-clerical', relationship='Not-in-family', race='White', sex='Male', capital_gain=2174, capital_loss=0, hours_per_week=40, native-country='United-States', income='<=50K')]

[Row(age=39, workclass='State-gov', fnlwgt=77516, education='Bachelors', education_num=13, marital_status='Never-married', occupation='Adm-clerical', relationship='Not-in-family', race='White', sex='Male', capital_gain=2174, capital_loss=0, hours_per_week=40, native-country='United-States', income='<=50K')]

### GroupBy()
See grouped data functions [here](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.GroupedData.html#pyspark-sql-groupeddata)

In [18]:
# 4.6 GroupBy age and find mean capital_gain

grouped = df.groupby("age")

In [19]:
# 4.6.1 Use agg(*exprs):

grouped.agg({'capital_gain' : "avg"}).take(3)

[Row(age=31, avg(capital_gain)=545.2274774774775),
 Row(age=85, avg(capital_gain)=0.0),
 Row(age=65, avg(capital_gain)=3523.7078651685392)]

In [20]:
# 4.6.2

grouped.avg('capital_gain').take(3)

[Row(age=31, avg(capital_gain)=545.2274774774775),
 Row(age=85, avg(capital_gain)=0.0),
 Row(age=65, avg(capital_gain)=3523.7078651685392)]

In [21]:
# 4.6.3 Group by age. Find mean capital_gain and maximum capital_loss

df.groupby("age").agg({'capital_gain' : "avg", 'capital_loss': 'max'}).take(3)

[Row(age=31, max(capital_loss)=2415, avg(capital_gain)=545.2274774774775),
 Row(age=85, max(capital_loss)=0, avg(capital_gain)=0.0),
 Row(age=65, max(capital_loss)=2377, avg(capital_gain)=3523.7078651685392)]

In [22]:
# 4.6.4

df.agg({'age' : "max"}).take(3)

[Row(max(age)=90)]

### Correlation
See [here](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.ml.stat.Correlation.html)

In [23]:
# 5.0 Correlation of two columns
#      For multiple columns data must be
#      first assembled by VectorAssembler

df.corr('age', 'capital_gain')

0.07767449816599412

In [24]:
# 5.1 For multiple cols, first create a vector
#     then find correlation:

# VectorAssembler is not imported in any earlier cell shown in this notebook;
# without this import the cell raises NameError on a freshly started kernel.
from pyspark.ml.feature import VectorAssembler

# Pack the four numeric columns into a single vector column called 'numeric'
vc = VectorAssembler(
                      inputCols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week'],
                      outputCol = 'numeric'
                    )

# Keep only the assembled vector column; it is the input of step 5.2
dx = vc.transform(df).select('numeric')

In [25]:
# 5.2

from pyspark.ml.stat import Correlation
 
pearsonCorr = Correlation.corr(dx, 'numeric', 'pearson').collect()[0][0]
print(str(pearsonCorr).replace('nan', 'NaN'))


DenseMatrix([[ 1.        ,  0.0776745 ,  0.05777454,  0.06875571],
             [ 0.0776745 ,  1.        , -0.03161506,  0.07840862],
             [ 0.05777454, -0.03161506,  1.        ,  0.05425636],
             [ 0.06875571,  0.07840862,  0.05425636,  1.        ]])


In [26]:
# 5.3 Define a function to do the work

def num_corr(df, cols, method="pearson"):
    """Print the correlation matrix of the columns `cols` of `df`.

    The columns are first packed into one vector column (named 'numeric')
    with VectorAssembler, then Correlation.corr() is evaluated on it.

    Parameters
    ----------
    df : Spark DataFrame containing the columns listed in `cols`.
    cols : list of str
        Names of the columns to correlate.
    method : str, default "pearson"
        Correlation method handed to Correlation.corr(); the default keeps
        the behaviour of the two-argument call num_corr(df, cols).

    Returns
    -------
    None. The matrix is printed, with 'nan' rendered as 'NaN'.
    """
    # Imported inside the function so it also works on a fresh kernel:
    # VectorAssembler is not imported by any earlier cell shown in the notebook.
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.stat import Correlation

    assembler = VectorAssembler(
                                inputCols = cols,
                                outputCol = 'numeric'
                               )
    assembled = assembler.transform(df).select('numeric')
    # Correlation.corr() yields a one-row DataFrame; its single cell is the matrix
    corr_matrix = Correlation.corr(assembled, 'numeric', method).collect()[0][0]
    print(str(corr_matrix).replace('nan', 'NaN'))
    

In [27]:
# 5.4 Use the function

cols =  ['age', 'capital_gain', 'capital_loss', 'hours_per_week']
num_corr(df,cols)

DenseMatrix([[ 1.        ,  0.0776745 ,  0.05777454,  0.06875571],
             [ 0.0776745 ,  1.        , -0.03161506,  0.07840862],
             [ 0.05777454, -0.03161506,  1.        ,  0.05425636],
             [ 0.06875571,  0.07840862,  0.05425636,  1.        ]])


### To pandas or to numpy
Maybe for plotting 

In [28]:
# 6.0 Transforming filtered data to numpy array
#     Maybe for plotting:
#     In the recorded output the two-row sample came back as an array of
#     strings (dtype '<U21'), while the full result, which contains None,
#     came back with dtype=object.

age_21_rows = df.filter(df.age == 21)

# Only the first two matching rows
np.array(age_21_rows.take(2))

# OR full data: collect() fetches every matching row

np.array(age_21_rows.collect())


array([['21', 'Private', '197200', 'Some-college', '10', 'Never-married',
        'Machine-op-inspct', 'Own-child', 'White', 'Male', '0', '0',
        '40', 'United-States', '<=50K'],
       ['21', 'Private', '199915', 'Some-college', '10', 'Never-married',
        'Other-service', 'Own-child', 'White', 'Female', '0', '0', '40',
        'United-States', '<=50K']], dtype='<U21')

array([[21, 'Private', 197200, ..., 40, 'United-States', '<=50K'],
       [21, 'Private', 199915, ..., 40, 'United-States', '<=50K'],
       [21, 'Private', 296158, ..., 35, 'United-States', '<=50K'],
       ...,
       [21, 'Private', 67804, ..., 20, 'United-States', '<=50K'],
       [21, None, 205939, ..., 40, 'United-States', '<=50K'],
       [21, 'Private', 182117, ..., 40, 'United-States', '<=50K']],
      dtype=object)

In [29]:
# 6.1 Return a pandas dataframe

abc = df.toPandas()
abc.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


### Null values

In [30]:
# 7.0 Per column how many null values:

from pyspark.sql.functions import isnan, when, count, col

def null_values(data):
    """Show, for every column of `data`, how many rows are NaN or null.

    For each column a count() over a when() expression tallies the rows for
    which isnan(column) or column.isNull() holds; the tally is aliased with
    the column's own name. The resulting one-row table is printed with
    show(); nothing is returned.
    """
    missing_counts = []
    for name in data.columns:
        is_missing = isnan(name) | col(name).isNull()
        missing_counts.append(count(when(is_missing, name)).alias(name))
    data.select(missing_counts).show()

In [31]:
# 7.1 Use the function:

null_values(df)

+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt|education|education_num|marital_status|occupation|relationship|race|sex|capital_gain|capital_loss|hours_per_week|native-country|income|
+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
|  0|     1836|     0|        0|            0|             0|      1843|           0|   0|  0|           0|           0|             0|           583|     0|
+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+



In [32]:
# 7.2 Use where filter
df.select('*').where(df.income.isNull()).count()

0

In [33]:
# 7.3
df.select('*').where(~df.income.isNull()).count()

32561

In [34]:
# 7.4
df = df.select('*').where(~df.income.isNull())

In [35]:
# 7.5
df.select('*').where(df.income.isNull()).count()

0

In [41]:
# 9.0 Getting mode of a feature--step-by-step

workclass_counts = df.groupby('workclass').count()
most_frequent_first = workclass_counts.orderBy("count", ascending = False)

workclass_counts.show(3)                       # rows per workclass, unordered
workclass_counts.orderBy("count").show(3)      # rarest classes first
most_frequent_first.show(3)                    # most frequent classes first
most_frequent_first.first()                    # Print first Row

# 9.0.1 Row object:
top_row = most_frequent_first.first()
top_row['workclass']     # Access values by field name, as with a dict
top_row.workclass        # Access values like attributes
top_row[0]               # Access values by position, as with a tuple

+----------------+-----+
|       workclass|count|
+----------------+-----+
|Self-emp-not-inc| 2541|
|            null| 1836|
|       Local-gov| 2093|
+----------------+-----+
only showing top 3 rows

+------------+-----+
|   workclass|count|
+------------+-----+
|Never-worked|    7|
| Without-pay|   14|
| Federal-gov|  960|
+------------+-----+
only showing top 3 rows

+----------------+-----+
|       workclass|count|
+----------------+-----+
|         Private|22696|
|Self-emp-not-inc| 2541|
|       Local-gov| 2093|
+----------------+-----+
only showing top 3 rows



Row(workclass='Private', count=22696)

'Private'

'Private'

'Private'

In [42]:
# 9.1 Find mode of each column
# Refer: https://stackoverflow.com/a/58279672
# For every column: count the rows per distinct value, sort the counts in
# descending order and keep the value found in the top row.
# Each column is handled by its own groupby/orderBy/first() query.
# NOTE: null forms a group of its own (see the 'workclass' counts in 9.0),
# so a column dominated by nulls would report None here.

[
    [name, df.groupby(name).count().orderBy("count", ascending=False).first()[0]]
    for name in df.columns
]

[['age', 36],
 ['workclass', 'Private'],
 ['fnlwgt', 164190],
 ['education', 'HS-grad'],
 ['education_num', 9],
 ['marital_status', 'Married-civ-spouse'],
 ['occupation', 'Prof-specialty'],
 ['relationship', 'Husband'],
 ['race', 'White'],
 ['sex', 'Male'],
 ['capital_gain', 0],
 ['capital_loss', 0],
 ['hours_per_week', 40],
 ['native-country', 'United-States'],
 ['income', '<=50K']]

#### Fill up NAs

In [43]:
# 9.2 Fill up NAs now:
#     Each of the three columns that showed nulls in 7.1 gets its own
#     most frequent value, as found in 9.1.

df = (
    df.fillna('Private', subset = ['workclass'])
      .fillna('Prof-specialty', subset = ['occupation'])
      .fillna('United-States', subset = ['native-country'])
)

In [44]:
# 9.3 Check if NAs have been filled?

null_values(df)

+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt|education|education_num|marital_status|occupation|relationship|race|sex|capital_gain|capital_loss|hours_per_week|native-country|income|
+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
|  0|        0|     0|        0|            0|             0|         0|           0|   0|  0|           0|           0|             0|             0|     0|
+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+



In [None]:
######################### I am done ######################################

In [None]:
######################################################################
# About OneHotEncoder
# A one-hot encoder that maps a column of category indices to a column of binary vectors,
#  with at most a single one-value per row that indicates the input category index. For
#   example with 5 categories, an input value of 2.0 would map to an output vector of
#   [0.0, 0.0, 1.0, 0.0]. The last category is not included by default (configurable via
#    dropLast) because it makes the vector entries sum up to one, and hence linearly dependent.
#     So an input value of 4.0 maps to [0.0, 0.0, 0.0, 0.0].
# Note
#  This is different from scikit-learn’s OneHotEncoder, which keeps all categories. The output
#    vectors are sparse.
##########################################################################

# About VectorAssembler
# A feature transformer that merges multiple input columns into a single vector
#  column. In section 5.1 of this notebook it combines 'age', 'capital_gain',
#  'capital_loss' and 'hours_per_week' into one vector column named 'numeric',
#  which is then passed to Correlation.corr() in section 5.2.
# Note
#  VectorAssembler does not encode anything; categorical columns have to be
#  converted to numbers first (e.g. with StringIndexer / OneHotEncoder).
##################################################

In [None]:
################################## DONE ###########################