In [2]:
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest
# $example off$

if __name__ == "__main__":
    # Start (or reuse) the Spark session used by this example.
    spark = (
        SparkSession.builder
        .appName("ChiSquareTestExample")
        .getOrCreate()
    )

    # $example on$
    # Each row pairs a label with a dense two-element feature vector.
    data = [
        (0.0, Vectors.dense(0.5, 10.0)),
        (0.0, Vectors.dense(1.5, 20.0)),
        (1.0, Vectors.dense(1.5, 30.0)),
        (0.0, Vectors.dense(3.5, 30.0)),
        (0.0, Vectors.dense(3.5, 40.0)),
        (1.0, Vectors.dense(3.5, 40.0)),
    ]
    df = spark.createDataFrame(data, schema=["label", "features"])

    # Run the chi-square test of the features against the label and take the
    # first row of the result, which carries the three fields printed below.
    r = ChiSquareTest.test(df, featuresCol="features", labelCol="label").head()
    print(f"pValues: {r.pValues}")
    print(f"degreesOfFreedom: {r.degreesOfFreedom}")
    print(f"statistics: {r.statistics}")
    # $example off$
    df.show()
    spark.stop()

pValues: [0.6872892787909721,0.6822703303362126]
degreesOfFreedom: [2, 3]
statistics: [0.75,1.5]
+-----+----------+
|label|  features|
+-----+----------+
|  0.0|[0.5,10.0]|
|  0.0|[1.5,20.0]|
|  1.0|[1.5,30.0]|
|  0.0|[3.5,30.0]|
|  0.0|[3.5,40.0]|
|  1.0|[3.5,40.0]|
+-----+----------+



In [31]:
import pandas as pd

# Remote copy of the customer data set (Apress "learn-pyspark" repository).
url = 'https://raw.githubusercontent.com/Apress/learn-pyspark/master/chap_2/customer_data.csv'

# Use the first CSV column as the index. It is deliberately NOT passed to
# `parse_dates`: that column holds customer-subtype labels (see the Spark
# aggregation output further down), so date parsing can never succeed and
# only costs time / triggers a parser warning on recent pandas versions.
d = pd.read_csv(url, index_col=0)

# Keep a local copy; a later cell loads "customer_data.csv" with Spark.
d.to_csv("customer_data.csv")

# Summary statistics for customers with a salary above 500000 and more than
# one house. A boolean mask selects exactly those rows; the resulting
# statistics match what `.where(...)` gave, since `describe()` ignores the
# NaN rows that `.where` pads in for non-matching customers.
d.loc[(d['Avg_Salary'] > 500000) & (d['Number_of_houses'] > 1)].describe()

Unnamed: 0,Number_of_houses,Avg_size_household,Avg_Salary,label
count,12.0,12.0,12.0,12.0
mean,2.333333,2.75,3170240.0,0.166667
std,0.492366,0.866025,6651876.0,0.389249
min,2.0,2.0,596723.0,0.0
25%,2.0,2.0,744382.2,0.0
50%,2.0,2.5,886349.0,0.0
75%,3.0,3.25,961325.2,0.0
max,3.0,4.0,23723160.0,1.0


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *  # star import kept: later cells use unqualified type names (e.g. StructType)
import pyspark.sql.functions as F

# Start (or reuse) the Spark session for the data-processing cells below.
spark = (
    SparkSession.builder
    .appName('data_processing')
    .getOrCreate()
)

In [5]:
# Explicit schema for the hand-built frame: four string columns followed by
# an integer age column.
schema = (
    StructType()
    .add("user_id", "string")
    .add("country", "string")
    .add("browser", "string")
    .add("OS", "string")
    .add("age", "integer")
)

In [6]:
# Three sample rows in schema order: user_id, country, browser, OS, age.
df = spark.createDataFrame(
    [
        ("A203", 'India', "Chrome", "WIN", 33),
        ("A201", 'China', "Safari", "MacOS", 35),
        ("A205", 'UK', "Mozilla", "Linux", 25),
    ],
    schema=schema,
)

In [8]:
# Inspect the current `df`: print its column names and types first, then
# render its rows as a text table.
df.printSchema()
df.show()

root
 |-- user_id: string (nullable = true)
 |-- country: string (nullable = true)
 |-- browser: string (nullable = true)
 |-- OS: string (nullable = true)
 |-- age: integer (nullable = true)

+-------+-------+-------+-----+---+
|user_id|country|browser|   OS|age|
+-------+-------+-------+-----+---+
|   A203|  India| Chrome|  WIN| 33|
|   A201|  China| Safari|MacOS| 35|
|   A205|     UK|Mozilla|Linux| 25|
+-------+-------+-------+-----+---+



In [18]:
# Load the local customer CSV into Spark. `header` takes column names from the
# first line; `inferSchema` lets Spark derive column types from the data.
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("customer_data.csv")
)

In [40]:
# For every column except 'Avg_Salary', show how many rows carry each distinct
# value, most frequent value first.
for column_name in (name for name in df.columns if name != 'Avg_Salary'):
    print(f" Aggregation for  {column_name}")
    (
        df.groupBy(column_name)
        .count()
        .sort('count', ascending=False)
        .show(truncate=False)
    )

 Aggregation for  Customer_subtype
+------------------------------------------+-----+
|Customer_subtype                          |count|
+------------------------------------------+-----+
|Lower class large families                |288  |
|Traditional families                      |129  |
|Middle class families                     |122  |
|Large religious families                  |107  |
|Modern, complete families                 |93   |
|Couples with teens 'Married with children'|83   |
|Young and rising                          |78   |
|High status seniors                       |76   |
|Low income catholics                      |72   |
|Mixed seniors                             |71   |
|Village families                          |68   |
|Mixed rurals                              |67   |
|Stable family                             |62   |
|Young all american family                 |62   |
|Young, low educated                       |56   |
|Large family, employed child              |56 

In [43]:
# For each customer subtype, collect the distinct Number_of_houses values seen.
(
    df.groupBy("Customer_subtype")
    .agg(F.collect_set("Number_of_houses"))
    .show()
)

+--------------------+-----------------------------+
|    Customer_subtype|collect_set(Number_of_houses)|
+--------------------+-----------------------------+
|Large family, emp...|                       [1, 2]|
|Religious elderly...|                       [1, 2]|
|Large religious f...|                       [1, 2]|
|Modern, complete ...|                       [1, 2]|
|    Village families|                       [1, 2]|
|Young all america...|                       [1, 2]|
|Young urban have-...|                       [1, 2]|
|Young seniors in ...|                    [1, 2, 3]|
|Fresh masters in ...|                          [1]|
|High Income, expe...|                          [1]|
|Lower class large...|                       [1, 2]|
| Residential elderly|                    [1, 2, 3]|
|Senior cosmopolitans|                          [3]|
|        Mixed rurals|                          [1]|
|Career and childcare|                       [1, 2]|
|Low income catholics|                        