### Hypothesis Testing - Chi-Square Test

In [1]:
# Import SparkSession and create (or reuse) the session used by the cells below
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("HypothesisTesting")
    .getOrCreate()
)

In [2]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest

# (label, feature values) pairs; each feature tuple becomes a dense vector below
labelled_points = [
    (0.0, (0.5, 10.0)),
    (0.0, (1.5, 20.0)),
    (1.0, (1.5, 30.0)),
    (0.0, (3.5, 30.0)),
    (0.0, (3.5, 40.0)),
    (1.0, (3.5, 40.0)),
]
data = [(label, Vectors.dense(*values)) for label, values in labelled_points]
df = spark.createDataFrame(data, ["label", "features"])

# Run the chi-square test of every feature against the label and take the
# first row of the result, which carries one entry per feature.
chi_square_result = ChiSquareTest.test(df, "features", "label")
r = chi_square_result.head()

print(f"pValues: {r.pValues!s}")
print(f"degreesOfFreedom: {r.degreesOfFreedom!s}")
print(f"statistics: {r.statistics!s}")

pValues: [0.6872892787909721,0.6822703303362126]
degreesOfFreedom: [2, 3]
statistics: [0.75,1.5]


### Hypothesis Testing - Kolmogorov-Smirnov Test

In [3]:
from pyspark.mllib.stat import Statistics

# Small sample to compare against the theoretical distribution
sample_values = [0.1, 0.15, 0.2, 0.3, 0.25]
parallelData = spark.sparkContext.parallelize(sample_values)

# Run a KS test of the sample against a standard normal distribution;
# the two trailing arguments are the distribution's mean and standard deviation.
norm_mean, norm_stddev = 0, 1
testResult = Statistics.kolmogorovSmirnovTest(
    parallelData, "norm", norm_mean, norm_stddev
)

# The printed summary includes the p-value, the test statistic and the null
# hypothesis; a p-value that indicates significance lets us reject the null.
# Note: the Scala variant of Statistics.kolmogorovSmirnovTest that accepts a
# lambda computing the CDF is not available in the Python API.
print(testResult)


Kolmogorov-Smirnov test summary:
degrees of freedom = 0 
statistic = 0.539827837277029 
pValue = 0.06821463111921133 
Low presumption against null hypothesis: Sample follows theoretical distribution.


### Random data generation

In [4]:
from pyspark.mllib.random import RandomRDDs

# Generate a random double RDD that contains 1 million i.i.d. values drawn from the
# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
# A fixed seed (the same value the sampling cells use) keeps the generated
# values reproducible across notebook runs.
u = RandomRDDs.normalRDD(spark.sparkContext, 1000000, 10, seed=23)
# Apply a transform to get a random double RDD following `N(1, 4)`
# (mean 1, variance 4, i.e. standard deviation 2).
v = u.map(lambda x: 1.0 + 2.0 * x)

### Sampling

In [5]:
# Read data: CSV reader settings (schema inference is off, so every column is
# loaded as a string)
file_location = "churn_modelling.csv"
file_type = "csv"
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

# Keep only the columns used in the sampling examples
sampling_columns = ['Geography', 'NumOfProducts', 'Age', 'Gender', 'Tenure', 'Exited']
df = df.select(*sampling_columns)

# Simple Random Sampling without replacement
df_srs_without_rep = df.sample(withReplacement=False, fraction=0.5, seed=23)
df_srs_without_rep.show(n=10, truncate=False)

+---------+-------------+---+------+------+------+
|Geography|NumOfProducts|Age|Gender|Tenure|Exited|
+---------+-------------+---+------+------+------+
|France   |3            |42 |Female|8     |1     |
|France   |2            |39 |Female|1     |0     |
|France   |2            |50 |Male  |7     |0     |
|France   |1            |27 |Male  |2     |0     |
|France   |2            |31 |Male  |6     |0     |
|Germany  |2            |45 |Male  |3     |0     |
|Germany  |1            |58 |Male  |1     |1     |
|France   |2            |41 |Male  |8     |0     |
|Spain    |2            |32 |Female|8     |0     |
|Spain    |1            |38 |Female|4     |1     |
+---------+-------------+---+------+------+------+
only showing top 10 rows



In [6]:
# Simple Random Sampling with replacement
# Stored under its own name so the with-replacement sample is not mislabelled
# and does not overwrite the without-replacement sample.
df_srs_with_rep = df.sample(withReplacement=True, fraction=0.5, seed=23)
df_srs_with_rep.show(n=10, truncate=False)


+---------+-------------+---+------+------+------+
|Geography|NumOfProducts|Age|Gender|Tenure|Exited|
+---------+-------------+---+------+------+------+
|Spain    |1            |41 |Female|1     |0     |
|France   |2            |39 |Female|1     |0     |
|Spain    |1            |43 |Female|2     |0     |
|Germany  |4            |29 |Female|4     |1     |
|France   |2            |31 |Male  |6     |0     |
|France   |2            |31 |Male  |6     |0     |
|France   |2            |25 |Female|5     |0     |
|Spain    |2            |32 |Female|8     |0     |
|Spain    |1            |38 |Female|4     |1     |
|France   |2            |46 |Male  |3     |0     |
+---------+-------------+---+------+------+------+
only showing top 10 rows



In [7]:
from pyspark.sql.types import IntegerType

# Replace the Exited column with an integer-typed version of itself so it can
# be summarised numerically and matched against integer strata keys.
exited_as_int = df["Exited"].cast(IntegerType())
df = df.withColumn("Exited", exited_as_int)

In [8]:
# Summary statistics of Exited on the full dataset, for comparison with the sample
exited_summary_full = df.select('Exited').describe()
exited_summary_full.show()

# Stratified sampling: keep the same fraction of rows from each Exited class (0 and 1)
strata_fractions = {0: 0.5, 1: 0.5}
stratified_sampled = df.sampleBy(col="Exited", fractions=strata_fractions, seed=23)

+-------+-------------------+
|summary|             Exited|
+-------+-------------------+
|  count|              10000|
|   mean|             0.2037|
| stddev|0.40276858399486065|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [9]:
# Summary statistics of Exited on the stratified sample
exited_summary_sampled = stratified_sampled.select('Exited').describe()
exited_summary_sampled.show()

+-------+-------------------+
|summary|             Exited|
+-------+-------------------+
|  count|               5022|
|   mean| 0.2078853046594982|
| stddev|0.40583469642655867|
|    min|                  0|
|    max|                  1|
+-------+-------------------+

