In [None]:
# Smoke-test cell: prints a fixed string (presumably to confirm the kernel runs cells).
print("test!!!")

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start one named "Basics".
# NOTE(review): JAVA_HOME is only set in a later cell of this notebook; on a fresh
# kernel this cell runs first, so the JVM may start with a different Java - confirm.
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)


In [116]:
# Select the JDK for this process and confirm that this exact JDK starts.
# NOTE(review): the SparkSession cell sits above this one in the notebook; an
# environment variable only affects processes started after it is set, so this
# cell presumably has to run before the session is created - confirm cell order.

import os
import subprocess

# Machine-specific JDK location; adjust when running on another machine.
JAVA_HOME = '/Library/Java/JavaVirtualMachines/jdk1.8.0_241.jdk/Contents/Home'


def use_java_home(java_home):
    """Export ``JAVA_HOME`` and return the version banner of that JDK.

    Parameters
    ----------
    java_home : str
        Directory of the JDK installation (the one containing ``bin/java``).

    Returns
    -------
    bytes
        Combined stdout/stderr of ``<java_home>/bin/java -version``.

    Raises
    ------
    FileNotFoundError
        If ``<java_home>/bin/java`` does not exist.
    subprocess.CalledProcessError
        If the Java executable exits with a non-zero status.
    """
    os.environ['JAVA_HOME'] = java_home
    # Run the binary inside java_home rather than whatever `java` is first on
    # PATH, so the check really verifies the JDK that JAVA_HOME points at.
    java_binary = os.path.join(java_home, 'bin', 'java')
    # `java -version` writes its banner to stderr, hence the redirect.
    return subprocess.check_output([java_binary, '-version'], stderr=subprocess.STDOUT)


use_java_home(JAVA_HOME)


b'java version "1.8.0_241"\nJava(TM) SE Runtime Environment (build 1.8.0_241-b07)\nJava HotSpot(TM) 64-Bit Server VM (build 25.241-b07, mixed mode)\n'

In [117]:
# Read the parquet dataset at data/final_join.parquet into a Spark DataFrame.
df = spark.read.format("parquet").load("data/final_join.parquet")


In [118]:
# Preview the freshly loaded frame: first 20 rows, long cell values truncated.
df.show(n=20, truncate=True)

+---------+------------+---------+------+----------+--------------------+--------------+-----------------+----------+--------------------+--------------------+--------------------------+
|person_id|   firstName| lastName|gender|  birthday| creationDate_person|    locationIP|      browserUsed|comment_id|creationDate_comment|               email|creationDate_comment_month|
+---------+------------+---------+------+----------+--------------------+--------------+-----------------+----------+--------------------+--------------------+--------------------------+
|     8217|   Aleksandr|     Bajt|  male|1986-01-10|2019-01-25 18:54:...|109.239.36.182|          Firefox|      2573|2019-03-13 06:21:...|Aleksandr8217@hot...|                         3|
|    17973|          Li|      Wei|female|1986-06-29|2019-01-24 02:05:...|     1.4.3.239|           Chrome|     11826|2019-09-03 11:13:...|     Li17973@gmx.com|                         9|
|     3817|         Jun|    Zhang|female|1984-08-19|2019-08-17 10

First we'll drop some columns and derive some new ones; after that we'll do some basic data analysis,
largely based on this post: https://medium.com/@aieeshashafique/exploratory-data-analysis-using-pyspark-dataframe-in-python-bd55c02a2852
(part 1, this file).

Then we'll do some more advanced EDA,
after which we'll choose a cutoff value that allows us to define "an inactive user" (part 2).

In [119]:
# Feature preparation: drop the name columns, derive calendar features from the
# comment timestamp, reduce the e-mail address to its domain and replace the
# birthday by an age.
from pyspark.sql.functions import dayofmonth, dayofweek, dayofyear, split, year

# Year against which ages are computed. Hardcoding it is not ideal, but
# allowable for our purposes; it is a parameter of prepare_features so it can
# be changed in one place.
REFERENCE_YEAR = 2020


def prepare_features(frame, reference_year=REFERENCE_YEAR):
    """Return a copy of ``frame`` with the analysis features added.

    The transformation is built on a local variable and returned in one go, so
    a failure half-way through never leaves the caller with a partially
    transformed frame.

    Parameters
    ----------
    frame : pyspark.sql.DataFrame
        Frame with at least the columns ``creationDate_comment``, ``email``
        and ``birthday``; ``firstName``/``lastName`` are dropped if present.
    reference_year : int, optional
        Year from which the birth year is subtracted to obtain ``age``.

    Returns
    -------
    pyspark.sql.DataFrame
        ``frame`` without the name and ``birthday`` columns, with
        ``creationDate_comment_{dayofmonth,dayofweek,dayofyear,year}`` and
        ``age`` added and ``email`` holding only the part after the ``@``.
    """
    # Re-running the cell on an already prepared frame would split the bare
    # domain again (yielding null e-mails) and then fail on the missing
    # birthday column, so an already prepared frame is returned unchanged.
    if 'age' in frame.columns and 'birthday' not in frame.columns:
        return frame

    # Dropping first- and lastname columns
    prepared = frame.drop('firstName', 'lastName')

    # Adding a lot of datetime columns, named after the function that builds them
    for date_part in (dayofmonth, dayofweek, dayofyear, year):
        prepared = prepared.withColumn(
            f"creationDate_comment_{date_part.__name__}",
            date_part(prepared['creationDate_comment']),
        )

    # Extracting domain name from emails (the element after the '@')
    prepared = prepared.withColumn('email', split(prepared['email'], '@')[1])

    # Age in whole years relative to reference_year; birthday is no longer needed
    return (
        prepared
        .withColumn('age', reference_year - year(prepared['birthday']))
        .drop('birthday')
    )


df = prepare_features(df)


In [120]:
# Preview the frame after the column changes: first 20 rows, long values truncated.
df.show(n=20, truncate=True)

+---------+------+--------------------+--------------+-----------------+----------+--------------------+-----------+--------------------------+-------------------------------+------------------------------+------------------------------+-------------------------+---+
|person_id|gender| creationDate_person|    locationIP|      browserUsed|comment_id|creationDate_comment|      email|creationDate_comment_month|creationDate_comment_dayofmonth|creationDate_comment_dayofweek|creationDate_comment_dayofyear|creationDate_comment_year|age|
+---------+------+--------------------+--------------+-----------------+----------+--------------------+-----------+--------------------------+-------------------------------+------------------------------+------------------------------+-------------------------+---+
|     8217|  male|2019-01-25 18:54:...|109.239.36.182|          Firefox|      2573|2019-03-13 06:21:...|hotmail.com|                         3|                             13|                     

In [121]:
# 1. Schema of the PySpark DataFrame: prints each column with its type and nullability

df.printSchema()


root
 |-- person_id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- creationDate_person: timestamp (nullable = true)
 |-- locationIP: string (nullable = true)
 |-- browserUsed: string (nullable = true)
 |-- comment_id: integer (nullable = true)
 |-- creationDate_comment: timestamp (nullable = true)
 |-- email: string (nullable = true)
 |-- creationDate_comment_month: integer (nullable = true)
 |-- creationDate_comment_dayofmonth: integer (nullable = true)
 |-- creationDate_comment_dayofweek: integer (nullable = true)
 |-- creationDate_comment_dayofyear: integer (nullable = true)
 |-- creationDate_comment_year: integer (nullable = true)
 |-- age: integer (nullable = true)



In [122]:
# 2. Show your PySpark DataFrame (first 20 rows, long cell values truncated)

df.show(n=20, truncate=True)

+---------+------+--------------------+--------------+-----------------+----------+--------------------+-----------+--------------------------+-------------------------------+------------------------------+------------------------------+-------------------------+---+
|person_id|gender| creationDate_person|    locationIP|      browserUsed|comment_id|creationDate_comment|      email|creationDate_comment_month|creationDate_comment_dayofmonth|creationDate_comment_dayofweek|creationDate_comment_dayofyear|creationDate_comment_year|age|
+---------+------+--------------------+--------------+-----------------+----------+--------------------+-----------+--------------------------+-------------------------------+------------------------------+------------------------------+-------------------------+---+
|     8217|  male|2019-01-25 18:54:...|109.239.36.182|          Firefox|      2573|2019-03-13 06:21:...|hotmail.com|                         3|                             13|                     

In [123]:
# 3. Count function of PySpark DataFrame: returns the number of rows in df

df.count()


40989

In [124]:
# 4. Statistical properties of each column (not that interesting for this
#    dataframe; it might even be better to drop this step altogether)

summary_columns = ('gender', 'browserUsed', 'creationDate_comment_month')
for column_name in summary_columns:
    # Header line first, then count/mean/stddev/min/max for that single column.
    print(column_name)
    df.describe(column_name).show()


gender
+-------+------+
|summary|gender|
+-------+------+
|  count| 40989|
|   mean|  null|
| stddev|  null|
|    min|female|
|    max|  male|
+-------+------+

browserUsed
+-------+-----------+
|summary|browserUsed|
+-------+-----------+
|  count|      40989|
|   mean|       null|
| stddev|       null|
|    min|     Chrome|
|    max|     Safari|
+-------+-----------+

creationDate_comment_month
+-------+--------------------------+
|summary|creationDate_comment_month|
+-------+--------------------------+
|  count|                     40989|
|   mean|         7.693039595989168|
| stddev|        2.6304448992200773|
|    min|                         1|
|    max|                        11|
+-------+--------------------------+



# 5. Find unique values of a categorical column
categorical_columns = ['browserUsed', 'gender']
for column_name in categorical_columns:
    # Collect the distinct rows straight from the DataFrame; going through
    # .rdd.map(...) adds a DataFrame-to-RDD conversion without changing the result.
    print([row[0] for row in df.select(column_name).distinct().collect()])



In [125]:
#7. Count the missing values in a column of PySpark Dataframe

from pyspark.sql import functions as F

# One aggregation over the whole frame instead of one filter+count job per
# column: when(...) yields 1 for null cells and null otherwise, and count()
# only counts the non-null results, i.e. the null cells of that column.
null_counts = df.select(
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first()

for name in df.columns:
    print(name, "with null values: ", null_counts[name])
    

person_id with null values:  0
gender with null values:  0
creationDate_person with null values:  0
locationIP with null values:  0
browserUsed with null values:  0
comment_id with null values:  0
creationDate_comment with null values:  0
email with null values:  0
creationDate_comment_month with null values:  0
creationDate_comment_dayofmonth with null values:  0
creationDate_comment_dayofweek with null values:  0
creationDate_comment_dayofyear with null values:  0
creationDate_comment_year with null values:  0
age with null values:  0


In [126]:
# Persist the prepared frame for part 2, replacing any previous export at this path.
df.write.mode('overwrite').save("data/Data_analysis_part1.parquet")


