In [1]:
import findspark

In [2]:
# Point findspark at the local Spark installation (the SPARK_HOME directory)
# so that the pyspark package becomes importable in this kernel.
findspark.init(spark_home="/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F

In [4]:
# Create (or reuse) the Spark session for this notebook, running locally
# with the "local[2]" master.
spark = (
    SparkSession.builder
    .appName("Filter Multiple Columns")
    .master("local[2]")
    .getOrCreate()
)

2022-08-27 14:36:56,734 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Data source: https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe

In [5]:
#! wget -O ~/datasets/Hotel_Reviews.csv.gz \
#https://github.com/erkansirin78/datasets/raw/master/Hotel_Reviews.csv.gz

In [6]:
# Shell check: list the entries of ~/datasets whose listing line contains "Hotel".
! ls -l ~/datasets | grep Hotel

-rw-rw-r--. 1 train train 46401315 Aug 27 11:25 Hotel_Reviews.csv.gz


In [7]:
from pyspark.sql.types import *

# Explicit schema for the hotel reviews CSV, written as (column name, Spark type)
# pairs. The order of the pairs is the column order of the schema, and every
# column is declared nullable.
_schema_columns = [
    ("Hotel_Address", StringType()),
    ("Additional_Number_of_Scoring", IntegerType()),
    ("Review_Date", StringType()),
    ("Average_Score", FloatType()),
    ("Hotel_Name", StringType()),
    ("Reviewer_Nationality", StringType()),
    ("Negative_Review", StringType()),
    ("Review_Total_Negative_Word_Counts", IntegerType()),
    ("Total_Number_of_Reviews", IntegerType()),
    ("Positive_Review", StringType()),
    ("Review_Total_Positive_Word_Counts", IntegerType()),
    ("Total_Number_of_Reviews_Reviewer_Has_Given", IntegerType()),
    ("Reviewer_Score", FloatType()),
    ("Tags", StringType()),
    ("days_since_review", StringType()),
    ("lat", FloatType()),
    ("lng", FloatType()),
]

programmatical_schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _schema_columns]
)

# StructField("Tags",ArrayType(StringType()),True)
# Tags is really an array, but CSV cannot store array types,
# so it has to be declared as StringType here.

# Review_Date is still StringType() but should be DateType().
# For the moment we intentionally leave it as StringType();
# once the schema has been applied to the data we will convert it.

In [8]:
# Read the gzip-compressed CSV from the local filesystem, treating the first
# line as a header and applying the explicit schema.
df = (
    spark.read
    .option("header", True)
    .schema(programmatical_schema)
    .option("compression","gzip")
    .csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")
)

In [9]:
# Fix the two columns that were deliberately loaded as plain strings:
#   * Tags        -> split on "," and cast to array<string>
#   * Review_Date -> parsed into a date using the "M/d/yyyy" pattern
# NOTE(review): the split is purely comma-based, so the elements appear to keep
# the original bracket/quote characters (see the preview output) - confirm
# whether further cleaning is wanted.
tags_as_array = F.split(F.col("Tags"), ",").cast(ArrayType(StringType()))
review_date_as_date = F.to_date(F.col("Review_Date"), "M/d/yyyy")

df2 = (
    df
    .withColumn("Tags", tags_as_array)
    .withColumn("Review_Date", review_date_as_date)
)

In [10]:
# Bring only two rows to the driver as a pandas DataFrame for a quick visual check.
df2.limit(2).toPandas()

                                                                                

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968


In [11]:
# Confirm that Tags is now an array of strings and Review_Date is a date.
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: float (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: float (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lng: float (nullable = true)



# Method-1: Using reduce and lambda

In [12]:
# trim all columns

In [13]:
# dtypes is a list of (column name, type name) tuples; it is used below to pick the string columns.
df2.dtypes

[('Hotel_Address', 'string'),
 ('Additional_Number_of_Scoring', 'int'),
 ('Review_Date', 'date'),
 ('Average_Score', 'float'),
 ('Hotel_Name', 'string'),
 ('Reviewer_Nationality', 'string'),
 ('Negative_Review', 'string'),
 ('Review_Total_Negative_Word_Counts', 'int'),
 ('Total_Number_of_Reviews', 'int'),
 ('Positive_Review', 'string'),
 ('Review_Total_Positive_Word_Counts', 'int'),
 ('Total_Number_of_Reviews_Reviewer_Has_Given', 'int'),
 ('Reviewer_Score', 'float'),
 ('Tags', 'array<string>'),
 ('days_since_review', 'string'),
 ('lat', 'float'),
 ('lng', 'float')]

In [14]:
# Show each (column name, type name) pair on its own line.
for name_and_type in df2.dtypes:
    print(name_and_type)

('Hotel_Address', 'string')
('Additional_Number_of_Scoring', 'int')
('Review_Date', 'date')
('Average_Score', 'float')
('Hotel_Name', 'string')
('Reviewer_Nationality', 'string')
('Negative_Review', 'string')
('Review_Total_Negative_Word_Counts', 'int')
('Total_Number_of_Reviews', 'int')
('Positive_Review', 'string')
('Review_Total_Positive_Word_Counts', 'int')
('Total_Number_of_Reviews_Reviewer_Has_Given', 'int')
('Reviewer_Score', 'float')
('Tags', 'array<string>')
('days_since_review', 'string')
('lat', 'float')
('lng', 'float')


In [15]:
# trim() only applies to string columns, so collect the names of the columns
# whose type name is exactly 'string'.
str_cols = [name for name, type_name in df2.dtypes if type_name == 'string']
print(str_cols)

['Hotel_Address', 'Hotel_Name', 'Reviewer_Nationality', 'Negative_Review', 'Positive_Review', 'days_since_review']


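    # list comprehension with a filter (inline for loop with if):
    # [expression for item in collection if condition]
    # list comprehension with if/else (inline for loop with if else):
    # [value_if_true if condition else value_if_false for item in collection]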

# Method-1

In [16]:
# We will use the reduce to transform multiple columns
from functools import reduce

In [17]:
# format
# reduce(lambda df, col: df.withColumn(col, trim(col)), [col1, col2, .., coln], df_to_trim)
# returns a dataframe

In [18]:
# Fold over the string column names, starting from df2: each step returns a
# new DataFrame in which one more column is replaced by its trimmed version.
df3 = reduce(
    lambda trimmed_so_far, name: trimmed_so_far.withColumn(name, F.trim(F.col(name))),
    str_cols,
    df2,
)

In [19]:
# Preview five rows of the reduce-trimmed frame whose Reviewer_Nationality
# equals 'United Kingdom'.
(
    df3
    .filter(" Reviewer_Nationality == 'United Kingdom' ")
    .limit(5)
    .toPandas()
)

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-31,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk bar...,210,1403,Great location in nice surroundings the bar an...,26,1,3.8,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",3 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-17,7.7,Hotel Arena,United Kingdom,Cleaner did not change our sheet and duvet eve...,33,1403,The room is spacious and bright The hotel is l...,18,6,4.6,"[[' Leisure trip ', ' Group ', ' Duplex Twin...",17 days,52.360577,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-17,7.7,Hotel Arena,United Kingdom,Apart from the price for the brekfast Everythi...,11,1403,Good location Set in a lovely park friendly st...,19,1,10.0,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",17 days,52.360577,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-07,7.7,Hotel Arena,United Kingdom,Nothing all great,5,1403,Rooms were stunningly decorated and really spa...,101,2,10.0,"[[' Leisure trip ', ' Group ', ' Duplex Doub...",27 days,52.360577,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-06,7.7,Hotel Arena,United Kingdom,The floor in my room was filfy dirty Very basi...,28,1403,Comfy bed good location,6,7,4.6,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",28 days,52.360577,4.915968


# Method-2: Using for loop

In [25]:
# The string column names collected earlier; the loop-based trim below iterates over them.
str_cols

['Hotel_Address',
 'Hotel_Name',
 'Reviewer_Nationality',
 'Negative_Review',
 'Positive_Review',
 'days_since_review']

In [26]:
# Same trimming as Method-1, written as an explicit loop: start from df2 and
# rebind df4 to a new DataFrame after each string column has been trimmed.
df4 = df2
for string_column in str_cols:
    df4 = df4.withColumn(string_column, F.trim(string_column))

In [27]:
# Preview five rows of the loop-trimmed frame whose Reviewer_Nationality
# equals 'United Kingdom'.
(
    df4
    .filter(" Reviewer_Nationality == 'United Kingdom' ")
    .limit(5)
    .toPandas()
)

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-31,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk bar...,210,1403,Great location in nice surroundings the bar an...,26,1,3.8,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",3 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-17,7.7,Hotel Arena,United Kingdom,Cleaner did not change our sheet and duvet eve...,33,1403,The room is spacious and bright The hotel is l...,18,6,4.6,"[[' Leisure trip ', ' Group ', ' Duplex Twin...",17 days,52.360577,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-17,7.7,Hotel Arena,United Kingdom,Apart from the price for the brekfast Everythi...,11,1403,Good location Set in a lovely park friendly st...,19,1,10.0,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",17 days,52.360577,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-07,7.7,Hotel Arena,United Kingdom,Nothing all great,5,1403,Rooms were stunningly decorated and really spa...,101,2,10.0,"[[' Leisure trip ', ' Group ', ' Duplex Doub...",27 days,52.360577,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-06,7.7,Hotel Arena,United Kingdom,The floor in my room was filfy dirty Very basi...,28,1403,Comfy bed good location,6,7,4.6,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",28 days,52.360577,4.915968


## List columns that include null

In [31]:
# All column names of the trimmed frame; the null check below iterates over them.
df4.columns

['Hotel_Address',
 'Additional_Number_of_Scoring',
 'Review_Date',
 'Average_Score',
 'Hotel_Name',
 'Reviewer_Nationality',
 'Negative_Review',
 'Review_Total_Negative_Word_Counts',
 'Total_Number_of_Reviews',
 'Positive_Review',
 'Review_Total_Positive_Word_Counts',
 'Total_Number_of_Reviews_Reviewer_Has_Given',
 'Reviewer_Score',
 'Tags',
 'days_since_review',
 'lat',
 'lng']

In [32]:
# Report every column that contains NULL values.
# All per-column counts are computed in ONE aggregation, so Spark runs a single
# job over the data instead of a separate filter().count() job per column.
# F.when() without otherwise() yields NULL for rows that do not match, and
# F.count() ignores NULLs, so each aggregate is the number of NULLs in a column.
null_counts = df4.select(
    [
        F.count(F.when(F.col(name).isNull(), 1)).alias(name)
        for name in df4.columns
    ]
).first().asDict()

for col in df4.columns:
    col_count = null_counts[col]
    if col_count > 0:
        print("{} has {} null values.".format(col, col_count))

                                                                                

lat has 3268 null values.


[Stage 35:>                                                         (0 + 1) / 1]

lng has 3268 null values.


                                                                                

## List "" values (empty strings, i.e. hidden nulls); this is only possible for string columns.

In [33]:
# Report every string column that contains empty strings ("hidden" nulls).
# The counts for all string columns are computed in ONE aggregation, so Spark
# runs a single job instead of a separate filter().count() job per column.
# Rows where the column is NULL do not satisfy `== ""`, so F.when() yields NULL
# for them and F.count() skips them, just as the per-column filter did.
blank_count_exprs = [
    F.count(F.when(F.col(name) == "", 1)).alias(name) for name in str_cols
]
# Guard against an empty column list: there is nothing to aggregate then.
blank_counts = (
    df4.select(blank_count_exprs).first().asDict() if blank_count_exprs else {}
)

for col in str_cols:
    col_count = blank_counts[col]
    if col_count > 0:
        print("{} has {} hidden null values.".format(col, col_count))

                                                                                

Reviewer_Nationality has 523 hidden null values.


[Stage 43:>                                                         (0 + 1) / 1]

Negative_Review has 849 hidden null values.


                                                                                

Positive_Review has 183 hidden null values.


                                                                                

In [34]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()