In [1]:
import findspark

In [2]:
findspark.init("/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
spark = SparkSession.builder \
.appName("Filter and Transform Multiple Columns") \
.master("local[2]") \
.getOrCreate()

In [5]:
df = spark.read \
.option("header",True) \
.option("inferSchema", True) \
.option("compression", "gzip") \
.csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")

In [6]:
df2 = df.withColumn("Tags",
                    F.split(F.col("Tags"), ",")
                    .cast(ArrayType(StringType()))) \
.withColumn("Review_Date", F.to_date(F.col("Review_Date"), "M/d/yyyy"))

In [7]:
df2.limit(3).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[[' Leisure trip ', ' Family with young child...",3 days,52.3605759,4.9159683


In [8]:
df2.dtypes

[('Hotel_Address', 'string'),
 ('Additional_Number_of_Scoring', 'int'),
 ('Review_Date', 'date'),
 ('Average_Score', 'double'),
 ('Hotel_Name', 'string'),
 ('Reviewer_Nationality', 'string'),
 ('Negative_Review', 'string'),
 ('Review_Total_Negative_Word_Counts', 'int'),
 ('Total_Number_of_Reviews', 'int'),
 ('Positive_Review', 'string'),
 ('Review_Total_Positive_Word_Counts', 'int'),
 ('Total_Number_of_Reviews_Reviewer_Has_Given', 'int'),
 ('Reviewer_Score', 'double'),
 ('Tags', 'array<string>'),
 ('days_since_review', 'string'),
 ('lat', 'string'),
 ('lng', 'string')]

In [9]:
str_cols = []
for col_name in df2.dtypes:
    if col_name[1] == "string":
        str_cols.append(col_name[0])

In [10]:
str_cols

['Hotel_Address',
 'Hotel_Name',
 'Reviewer_Nationality',
 'Negative_Review',
 'Positive_Review',
 'days_since_review',
 'lat',
 'lng']

In [11]:
for col_name in str_cols:
    df2 = df2.withColumn(col_name, F.trim(col_name))

In [13]:
df2.filter("Reviewer_Nationality == 'United Kingdom'").limit(3).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-31,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk bar...,210,1403,Great location in nice surroundings the bar an...,26,1,3.8,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",3 days,52.3605759,4.9159683
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-17,7.7,Hotel Arena,United Kingdom,Cleaner did not change our sheet and duvet eve...,33,1403,The room is spacious and bright The hotel is l...,18,6,4.6,"[[' Leisure trip ', ' Group ', ' Duplex Twin...",17 days,52.3605759,4.9159683
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-17,7.7,Hotel Arena,United Kingdom,Apart from the price for the brekfast Everythi...,11,1403,Good location Set in a lovely park friendly st...,19,1,10.0,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",17 days,52.3605759,4.9159683


In [15]:
spark.stop()