In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Basics').getOrCreate()

In [3]:
df = spark.read.json('./data/yelp_academic_dataset_business.json')

In [4]:
df.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [5]:
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+--------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|          city|               hours|is_open|     latitude|     longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+--------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{null, null, null...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...| Santa Barbara|                null|      0|   34.4266787|  -119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{null, null, null...|mpf3x-BjTdTEA3yCZ...|Shipping Centers,...|        Affton|{8:0-18:30, 0:0-0...|      1|   

In [7]:
df.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [9]:
df.describe()

DataFrame[summary: string, address: string, business_id: string, categories: string, city: string, is_open: string, latitude: string, longitude: string, name: string, postal_code: string, review_count: string, stars: string, state: string]

In [10]:
df.describe().show()

+-------+-----------------+--------------------+--------------------+-----------+-------------------+-----------------+------------------+--------------------+-----------------+------------------+------------------+------+
|summary|          address|         business_id|          categories|       city|            is_open|         latitude|         longitude|                name|      postal_code|      review_count|             stars| state|
+-------+-----------------+--------------------+--------------------+-----------+-------------------+-----------------+------------------+--------------------+-----------------+------------------+------------------+------+
|  count|           150346|              150346|              150243|     150346|             150346|           150346|            150346|              150346|           150346|            150346|            150346|150346|
|   mean|7369.333333333333|                null|                null|       null| 0.7961502135075094|36.6711

In [11]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [13]:
data_schema = [StructField('address', IntegerType(), True), StructField('name', StringType(), True)]

In [14]:
final_struc = StructType(fields=data_schema)

In [16]:
df = spark.read.json('./data/yelp_academic_dataset_business.json', schema= final_struc)

In [17]:
df.printSchema()

root
 |-- address: integer (nullable = true)
 |-- name: string (nullable = true)



In [18]:
#PART TWO

In [20]:
type(df['address'])

pyspark.sql.column.Column

In [21]:
df.select('address').show()

+-------+
|address|
+-------+
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
|   null|
+-------+
only showing top 20 rows



In [22]:
type(df.select('address'))

pyspark.sql.dataframe.DataFrame

In [24]:
df.head(2)[0]

Row(address=None, name='Abby Rappoport, LAC, CMQ')

In [25]:
type(df.head(2)[0])

pyspark.sql.types.Row

In [27]:
df.select(['address','name']).show()

+-------+--------------------+
|address|                name|
+-------+--------------------+
|   null|Abby Rappoport, L...|
|   null|       The UPS Store|
|   null|              Target|
|   null|  St Honore Pastries|
|   null|Perkiomen Valley ...|
|   null|      Sonic Drive-In|
|   null|     Famous Footwear|
|   null|      Temple Beth-El|
|   null|Tsevi's Pub And G...|
|   null|      Sonic Drive-In|
|   null|           Marshalls|
|   null|Vietnamese Food T...|
|   null|             Denny's|
|   null|        Adams Dental|
|   null|Zio's Italian Market|
|   null|            Tuna Bar|
|   null|Arizona Truck Out...|
|   null|      Herb Import Co|
|   null|    Nifty Car Rental|
|   null|                 BAP|
+-------+--------------------+
only showing top 20 rows



In [28]:
df.withColumn('newage', df['address']).show()

+-------+--------------------+------+
|address|                name|newage|
+-------+--------------------+------+
|   null|Abby Rappoport, L...|  null|
|   null|       The UPS Store|  null|
|   null|              Target|  null|
|   null|  St Honore Pastries|  null|
|   null|Perkiomen Valley ...|  null|
|   null|      Sonic Drive-In|  null|
|   null|     Famous Footwear|  null|
|   null|      Temple Beth-El|  null|
|   null|Tsevi's Pub And G...|  null|
|   null|      Sonic Drive-In|  null|
|   null|           Marshalls|  null|
|   null|Vietnamese Food T...|  null|
|   null|             Denny's|  null|
|   null|        Adams Dental|  null|
|   null|Zio's Italian Market|  null|
|   null|            Tuna Bar|  null|
|   null|Arizona Truck Out...|  null|
|   null|      Herb Import Co|  null|
|   null|    Nifty Car Rental|  null|
|   null|                 BAP|  null|
+-------+--------------------+------+
only showing top 20 rows



In [29]:
df.withColumnRenamed('address', 'my_new_age').show()

+----------+--------------------+
|my_new_age|                name|
+----------+--------------------+
|      null|Abby Rappoport, L...|
|      null|       The UPS Store|
|      null|              Target|
|      null|  St Honore Pastries|
|      null|Perkiomen Valley ...|
|      null|      Sonic Drive-In|
|      null|     Famous Footwear|
|      null|      Temple Beth-El|
|      null|Tsevi's Pub And G...|
|      null|      Sonic Drive-In|
|      null|           Marshalls|
|      null|Vietnamese Food T...|
|      null|             Denny's|
|      null|        Adams Dental|
|      null|Zio's Italian Market|
|      null|            Tuna Bar|
|      null|Arizona Truck Out...|
|      null|      Herb Import Co|
|      null|    Nifty Car Rental|
|      null|                 BAP|
+----------+--------------------+
only showing top 20 rows



In [30]:
df.createOrReplaceTempView('people')

In [32]:
results = spark.sql('Select * FROM people')

In [33]:
results.show()

+-------+--------------------+
|address|                name|
+-------+--------------------+
|   null|Abby Rappoport, L...|
|   null|       The UPS Store|
|   null|              Target|
|   null|  St Honore Pastries|
|   null|Perkiomen Valley ...|
|   null|      Sonic Drive-In|
|   null|     Famous Footwear|
|   null|      Temple Beth-El|
|   null|Tsevi's Pub And G...|
|   null|      Sonic Drive-In|
|   null|           Marshalls|
|   null|Vietnamese Food T...|
|   null|             Denny's|
|   null|        Adams Dental|
|   null|Zio's Italian Market|
|   null|            Tuna Bar|
|   null|Arizona Truck Out...|
|   null|      Herb Import Co|
|   null|    Nifty Car Rental|
|   null|                 BAP|
+-------+--------------------+
only showing top 20 rows

