In [1]:
import pyspark
import pandas as pd
import numpy as np
# Reuse the active SparkSession if there is one; otherwise start a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
# Small 3x2 pandas frame of string labels, indexed 1..3, built column by column.
pd_df = pd.DataFrame(
    {
        'col1': ['r1c1', 'r2c1', 'r3c1'],
        'col2': ['r1c2', 'r2c2', 'r3c2'],
    },
    index=[1, 2, 3],
)

In [3]:
# Convert the pandas frame to a Spark DataFrame. The pandas index (1, 2, 3)
# is not carried over: only col1 and col2 appear in the Spark schema.
sp_df = spark.createDataFrame(pd_df)
# Displaying a Spark DataFrame shows only its schema, not its rows.
sp_df

DataFrame[col1: string, col2: string]

In [4]:
# For comparison: pandas renders the rows together with the index.
pd_df

Unnamed: 0,col1,col2
1,r1c1,r1c2
2,r2c1,r2c2
3,r3c1,r3c2


In [5]:
# .show() prints the rows of the Spark DataFrame as a text table.
sp_df.show()

+----+----+
|col1|col2|
+----+----+
|r1c1|r1c2|
|r2c1|r2c2|
|r3c1|r3c2|
+----+----+



In [6]:
# Example (not executed): read a CSV file into a Spark DataFrame.
# spark.read.load('myfile.csv', format='csv', sep=',')

In [8]:
from pydataset import data

# Load the 'mpg' sample dataset through pydataset's data() and convert it
# to a Spark DataFrame.
mpg = spark.createDataFrame(data('mpg'))
# Preview the first 5 rows.
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [9]:
# Select `year` alongside a derived column (year + 1) named 'testing'.
mpg.select(mpg.year, (mpg.year+1).alias('testing')).show(5)

+----+-------+
|year|testing|
+----+-------+
|1999|   2000|
|1999|   2000|
|2008|   2009|
|2008|   2009|
|1999|   2000|
+----+-------+
only showing top 5 rows



In [12]:
# Explicit import list (the names the author originally listed) instead of
# `from pyspark.sql.functions import *`. The wildcard form rebinds Python
# builtins such as sum, min, max, abs and round to their Spark column
# versions, which silently breaks any later cell expecting the builtin, and
# it hides where each name comes from.
# NOTE: `sum` is kept for compatibility with the original list and still
# shadows the builtin; use `builtins.sum` where the Python version is needed.
from pyspark.sql.functions import (
    col,
    expr,
    sum,
    avg,
    concat,
    lit,
    regexp_extract,
    regexp_replace,
)

In [13]:
# Column expression for `hwy` renamed to 'highway'. Its value is discarded:
# only the last expression in a cell is displayed.
col('hwy').alias('highway')
# mpg.hwy  (attribute-style reference to the column on the DataFrame)
# The same alias written as a SQL expression string; this one is displayed.
expr('hwy AS highway')

Column<b'hwy AS `highway`'>

In [14]:
# Register mpg as a temporary view named 'view' so it can be queried via spark.sql.
mpg.createOrReplaceTempView('view')

In [15]:
# Query the 'view' temp view with SQL: hwy, cty, and their per-row mean
# (aliased as avg); print the first 5 rows.
spark.sql(
'''
SELECT hwy, cty, (hwy+cty) / 2 AS avg
FROM view
''').show(5)

+---+---+----+
|hwy|cty| avg|
+---+---+----+
| 29| 18|23.5|
| 29| 21|25.0|
| 31| 20|25.5|
| 30| 21|25.5|
| 26| 16|21.0|
+---+---+----+
only showing top 5 rows



In [16]:
# Cast the hwy column to string. The result is only displayed, not assigned.
mpg.select(mpg.hwy.cast('string'))

DataFrame[hwy: string]

In [17]:
# Single-column Spark DataFrame of free-text street addresses, built from a
# pandas frame.
# NOTE(review): 'San Antoino' in the second address looks like a misspelling
# of 'San Antonio' — confirm whether this is intentional sample data.
textdf = spark.createDataFrame(
    pd.DataFrame(
        {
            'address': [
                '600 Navarro St ste 600, San Antonio, TX 78206',
                '3130 Broadway St, San Antoino, TX 78209',
                '303 Pearl Pkwy, San Antonio, TX 78215',
                '1255 SW Loop 410, San Antonio, TX 78227'
            ]
        }
    ))

### How to create a schema and read files

In [20]:
from pyspark.sql.types import StructType, StructField, StringType

# Explicit schema with two string-typed fields.
schema = StructType(
    [
        StructField(field_name, StringType())
        for field_name in ("source_id", "source_username")
    ]
)

# Read the CSV with that schema, treating the first line as a header row.
# The resulting DataFrame is only displayed here, not stored.
spark.read.csv("data/source.csv", schema=schema, header=True)
# spark.read.json("data/source.json", schema=schema)

DataFrame[source_id: string, source_username: string]

In [22]:
# Load the 'tips' sample dataset through pydataset's data() as a Spark DataFrame.
df = spark.createDataFrame(data('tips'))

# Random split with weights 0.8 / 0.2 into train / test, using a fixed seed.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [23]:
def shape(df: pyspark.sql.DataFrame):
    """Return the dimensions of a Spark DataFrame, pandas-style.

    Args:
        df: The DataFrame to measure.

    Returns:
        A ``(row_count, column_count)`` tuple. The row count comes from
        ``df.count()`` and the column count from ``len(df.columns)``.
    """
    row_count = df.count()
    column_count = len(df.columns)
    return row_count, column_count

In [24]:
# (row count, column count) of the training split.
shape(train)

(191, 7)