## Eliminate the gray notebook borders

In [46]:
# Inject a small stylesheet into the notebook page: zero top padding on
# `#notebook`, full-width `.container`, and no minimum height on `.end_space`.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

display(HTML(
    '<style>'
        '#notebook { padding-top:0px !important; } ' 
        '.container { width:100% !important; } '
        '.end_space { min-height:0px !important; } '
    '</style>'
))

In [2]:
# Initialise findspark before any pyspark import in this notebook.
# NOTE(review): presumably this makes the local Spark installation's pyspark
# importable in the kernel — confirm against the findspark documentation.
import findspark
findspark.init()

In [4]:
from pyspark import SparkContext, SparkConf

# Configuration for this section: application name 'appName', master 'local'.
conf = SparkConf().setAppName('appName').setMaster('local')

# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with "Cannot run multiple
# SparkContexts at once". A context that is already running is reused as-is.
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
from pyspark.sql import SparkSession

In [6]:
# `builder` is a class-level attribute of SparkSession, so it is accessed on
# the class directly instead of on a freshly constructed `SparkSession(sc)`.
# getOrCreate() attaches to the SparkContext that is already running.
spark = SparkSession.builder.appName("basics").getOrCreate()

In [7]:
# Read the sample people records from JSON; no schema is supplied here, so the
# column types are whatever Spark infers from the file.
df = spark.read.json(
    path="Resources/Python-and-Spark-for-Big-Data-master/Spark_DataFrames/people.json"
)

In [8]:
# Print the DataFrame rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [9]:
# Print column names, types and nullability (types here were inferred on read).
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [10]:
# Column names as a plain Python list.
df.columns

['age', 'name']

In [12]:
# Summary statistics (count, mean, stddev, min, max) per column, printed as a table.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [13]:
from pyspark.sql.types import (StructField,StructType,IntegerType,StringType)

In [14]:
# One nullable StructField per column, built from (column name, Spark type) pairs.
data_schema = [
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('age', IntegerType()),
        ('name', StringType()),
    )
]

In [15]:
# Wrap the field list in a StructType so it can be passed as a reader schema.
final_struct = StructType(
    fields=data_schema,
)

In [18]:
# Re-read the same JSON file, this time applying the explicit schema so that
# `age` is loaded as an integer column.
df = spark.read.json(
    "Resources/Python-and-Spark-for-Big-Data-master/Spark_DataFrames/people.json",
    schema=final_struct,
)

In [20]:
# Print the rows of the DataFrame that was re-read with the explicit schema.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [21]:
# Print the schema again to check the column types after applying the explicit schema.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [22]:
# Indexing by column name yields a Column object (see the repr below), not the
# column's values.
df['age']

Column<b'age'>

In [30]:
# Project the two columns, take the first two rows as a list of Row objects,
# and show the first of them.
df.select('age', 'name').head(2)[0]

Row(age=None, name='Michael')

In [31]:
# Show a copy of the DataFrame with an extra `double_age` column (age * 2);
# the result is only displayed, `df` itself is not reassigned.
(
    df
    .withColumn('double_age', df['age'] * 2)
    .show()
)

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [32]:
# Register the DataFrame under the name 'people' so it can be queried with spark.sql().
df.createOrReplaceTempView('people')

In [33]:
# SQL query against the 'people' temp view: keep only rows whose age is not null.
results = spark.sql("select * from people where age is not null")

In [35]:
# Print the rows returned by the SQL query.
results.show()

+---+------+
|age|  name|
+---+------+
| 30|  Andy|
| 19|Justin|
+---+------+



In [38]:
# Shut down this section's SparkContext; the next section creates a new one.
sc.stop()

## New Spark Session

In [39]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

# Configuration for this section: application name 'sparrrrk', master 'local'.
conf = SparkConf().setAppName('sparrrrk').setMaster('local')

# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with "Cannot run multiple
# SparkContexts at once". A context that is already running is reused as-is.
sc = SparkContext.getOrCreate(conf=conf)

In [40]:
from pyspark.sql import SparkSession

In [41]:
# `builder` is a class-level attribute of SparkSession, so it is accessed on
# the class directly instead of on a freshly constructed `SparkSession(sc)`.
# getOrCreate() attaches to the SparkContext that is already running.
spark = SparkSession.builder.appName('ops').getOrCreate()

In [43]:
# Load the Apple stock CSV: the first line supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    'Resources/Python-and-Spark-for-Big-Data-master/Spark_DataFrames/appl_stock.csv',
    header=True,
    inferSchema=True,
)

In [47]:
# Print the leading rows of the stock data as a text table.
df.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [48]:
# Filter with a SQL-style condition string, then show only the Open column.
# The condition spells the column as lowercase 'close' while the header is
# 'Close'; the output below shows that it still resolves.
df.filter('close < 500').select('Open').show()

+------------------+
|              Open|
+------------------+
|        213.429998|
|        214.599998|
|        214.379993|
|            211.75|
|        210.299994|
|212.79999700000002|
|209.18999499999998|
|        207.870005|
|210.11000299999998|
|210.92999500000002|
|        208.330002|
|        214.910006|
|        212.079994|
|206.78000600000001|
|202.51000200000001|
|205.95000100000001|
|        206.849995|
|        204.930004|
|        201.079996|
|192.36999699999998|
+------------------+
only showing top 20 rows



In [49]:
# Same kind of filter written with a Column comparison instead of a SQL string;
# only the Volume column of the matching rows is shown.
df.filter(df['Close'] > 50).select('Volume').show()

+---------+
|   Volume|
+---------+
|123432400|
|150476200|
|138040000|
|119282800|
|111902700|
|115557400|
|148614900|
|151473000|
|108223500|
|148516900|
|182501900|
|153038200|
|152038600|
|220441900|
|266424900|
|466777500|
|430642100|
|293375600|
|311488100|
|187469100|
+---------+
only showing top 20 rows



In [51]:
# collect() returns the matching rows (Open > 500) as a Python list of Row objects.
res = df.filter(df['Open'] > 500).collect()

In [56]:
# Convert the first collected Row to a dict keyed by column name.
# NOTE(review): res[0] assumes the filter matched at least one row.
res[0].asDict()

{'Date': datetime.datetime(2012, 2, 14, 0, 0),
 'Open': 504.659988,
 'High': 509.56002,
 'Low': 502.000008,
 'Close': 509.459991,
 'Volume': 115099600,
 'Adj Close': 66.005408}

In [58]:
# Shut down this section's SparkContext; the next section creates a new one.
sc.stop()

## Aggregations

In [57]:
# Initialise findspark before the pyspark imports of this section.
import findspark
findspark.init()

In [59]:
from pyspark import SparkConf, SparkContext

# Configuration for this section: application name 'aggsApp', master 'local'.
conf = SparkConf().setAppName('aggsApp').setMaster('local')

# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with "Cannot run multiple
# SparkContexts at once". A context that is already running is reused as-is.
sc = SparkContext.getOrCreate(conf=conf)

In [60]:
from pyspark.sql import SparkSession

# `builder` is a class-level attribute of SparkSession, so it is accessed on
# the class directly instead of on a freshly constructed `SparkSession(sc)`.
# getOrCreate() attaches to the SparkContext that is already running.
spark = SparkSession.builder.appName('aggs').getOrCreate()

In [61]:
# Load the sales CSV: the first line supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    'Resources/Python-and-Spark-for-Big-Data-master/Spark_DataFrames/sales_info.csv',
    header=True,
    inferSchema=True,
)

In [65]:
# Print the sales rows as a text table.
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [66]:
# Print the inferred schema of the sales data.
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)



In [72]:
# Per-company maximum; with no column named, the output contains max(Sales),
# the only numeric column in this data.
df.groupBy('Company').max().show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



In [74]:
# Aggregate over the whole DataFrame (no grouping) using a {column: function} dict.
df.agg({'Sales':'max'}).show()

+----------+
|max(Sales)|
+----------+
|     870.0|
+----------+



In [75]:
# Keep the grouped object so that aggregations can be applied to it in later cells.
group_data = df.groupBy('Company')

In [79]:
# Same {column: function} dict as above, now applied per company.
group_data.agg({'Sales':'max'}).show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



In [80]:
from pyspark.sql.functions import countDistinct,avg,stddev

In [84]:
# Count the distinct Company values; alias() gives the result column a readable header.
df.select(countDistinct('Company').alias("Distinct Companies")).show()

+------------------+
|Distinct Companies|
+------------------+
|                 4|
+------------------+



In [85]:
from pyspark.sql.functions import format_number

In [88]:
# Standard deviation of Sales as a one-row DataFrame; the column is aliased to
# 's' so it is easy to reference in the next step.
std_sales_value = df.select(stddev('Sales').alias('s'))

# Display 's' formatted to 3 decimal places under the header 'std'.
std_sales_value.select(format_number('s',3).alias('std')).show()

+-------+
|    std|
+-------+
|250.087|
+-------+



In [89]:
# Shut down this section's SparkContext; the next section creates a new one.
sc.stop()

## Recollections (review of the aggregation basics)

In [1]:
# Initialise findspark, then display the Spark installation path it located
# (the value of the last expression is shown as the cell output).
import findspark
findspark.init()
findspark.find()

'D:\\Softwares\\Spark\\spark-2.4.7-bin-hadoop2.7'

In [2]:
from pyspark import SparkConf, SparkContext

# Configuration for this section: application name 'appRecall', master 'local'.
conf = SparkConf().setAppName('appRecall').setMaster('local')

# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with "Cannot run multiple
# SparkContexts at once". A context that is already running is reused as-is.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
from pyspark.sql import SparkSession

# `builder` is a class-level attribute of SparkSession, so it is accessed on
# the class directly instead of on a freshly constructed `SparkSession(sc)`.
# getOrCreate() attaches to the SparkContext that is already running.
spark = SparkSession.builder.appName('recaller').getOrCreate()

In [4]:
# Load the sales CSV again for the review section: the first line supplies the
# column names and the column types are inferred from the data.
df = spark.read.csv(
    "./Resources/Python-and-Spark-for-Big-Data-master/Spark_DataFrames/sales_info.csv",
    header=True,
    inferSchema=True,
)

In [7]:
# First two rows as a list of Row objects.
df.head(2)

[Row(Company='GOOG', Person='Sam', Sales=200.0),
 Row(Company='GOOG', Person='Charlie', Sales=120.0)]

In [8]:
# Print the sales rows as a text table.
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [23]:
# NOTE(review): this import rebinds `min` and `max` to the Spark SQL column
# functions, hiding Python's built-in min()/max() in every cell run after this
# one. None of the imported names is used in this cell. Consider
# `from pyspark.sql import functions as F` if the later cells allow it — confirm.
from pyspark.sql.functions import format_number, mean, countDistinct, min, max

# Per-company maximum via the grouped object's own max() method (this is not
# the imported `max`); the output contains max(Sales).
df.groupBy("Company").max().show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+

