Connecting to a cluster

In [None]:
from pyspark import SparkConf, SparkContext

# Build the configuration one setting at a time: application name first,
# then the URL of the standalone cluster master to connect to.
conf = SparkConf()
conf.setAppName('hello')
conf.setMaster('spark://169.92.120.134:7077')

# Start the driver-side context using the configuration above.
sc = SparkContext(conf=conf)

Reading data into RDD

In [None]:
# Load the text file through the SparkContext; the bare expression lets the
# notebook display the resulting RDD object.
sc.textFile(name="mydata.csv")

Creating a SparkSession object to work with dataframes

In [None]:
from pyspark.sql import SparkSession
# Entry point for the DataFrame API: getOrCreate() reuses an already running
# session if there is one, otherwise it builds a new session.
spark = SparkSession.builder.getOrCreate()  

Reading data into a dataframe

In [None]:
import os

# CSV: use the built-in 'csv' data source instead of the legacy external
# 'com.databricks.spark.csv' package name; the options are unchanged
# (first line is a header, fields are comma separated).
mydata_csv = spark.read.format('csv')\
              .options(header='true', delimiter=',')\
              .load('hdfs:///opt/kris/mydata.csv')

# Parquet: the schema is stored in the file itself, so no options are needed.
mydata_parquet = spark.read.parquet('hdfs:///opt/kris/mydata.parquet')

# JDBC: the connection URL and credentials were never defined in this notebook.
# Read them from the environment so nothing secret is stored in the notebook;
# a missing variable fails fast with a KeyError naming it.
# NOTE(review): the variable names are a convention chosen here - adjust to
# your deployment, and add a 'driver' entry if the JDBC driver is not auto-detected.
url = os.environ['JDBC_URL']
db_properties = {
    'user': os.environ['JDBC_USER'],
    'password': os.environ['JDBC_PASSWORD'],
}
mydata_db = spark.read.jdbc(url=url, table='testdb.employee', properties=db_properties)

Creating test dataframe

In [None]:
from pyspark.sql import SQLContext

# Legacy SQL entry point used by the SQL-query cell further down.
# It is bound explicitly to the existing `spark` session so that temp views
# registered from DataFrames created by `spark` are visible to sqlContext.sql().
sqlContext = SQLContext(sc, sparkSession=spark)

# Toy data set: each employee's current salary and the salary they wish for.
# 'Misha' has no wish (None), and 'Mikalai' is the only chief.
columns = ['employee_name', 'is_chief', 'current_salary', 'desired_salary']
vals = [
     ('Christina', False, 100, 1000),
     ('Olya', False, 100, 500),
     ('Polina', False, 100, 300),
     ('Misha', False, 100, None),
     ('Mikalai', True, 10000, 20000),
     ('Artyom', False, 100, 600)
]

# Column types are inferred from the Python values; names come from `columns`.
salary_wish_df = spark.createDataFrame(vals, columns)

Manipulating data - built-in functions

In [None]:
from pyspark.sql.functions import col, when

# Step 1: keep only employees that stated a desired salary and compute the
# raise ("delta") for everyone who is not a chief. Column expressions must be
# negated with `~`; Python's `not` cannot be applied to a Column and raises.
# Without an otherwise() branch the chief's delta is NULL at this point.
salary_delta = (
    salary_wish_df
    .filter(col('desired_salary').isNotNull())
    .withColumn('delta', when(~col('is_chief'), col('desired_salary') - col('current_salary')))
)

# Step 2: total of all raises. groupBy() without keys aggregates over the whole
# frame, so the single result row holds the sum (NULL deltas are ignored).
# `or 0` covers the case where there is nothing to sum and Spark returns None.
delta_sum = salary_delta.groupBy().sum('delta').collect()[0][0] or 0

# Step 3: the chief pays for the raises - their delta is minus the total.
salary_delta_all = salary_delta.withColumn(
    'delta',
    when(col('is_chief'), -delta_sum).otherwise(col('delta')),
)

# Step 4: apply the delta to the current salary and display the result.
final_salary = salary_delta_all.withColumn('final_salary', col('current_salary') + col('delta'))
final_salary.show()

Manipulating data - sql query

In [None]:
# Same computation as the built-in-functions cell, expressed as SQL queries
# over temp views. Unlike that cell, rows without a desired salary are not
# filtered out here; their delta and final salary simply stay NULL.
salary_wish_df.createOrReplaceTempView("salary_wish_df")

# Step 1: raise ("delta") for non-chiefs; a CASE without ELSE leaves the
# chief's delta NULL. A plain CASE is used because Spark rejects correlated
# scalar subqueries that are not aggregated.
salary_delta = sqlContext.sql("""
    select df.*,
           case when not df.is_chief
                then df.desired_salary - df.current_salary
           end as delta
    from salary_wish_df as df
""")
salary_delta.createOrReplaceTempView("salary_delta")

# Step 2: total of all raises (sum() skips NULLs; `or 0` handles an all-NULL column).
delta_sum = sqlContext.sql("select sum(delta) from salary_delta").collect()[0][0] or 0

# Step 3: the chief's delta becomes minus the total. Columns are listed
# explicitly so the result has exactly one `delta` column. The interpolated
# value is a number computed above, not user input.
salary_delta_all = sqlContext.sql("""
    select df.employee_name,
           df.is_chief,
           df.current_salary,
           df.desired_salary,
           case when df.is_chief then {chief_delta} else df.delta end as delta
    from salary_delta as df
""".format(chief_delta=-delta_sum))
salary_delta_all.createOrReplaceTempView("salary_delta_all")

# Step 4: apply the delta and display the result.
final_salary = sqlContext.sql("""
    select df.*, df.current_salary + df.delta as final_salary
    from salary_delta_all as df
""")
final_salary.show()

Configuring Spark

In [None]:
# Set Spark properties programmatically before the context is created;
# properties set on SparkConf must be in place when SparkContext starts.
conf = SparkConf()
conf.set("spark.app.name", "New advanced name")  # application name shown in the UI
conf.set("spark.ui.port", "5001")                # port of the application's web UI

# Class name fixed (was misspelled 'SparcContext', which raised NameError).
# NOTE: only one SparkContext can be active per Python process, so run this in
# a fresh kernel or stop the running context first.
sc = SparkContext(conf=conf)

Configuring Spark at runtime

In [None]:
spark-submit --master ip --executor-cores=3 --diver 8G sample.py

Caching datasets

In [None]:
from pyspark.sql.functions import col, when

# Compute the raise ("delta") for non-chief employees that stated a desired
# salary. Column expressions are negated with `~`; Python's `not` cannot be
# applied to a Column and raises an error.
salary_delta = (
    salary_wish_df
    .filter(col('desired_salary').isNotNull())
    .withColumn('delta', when(~col('is_chief'), col('desired_salary') - col('current_salary')))
)

# Mark the DataFrame for caching so later actions can reuse it instead of
# recomputing it; caching is lazy and happens on the first action.
salary_delta.cache()

Changing memory fraction

In [None]:
# Tune how executor memory is split between cached data and shuffle buffers.
# NOTE(review): these two keys belong to the legacy memory manager; newer Spark
# versions use spark.memory.fraction / spark.memory.storageFraction instead -
# confirm against the Spark version in use.
conf = SparkConf()
conf.set("spark.storage.memoryFraction", "0.5")
conf.set("spark.shuffle.memoryFraction", "0.2")

# Class name fixed (was misspelled 'SparcContext', which raised NameError).
# NOTE: only one SparkContext can be active per Python process, so run this in
# a fresh kernel or stop the running context first.
sc = SparkContext(conf=conf)

Changing number of partitions

In [None]:
# Merge the values of each key by addition and write the result into 5 partitions.
# The reducer must be a Python callable; Scala's `_ + _` placeholder syntax
# does not exist in Python.
# NOTE(review): `rdd` is not defined anywhere in this notebook - it is assumed
# to be an RDD of (key, value) pairs created beforehand.
rdd_reduced = rdd.reduceByKey(lambda left, right: left + right, numPartitions=5)

In [None]:
# DataFrame.filter expects a Column expression (or SQL string), not a lambda.
filtered_data = mydata_csv.filter(mydata_csv['col1'] == "value1")

# coalesce() returns a NEW DataFrame, so the result has to be kept;
# it is the 5-partition frame that gets cached.
coalesced_data = filtered_data.coalesce(5).cache()

# Number of partitions after coalescing (Python API; `partitions().size()` is Scala).
coalesced_data.rdd.getNumPartitions()

In [1]:
import findspark
# Must run before any `pyspark` import below.
# NOTE(review): presumably this locates the local Spark installation and makes
# the pyspark package importable - confirm against the findspark docs.
findspark.init()

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [4]:
# Default configuration (no properties are set in this cell).
conf = SparkConf()

# getOrCreate() makes the cell safe to re-run: it returns the already running
# context when there is one instead of failing because only a single
# SparkContext may be active per process. In that case `conf` is not applied.
sc = SparkContext.getOrCreate(conf=conf)

# DataFrame entry point; reuses the running session/context if present.
spark = SparkSession.builder.getOrCreate()