In [1]:
!pip install pyspark

You should consider upgrading via the 'pip install --upgrade pip' command.


In [6]:
from pyspark.sql import SparkSession

In [7]:
# Create the session for this notebook (or reuse one that already exists),
# registered under the application name 'PySpark'.
spark = (
    SparkSession.builder
    .appName('PySpark')
    .getOrCreate()
)

In [8]:
spark

In [9]:
# Disabled alternative: build the session against a remote standalone master instead.
# NOTE(review): the builder method is `.master(...)`; `.setMaster(...)` is a SparkConf
# method, so this line would fail as written — confirm before re-enabling.
#spark = SparkSession.builder.setMaster('spark://207.234.43.54:7077').appName('PySpark').getOrCreate()

In [10]:
# Keep a handle on the SparkContext behind the session; the RDD cells call sc.parallelize.
sc = spark.sparkContext

In [11]:
sc

In [18]:
# The integers 0..999, used as the sample input for the RDD examples.
data = range(1000)

In [19]:
# Turn the local range into an RDD.
rdd = sc.parallelize(data)

In [20]:
type(rdd)

pyspark.rdd.PipelinedRDD

In [21]:
list = rdd.collect()

In [22]:
type(list)

list

In [39]:
# Keep the values whose square exceeds 1000, shift each one by 100, then total them.
(
    rdd
    .filter(lambda value: value * value > 1000)
    .map(lambda value: value + 100)
    .sum()
)

595804

In [45]:
# union() does not de-duplicate, so unioning the RDD with itself doubles the element count.
rdd.union(rdd).count()

2000

In [47]:
# Write the RDD out as text under the 'data' path.
# NOTE(review): presumably this fails if 'data' already exists, which would break
# Restart & Run All on a second pass — confirm and clean up the path if so.
rdd.saveAsTextFile('data')

In [48]:
rdd.getNumPartitions()

8

In [49]:
rdd4 = rdd.repartition(4)

In [50]:
rdd4.getNumPartitions()

4

In [51]:
rdd4.saveAsTextFile('data4')

In [52]:
# Collapse to a single partition before writing to 'data1'.
# NOTE(review): presumably this yields one part file instead of one per partition — confirm.
rdd.coalesce(1).saveAsTextFile('data1')

In [53]:
sc.defaultParallelism

8

# RDD to DataFrame

In [62]:
# Sample (id, name) records; 'Alex' appears twice on purpose so grouping by name is non-trivial.
persons = [
    (1, 'Alex'),
    (2, 'Anna'),
    (3, 'Denis'),
    (4, 'Alex'),
]

In [63]:
rdd = sc.parallelize(persons)

In [64]:
df = rdd.toDF(['id', 'name'])

In [65]:
type(df)

pyspark.sql.dataframe.DataFrame

In [70]:
df.show(2)

+---+----+
| id|name|
+---+----+
|  1|Alex|
|  2|Anna|
+---+----+
only showing top 2 rows



In [76]:
type(df.rdd)

pyspark.rdd.RDD

In [78]:
df.schema

StructType(List(StructField(id,LongType,true),StructField(name,StringType,true)))

In [79]:
df.columns

['id', 'name']

In [80]:
df.dtypes

[('id', 'bigint'), ('name', 'string')]

In [81]:
df.describe().show()

+-------+------------------+-----+
|summary|                id| name|
+-------+------------------+-----+
|  count|                 4|    4|
|   mean|               2.5| null|
| stddev|1.2909944487358056| null|
|    min|                 1| Alex|
|    max|                 4|Denis|
+-------+------------------+-----+



In [82]:
df.describe(['id']).show()

+-------+------------------+
|summary|                id|
+-------+------------------+
|  count|                 4|
|   mean|               2.5|
| stddev|1.2909944487358056|
|    min|                 1|
|    max|                 4|
+-------+------------------+



In [83]:
# Project only the 'name' column and print it.
df.select('name').show()

+-----+
| name|
+-----+
| Alex|
| Anna|
|Denis|
| Alex|
+-----+



In [84]:
df.groupBy('name').count().show()

+-----+-----+
| name|count|
+-----+-----+
| Alex|    2|
| Anna|    1|
|Denis|    1|
+-----+-----+



# Добавить колонку?

In [86]:
from pyspark.sql.functions import split,col

In [87]:
# name_alias is the first character of name. substr(1, 1) takes it directly
# (positions are 1-based) instead of regex-splitting the string into an array
# of characters and indexing element 0.
ndf = df.withColumn('name_alias', col('name').substr(1, 1))

In [88]:
ndf.show()

+---+-----+----------+
| id| name|name_alias|
+---+-----+----------+
|  1| Alex|         A|
|  2| Anna|         A|
|  3|Denis|         D|
|  4| Alex|         A|
+---+-----+----------+



In [89]:
# drop() returns a new DataFrame that is only displayed here; ndf itself still
# has name_alias, as the following ndf.show() output confirms.
ndf.drop('name_alias').show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+



In [90]:
ndf.show()

+---+-----+----------+
| id| name|name_alias|
+---+-----+----------+
|  1| Alex|         A|
|  2| Anna|         A|
|  3|Denis|         D|
|  4| Alex|         A|
+---+-----+----------+



In [93]:
# Rebinds ndf to a fresh frame derived from df: it gains an 'age' column (id * 10)
# and has no name_alias, since it starts from df rather than the previous ndf.
ndf = df.withColumn('age', col('id')*10)

In [94]:
ndf.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1| Alex| 10|
|  2| Anna| 20|
|  3|Denis| 30|
|  4| Alex| 40|
+---+-----+---+



In [95]:
ndf.corr('age', 'id')

1.0

In [103]:
# Read the id of the first row. first() fetches a single row instead of
# collecting the whole DataFrame to the driver just to index element 0.
ndf.select('age', 'id').first().id

1

# Сохранение и загрузка

In [108]:
# Write df as CSV from a single partition.
# mode('overwrite') keeps the cell re-runnable: the default save mode raises
# an error when the target directory already exists.
df.coalesce(1).write.mode('overwrite').csv('persons_csv')

In [109]:
# Write df as JSON from a single partition.
# mode('overwrite') keeps the cell re-runnable: the default save mode raises
# an error when the target directory already exists.
df.coalesce(1).write.mode('overwrite').json('persons_json')

In [110]:
# Write df as Parquet from a single partition; the next cell reads it back.
# mode('overwrite') keeps the cell re-runnable: the default save mode raises
# an error when the target directory already exists.
df.coalesce(1).write.mode('overwrite').parquet('persons_parquet')

In [111]:
spark.read.parquet('persons_parquet').show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+



# Spark SQL

In [119]:
df.show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+



In [120]:
# Expose df to Spark SQL under the view name 'people', which the SQL query selects from.
df.createOrReplaceTempView('people')

In [122]:
spark.sql('select name from people where id <= 2').show()

+----+
|name|
+----+
|Alex|
|Anna|
+----+



# PySpark & Pandas

In [125]:
df.show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+



In [126]:
pdf = df.toPandas()

In [127]:
type(pdf)

pandas.core.frame.DataFrame

In [128]:
pdf.head()

Unnamed: 0,id,name
0,1,Alex
1,2,Anna
2,3,Denis
3,4,Alex


In [129]:
df_from_pandas = spark.createDataFrame(pdf)

In [130]:
type(df_from_pandas)

pyspark.sql.dataframe.DataFrame

In [131]:
df_from_pandas.show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+

