In [1]:
# Display the SparkContext.
# NOTE(review): `sc` is never created in this notebook — presumably it is
# injected by the pyspark shell/kernel; confirm before running elsewhere.
sc

In [2]:
from pyspark.sql.types import Row
from datetime import datetime 

In [8]:
# sc.parallelize turns a local Python list into a Spark RDD,
# with each list item becoming one element of the RDD.
simple_data = sc.parallelize(
    [1, 'Alice', 20]
)
simple_data

ParallelCollectionRDD[3] at parallelize at PythonRDD.scala:184

In [9]:
# Count the elements held by the RDD.
simple_data.count()

3

In [12]:
# A list of lists: each inner list becomes one element of the RDD.
record = sc.parallelize([
    [1, 'Alice', 20],
    [2, 'Bob', 21],
])
record

ParallelCollectionRDD[7] at parallelize at PythonRDD.scala:184

In [13]:
# Bring every element of the RDD back as a local Python list.
record.collect()

[[1, 'Alice', 20], [2, 'Bob', 21]]

In [19]:
# Convert the RDD of lists into a DataFrame. No column names or schema are
# passed to toDF() here.
df = record.toDF()
df

DataFrame[_1: bigint, _2: string, _3: bigint]

In [20]:
# Print the DataFrame contents as a text table.
df.show()

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 20|
|  2|  Bob| 21|
+---+-----+---+



In [23]:
# A Row built with keyword arguments carries field names along with its values.
rdd_w_fields = sc.parallelize([
    Row(id=1, name='Ankita', score=10),
])
rdd_w_fields

ParallelCollectionRDD[29] at parallelize at PythonRDD.scala:184

In [26]:
# Convert the RDD of keyword-built Rows into a DataFrame.
df_w_fnames = rdd_w_fields.toDF()
df_w_fnames

DataFrame[id: bigint, name: string, score: bigint]

In [31]:
# Two keyword-built Rows sharing the same fields: id, name and score.
simple_rdd2 = sc.parallelize([
    Row(id=1, name='Anki', score=10),
    Row(id=2, name='Avc', score=20),
])
simple_rdd2

ParallelCollectionRDD[51] at parallelize at PythonRDD.scala:184

In [34]:
# Convert the RDD of Rows into a DataFrame and print it.
df2 = simple_rdd2.toDF()
df2.show()

+---+----+-----+
| id|name|score|
+---+----+-----+
|  1|Anki|   10|
|  2| Avc|   20|
+---+----+-----+



In [40]:
# Row fields may hold nested values: here a dict, a list and another Row.
complex_rdd = sc.parallelize([
    Row(
        id=1,
        name='Ankita',
        dic={'a': 1, 'b': 2},
        list_=[1, 2],
        col_r=Row(id=1, id2=2),
    ),
    Row(
        id=2,
        name='Ankit',
        dic={'c': 1, 'b': 2},
        list_=[3, 4],
        col_r=Row(id=1, id2=2),
    ),
    Row(
        id=3,
        name='Anki',
        dic={'g': 1, 'f': 2},
        list_=[1, 2],
        col_r=Row(id=1, id2=2),
    ),
])
complex_rdd

ParallelCollectionRDD[74] at parallelize at PythonRDD.scala:184

In [42]:
# Convert the RDD of nested Rows into a DataFrame and print it.
complex_df = complex_rdd.toDF()
complex_df.show()

+------+----------------+---+------+------+
| col_r|             dic| id| list_|  name|
+------+----------------+---+------+------+
|[1, 2]|[a -> 1, b -> 2]|  1|[1, 2]|Ankita|
|[1, 2]|[b -> 2, c -> 1]|  2|[3, 4]| Ankit|
|[1, 2]|[f -> 2, g -> 1]|  3|[1, 2]|  Anki|
+------+----------------+---+------+------+



In [43]:
# Evaluate the DataFrame itself to display its column names and types.
complex_df

DataFrame[col_r: struct<id:bigint,id2:bigint>, dic: map<string,bigint>, id: bigint, list_: array<bigint>, name: string]

In [44]:
# SQLContext is not imported anywhere else in this notebook; it was only
# available because the pyspark shell pre-imports it. Import it explicitly so
# this cell does not depend on that.
from pyspark.sql import SQLContext

# Wrap the existing SparkContext in a SQLContext, the entry point used below
# for range() and createDataFrame().
sqlc = SQLContext(sc)
sqlc

<pyspark.sql.context.SQLContext at 0x7fc7141f7208>

In [47]:
# Build a DataFrame from sqlc.range(5) and print it.
sql_df = sqlc.range(5)
sql_df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [48]:
# Count the rows of the DataFrame.
sql_df.count()

5

In [57]:
# Build a DataFrame straight from a local list of tuples via the SQLContext.
# No column names are supplied in this call.
data = [
    ('anki', 10),
    ('ankit', 20),
    ('kit', 30),
]
sql_df_w_schema = sqlc.createDataFrame(data)
sql_df_w_schema

DataFrame[_1: string, _2: bigint]

In [58]:
# Print the DataFrame contents as a text table.
sql_df_w_schema.show()

+-----+---+
|   _1| _2|
+-----+---+
| anki| 10|
|ankit| 20|
|  kit| 30|
+-----+---+



In [59]:
# Pass a list of column names as the schema so the columns are labelled.
sql_df_fnames = sqlc.createDataFrame(data, schema=['name', 'score'])
sql_df_fnames

DataFrame[name: string, score: bigint]

In [60]:
# Print the DataFrame built with explicit column names.
sql_df_fnames.show()

+-----+-----+
| name|score|
+-----+-----+
| anki|   10|
|ankit|   20|
|  kit|   30|
+-----+-----+



In [63]:
# Local tuples with nested values: an int, a string, a dict, a list and a Row.
complex_data = [
    (1, 'Ankita', {'a': 1, 'b': 2}, [1, 2], Row(id=1, id2=2)),
    (2, 'Ankit', {'c': 1, 'b': 2}, [3, 4], Row(id=1, id2=2)),
    (3, 'Anki', {'g': 1, 'f': 2}, [1, 2], Row(id=1, id2=2)),
]

In [66]:
# Build a DataFrame from the nested tuples, without column names, and print it.
sql_complex_df = sqlc.createDataFrame(complex_data)
sql_complex_df.show()

+---+------+----------------+------+------+
| _1|    _2|              _3|    _4|    _5|
+---+------+----------------+------+------+
|  1|Ankita|[a -> 1, b -> 2]|[1, 2]|[1, 2]|
|  2| Ankit|[b -> 2, c -> 1]|[3, 4]|[1, 2]|
|  3|  Anki|[f -> 2, g -> 1]|[1, 2]|[1, 2]|
+---+------+----------------+------+------+



In [69]:
# Same nested data, this time with an explicit list of column names.
sql_complexdf_wnames = sqlc.createDataFrame(
    complex_data,
    schema=['id', 'name', 'col_dict', 'col_list', 'col_row'],
)

In [70]:
# Print the nested DataFrame with its named columns.
sql_complexdf_wnames.show()

+---+------+----------------+--------+-------+
| id|  name|        col_dict|col_list|col_row|
+---+------+----------------+--------+-------+
|  1|Ankita|[a -> 1, b -> 2]|  [1, 2]| [1, 2]|
|  2| Ankit|[b -> 2, c -> 1]|  [3, 4]| [1, 2]|
|  3|  Anki|[f -> 2, g -> 1]|  [1, 2]| [1, 2]|
+---+------+----------------+--------+-------+



In [72]:
# Rows built from positional values only; no keyword field names are given.
data1 = sc.parallelize([Row(1, 'ankita'), Row(2, 'bob'), Row(3, 'anki')])
data1

ParallelCollectionRDD[191] at parallelize at PythonRDD.scala:184

In [75]:
# Row('id', 'name') is used as a factory: calling it with a record's values
# produces a Row whose fields are named id and name.
column_names = Row('id', 'name')
students = data1.map(lambda values: column_names(*values))
students.collect()

[Row(id=1, name='ankita'), Row(id=2, name='bob'), Row(id=3, name='anki')]

In [77]:
# Build a DataFrame from the RDD of named Rows and print it.
df_students = sqlc.createDataFrame(students)
df_students.show()

+---+------+
| id|  name|
+---+------+
|  1|ankita|
|  2|   bob|
|  3|  anki|
+---+------+



In [80]:
# Fetch only the first Row. take(1) avoids shipping the whole DataFrame to the
# driver the way collect() would, and indexing [0] still raises IndexError if
# the DataFrame is empty.
df_students.take(1)[0]

Row(id=1, name='ankita')

In [81]:
# Rows support positional indexing: [0] picks the first Row and [1] its second
# field. take(1) fetches just that Row instead of collecting every row.
df_students.take(1)[0][1]

'ankita'

In [83]:
# Second field of the third Row. take(3) fetches only the rows needed to reach
# index 2 instead of collecting the whole DataFrame to the driver.
df_students.take(3)[2][1]

'anki'

In [85]:
# Every DataFrame exposes its underlying RDD through the .rdd attribute.
df_students.rdd.collect()

[Row(id=1, name='ankita'), Row(id=2, name='bob'), Row(id=3, name='anki')]

In [88]:
# Pull a single field out of every Row by mapping over the underlying RDD.
only_id_from_df = df_students.rdd.map(lambda row: row.id)
only_id_from_df.collect()

[1, 2, 3]

In [90]:
# Select specific columns directly with the DataFrame API.
df_students.select('id').show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
+---+



In [91]:
# Append the suffix "add" to every name by mapping over the underlying RDD.
df2.rdd.map(lambda row: row.name + "add").collect()

['Ankiadd', 'Avcadd']

In [96]:
# DataFrames have no map attribute; withColumn derives the new column instead.
df2.select('id', 'score').withColumn(
    'sum_id_score',
    df2['id'] + df2['score'],
).show()

+---+-----+------------+
| id|score|sum_id_score|
+---+-----+------------+
|  1|   10|          11|
|  2|   20|          22|
+---+-----+------------+



In [100]:
# withColumnRenamed returns a new DataFrame with the column renamed;
# the original df2 is left unchanged.
df2.withColumnRenamed('name', 'Name').show()

+---+----+-----+
| id|Name|score|
+---+----+-----+
|  1|Anki|   10|
|  2| Avc|   20|
+---+----+-----+



In [101]:
# Show df2 again: its column is still called `name`.
df2.show()

+---+----+-----+
| id|name|score|
+---+----+-----+
|  1|Anki|   10|
|  2| Avc|   20|
+---+----+-----+



In [103]:
# Convert the Spark DataFrame into a pandas DataFrame.
df_pandas = df2.toPandas()
df_pandas

Unnamed: 0,id,name,score
0,1,Anki,10
1,2,Avc,20


In [104]:
# Build a Spark DataFrame back from the pandas DataFrame and print it.
sqlc.createDataFrame(df_pandas).show()

+---+----+-----+
| id|name|score|
+---+----+-----+
|  1|Anki|   10|
|  2| Avc|   20|
+---+----+-----+

