In [21]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from py4j.java_gateway import java_import


spark = SparkSession.builder.appName("ReadwriteVal").getOrCreate()

cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")
spark

You are working with 1 core(s)


In [22]:
path = "../datasets/"

students = spark.read.csv(path+'students.csv', inferSchema=True, header=True)

In [23]:
students.limit(4).toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44


In [24]:
parquet = spark.read.parquet(path+"users*")

In [25]:
parquet.limit(4).toPandas()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
0,2016-02-03 05:55:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
1,2016-02-03 15:04:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,
2,2016-02-02 23:09:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,
3,2016-02-02 22:36:21,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,


In [26]:
users1_2 = spark.read.parquet(path+'users1.parquet', path+'users2.parquet')
users1_2.limit(4).toPandas()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
0,2016-02-03 05:55:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
1,2016-02-03 15:04:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,
2,2016-02-02 23:09:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,
3,2016-02-02 22:36:21,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,


In [27]:
students.printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: integer (nullable = true)
 |-- reading score: integer (nullable = true)
 |-- writing score: integer (nullable = true)



In [28]:
students.columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

In [29]:
students.describe()

DataFrame[summary: string, gender: string, race/ethnicity: string, parental level of education: string, lunch: string, test preparation course: string, math score: string, reading score: string, writing score: string]

In [30]:
students.schema['math score'].dataType

IntegerType

In [31]:
students.select("math score", "reading score").summary("count","min", "max").show()

+-------+----------+-------------+
|summary|math score|reading score|
+-------+----------+-------------+
|  count|      1000|         1000|
|    min|         0|           17|
|    max|       100|          100|
+-------+----------+-------------+



## HOW TO SPECIFY DATA TYPES

In [32]:
dataSchema = [StructField("name", StringType(), True),
              StructField("email", StringType(), True),
              StructField("city", StringType(), True),
              StructField("mac", StringType(), True),
              StructField("timestamp", StringType(), True),
              StructField("creditcard", StringType(), True)]

In [33]:
final_struc = StructType(fields =  dataSchema)

In [34]:
people = spark.read.json(path+'people.json', schema=final_struc)

In [35]:
people.limit(4).toPandas()

Unnamed: 0,name,email,city,mac,timestamp,creditcard
0,,,,,,
1,Keeley Bosco,katlyn@jenkinsmaggio.net,Lake Gladysberg,08:fd:0b:cd:77:f7,2015-04-25 13:57:36 +0700,1228-1221-1221-1431
2,Rubye Jerde,juvenal@johnston.name,,90:4d:fa:42:63:a2,2015-04-25 09:02:04 +0700,1228-1221-1221-1431
3,Miss Darian Breitenberg,,,f9:0e:d3:40:cb:e9,2015-04-25 13:16:03 +0700,


In [36]:
people.printSchema()

root
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- mac: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- creditcard: string (nullable = true)



## Writing in Data 

In [37]:
students.write.mode("overwrite").csv('write_test.csv')

In [38]:
# java_import(spark._jvm, "org.apache.hadoop.Path")

# fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
# file = fs.globStatus(spark._jvm.Path('write_test.csv/part*'))[0].getPath().getName()
# fs.rename(spark._jvm.Path('write_test.csv/'+file), spark._jvm.Path('write_test2.csv'))
# fs.delete(spark._jvm.Path('write_test.csv/'), True)

In [39]:
users1_2.write.mode("overwrite").parquet('parquet/')

In [40]:
users1_2.write.partitionBy("gender").parquet('part_parquet/')

                                                                                

In [41]:
values = [('Pear',10),('Orange',13), ('Peach',5)]
df = spark.createDataFrame(values,['fruit','quant'])
df.show()

+------+-----+
| fruit|quant|
+------+-----+
|  Pear|   10|
|Orange|   13|
| Peach|    5|
+------+-----+

