In [2]:
import findspark

In [3]:
# Run findspark's setup before pyspark is imported in the next cell
# (presumably so that import can locate the local Spark install - confirm).
findspark.init()

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Create (or reuse) the SparkSession that every later cell reads through.
spark = (
    SparkSession.builder
    .appName('Basic data frames')
    .getOrCreate()
)

In [6]:
# Bare expression: the notebook echoes the resulting DataFrame's repr and
# nothing is bound to a name here.
spark.read.load('IRIS.csv', format='csv')

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string]

In [7]:
# Load IRIS.csv with the reader's default options and inspect the schema.
# NOTE(review): no 'header' / 'inferSchema' option is set, so the columns come
# out as _c0.._c4 (all string) and the file's header line is kept as a data
# row. Confirm this default-behaviour demo is intended.
df = (
    spark.read
    .format('csv')
    .load('IRIS.csv')
)
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)



In [8]:
# Print the leading rows of the IRIS frame loaded above. Because that read set
# no 'header' option, the CSV's header line is listed as the first data row.
df.show()

+------------+-----------+------------+-----------+-----------+
|         _c0|        _c1|         _c2|        _c3|        _c4|
+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|          3|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|           5|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|           5|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6| 

In [9]:
# Types and the Row class used by the following cells to build a DataFrame
# with an explicit schema.
# (The former `myRow=Row(Name='BDA',count=1)` assignment was removed: it was
# never read and `myRow` is rebound before its first use.)
from pyspark.sql.types import StructType,StructField,StringType,LongType,FloatType
from pyspark.sql import Row

In [10]:
# Explicit three-field schema; every field is declared nullable.
myschema = StructType([
    StructField('Name', StringType(), nullable=True),
    StructField('REGNumber', LongType(), nullable=True),
    StructField('CGPA', FloatType(), nullable=True),
])

In [14]:
# A single positional record whose values follow myschema's field order
# (Name, REGNumber, CGPA), wrapped in a one-element list to form the DataFrame.
myRow = Row('Abc', 191046030, 9.0)
myDF = spark.createDataFrame(data=[myRow], schema=myschema)

In [15]:
# Confirm the DataFrame carries the explicit schema (string / long / float).
myDF.printSchema()

root
 |-- Name: string (nullable = true)
 |-- REGNumber: long (nullable = true)
 |-- CGPA: float (nullable = true)



In [16]:
from pyspark.sql.functions import col,expr,column,udf

In [17]:
# Load the 2015 flight summary, taking column names from the header line and
# letting Spark infer column types.
# NOTE: this rebinds `df`, which previously referred to the IRIS frame.
df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('2015-summary.csv')
)

In [18]:
# List the column names of the flight-summary frame (read with header='true').
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [23]:
# Preview the first five rows without truncating cell values.
df.show(n=5, truncate=False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India              |62   |
+-----------------+-------------------+-----+
only showing top 5 rows



In [24]:
# Inspect the schema of the frame read with inferSchema='true': 'count' is
# reported as integer and the two country columns as string.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [27]:
# Select DEST_COUNTRY_NAME through a SQL expression that aliases it to 'df'.
# The alias is only the output column's name; it merely happens to match the
# DataFrame variable.
df.select(
    expr('DEST_COUNTRY_NAME as df')
).show(n=5, truncate=False)

+-------------+
|df           |
+-------------+
|United States|
|United States|
|United States|
|Egypt        |
|United States|
+-------------+
only showing top 5 rows



In [26]:
# Project just the 'count' column, referenced by its name.
count_only = df.select("count")
count_only.show(n=5, truncate=False)

+-----+
|count|
+-----+
|15   |
|1    |
|344  |
|15   |
|62   |
+-----+
only showing top 5 rows



In [28]:
# Same projection as selecting by name, but via an explicit Column from col().
df.select(
    col('DEST_COUNTRY_NAME')
).show(n=5, truncate=False)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|United States    |
|United States    |
|United States    |
|Egypt            |
|United States    |
+-----------------+
only showing top 5 rows



In [29]:
# Column selected by plain string name; first five rows, untruncated.
df.select('DEST_COUNTRY_NAME').show(n=5, truncate=False)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|United States    |
|United States    |
|United States    |
|Egypt            |
|United States    |
+-----------------+
only showing top 5 rows



In [30]:
# Select several columns at once by passing each name as its own argument.
df.select(
    'DEST_COUNTRY_NAME',
    'ORIGIN_COUNTRY_NAME',
    'count',
).show(n=5, truncate=False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India              |62   |
+-----------------+-------------------+-----+
only showing top 5 rows



In [31]:
# Register df as the temporary view 'dfTable' so it can be queried by name
# through spark.sql().
df.createOrReplaceTempView('dfTable')

In [32]:
# Query the 'dfTable' view with SQL: rename the destination column and keep
# five rows.
spark.sql(
    'Select DEST_COUNTRY_NAME as Destination from dfTable LIMIT 5'
).show()

+-------------+
|  Destination|
+-------------+
|United States|
|United States|
|United States|
|        Egypt|
|United States|
+-------------+



In [33]:
# Keep every existing column ('*') and append a boolean 'withinCountry' that
# compares destination and origin country.
df.selectExpr(
    '*',
    '(DEST_COUNTRY_NAME=ORIGIN_COUNTRY_NAME) as withinCountry',
).show(n=15, truncate=False)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|United States    |Romania            |15   |false        |
|United States    |Croatia            |1    |false        |
|United States    |Ireland            |344  |false        |
|Egypt            |United States      |15   |false        |
|United States    |India              |62   |false        |
|United States    |Singapore          |1    |false        |
|United States    |Grenada            |62   |false        |
|Costa Rica       |United States      |588  |false        |
|Senegal          |United States      |40   |false        |
|Moldova          |United States      |1    |false        |
|United States    |Sint Maarten       |325  |false        |
|United States    |Marshall Islands   |39   |false        |
|Guyana           |United States      |64   |false        |
|Malta            |United States      |1

In [34]:
from pyspark.sql.types import StringType,IntegerType

In [35]:
# Add the 'withinCountry' flag with withColumn(): the SQL expression compares
# origin and destination country for each row.
df.withColumn(
    'withinCountry',
    expr('ORIGIN_COUNTRY_NAME==DEST_COUNTRY_NAME'),
).show(n=5, truncate=False)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|United States    |Romania            |15   |false        |
|United States    |Croatia            |1    |false        |
|United States    |Ireland            |344  |false        |
|Egypt            |United States      |15   |false        |
|United States    |India              |62   |false        |
+-----------------+-------------------+-----+-------------+
only showing top 5 rows

