# Create SparkContext, SparkSession instances

In [1]:
!pip install pyspark

import pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 59.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=b0d8f14f95f982b48cea42961cc8d0116a35fcb1a2db02f66aac172639ac5049
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
from pyspark import SparkConf, SparkContext

from pyspark.sql import SparkSession

In [3]:
# Only one SparkContext may be active per process, so calling the constructor
# directly fails when this cell is re-run. getOrCreate() reuses the live
# context/session if there is one and creates it otherwise.
sc = SparkContext.getOrCreate(conf=SparkConf())

# The builder picks up the SparkContext obtained above.
spark = SparkSession.builder.getOrCreate()

# Read tabular data

In [4]:
# Load the CSV as a DataFrame: the first line supplies the column names and
# Spark infers each column's type from the data.
mtcars = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/mtcars.csv')
)

In [6]:
# Print the first 20 rows (the default row count) as a text table.
mtcars.show(n=20)

+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|          Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|      Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|     Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
|            Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|
|           Merc 280|19.2|  6|167.6|123|3.92| 3.44| 18.3|  1|  0|   4|   4|
|          M

In [None]:
# Fetch the first 5 rows; being the cell's last expression, they are displayed.
mtcars.head(n=5)

In [None]:
# Print only the first 3 rows as a text table.
mtcars.show(n=3)

## Rename individual column

In [None]:
# Rename the first column by position rather than by the hard-coded name
# '_c0'. withColumnRenamed silently does nothing when the old name is absent,
# and with header=True the first column of this file is already named by the
# CSV header, so the literal '_c0' never matched.
mtcars = mtcars.withColumnRenamed(mtcars.columns[0], 'rown_ames')
mtcars.show(3)

## Rename multiple columns

In [None]:
# Build the replacement names by prefixing every current column with 'x_'.
new_col_names = [f'x_{col_name}' for col_name in mtcars.columns]
new_col_names

In [None]:
# DataFrame.toDF(*names) renames every column in one step. Going through
# .rdd first would convert each row to a Python object and rebuild the
# DataFrame, which is unnecessary work for a pure rename.
mtcars = mtcars.toDF(*new_col_names)
mtcars.show(3)

## Read non-tabular data

In [None]:
# Read the text file through the SparkContext as an RDD.
amazon = sc.textFile(name='/content/amazon.txt')

# Peek at the first 5 elements of the RDD.
amazon.take(num=5)

# Export data

In [None]:
from pyspark.sql import DataFrameWriter

Before we write the data to disk, we need to coalesce it into a single partition. Otherwise, Spark writes one output file per partition.

In [None]:
# Collapse the DataFrame into a single partition before it is written out.
mtcars = mtcars.coalesce(1)

In [7]:
# mode='overwrite' replaces output left behind by a previous run. The default
# save mode raises an error when the target path already exists, so without
# it this cell fails the second time the notebook is executed.
mtcars.write.csv('/content/saved-mtcars',
                 mode='overwrite',
                 header=True)

In [None]:
# `twitter` is not defined anywhere earlier in this notebook, so the original
# `twitter.coalesce(...)` raised NameError. The text RDD loaded earlier is
# `amazon`; derive the single-partition RDD from it.
# NOTE(review): confirm `amazon` is the intended source for this export.
twitter = amazon.coalesce(numPartitions=1)

In [None]:
# Write the RDD out as plain text at the given output path.
twitter.saveAsTextFile(path='data/saved-twitter')