#### Importing Libraries

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_number

#### Starting Spark Session

In [2]:
spark = SparkSession.builder.appName("avg_wights_of_American_cars").master('local[*]').getOrCreate()

In [3]:
data = spark.read.csv("./data/cars.tsv", sep='\t', inferSchema=True, header=True)

In [4]:
data.show()

+---------+--------------------+---+---------+-----------+----------+------+----------+----+--------+
|     make|               Model|MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+---------+--------------------+---+---------+-----------+----------+------+----------+----+--------+
|      amc|  amc ambassador dpl| 15|        8|        390|       190|  3850|       8.5|  70|American|
|      amc|         amc gremlin| 21|        6|        199|        90|  2648|      15.0|  70|American|
|      amc|          amc hornet| 18|        6|        199|        97|  2774|      15.5|  70|American|
|      amc|       amc rebel sst| 16|        8|        304|       150|  3433|      12.0|  70|American|
|    buick|buick estate wago...| 14|        8|        455|       225|  3086|      10.0|  70|American|
|    buick|   buick skylark 320| 15|        8|        350|       165|  3693|      11.5|  70|American|
|chevrolet|chevrolet chevell...| 18|        8|        307|       130|  3504|      

#### Filtering only American Cars

In [8]:
american_cars = data.filter(data.Origin == "American")

In [23]:
american_cars.show(truncate=False)

+---------+-------------------------+---+---------+-----------+----------+------+----------+----+--------+
|make     |Model                    |MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|Origin  |
+---------+-------------------------+---+---------+-----------+----------+------+----------+----+--------+
|amc      |amc ambassador dpl       |15 |8        |390        |190       |3850  |8.5       |70  |American|
|amc      |amc gremlin              |21 |6        |199        |90        |2648  |15.0      |70  |American|
|amc      |amc hornet               |18 |6        |199        |97        |2774  |15.5      |70  |American|
|amc      |amc rebel sst            |16 |8        |304        |150       |3433  |12.0      |70  |American|
|buick    |buick estate wagon (sw)  |14 |8        |455        |225       |3086  |10.0      |70  |American|
|buick    |buick skylark 320        |15 |8        |350        |165       |3693  |11.5      |70  |American|
|chevrolet|chevrolet chevelle malibu|

#### Average weight of each brands

In [20]:
avg_weight_by_brand = american_cars.groupBy("make").avg("Weight")

In [21]:
avg_weight_by_brand = avg_weight_by_brand.select(avg_weight_by_brand["make"],
                           format_number(avg_weight_by_brand["avg(Weight)"].cast('int'),0).alias("Average Weight")
                           )

In [22]:
avg_weight_by_brand.show()

+---------+--------------+
|     make|Average Weight|
+---------+--------------+
|    buick|         3,760|
|  pontiac|         4,327|
|      amc|         3,239|
| plymouth|         3,382|
|    chevy|         4,376|
|    dodge|         4,300|
|     ford|         3,791|
|chevrolet|         3,460|
|  mercury|         2,220|
|       hi|         4,732|
+---------+--------------+



## Using RDD

In [28]:
rdd = data.rdd

In [29]:
rdd.collect()

[Row(make='amc', Model='amc ambassador dpl', MPG=15, Cylinders=8, Engine Disp=390, Horsepower=190, Weight=3850, Accelerate=8.5, Year=70, Origin='American'),
 Row(make='amc', Model='amc gremlin', MPG=21, Cylinders=6, Engine Disp=199, Horsepower=90, Weight=2648, Accelerate=15.0, Year=70, Origin='American'),
 Row(make='amc', Model='amc hornet', MPG=18, Cylinders=6, Engine Disp=199, Horsepower=97, Weight=2774, Accelerate=15.5, Year=70, Origin='American'),
 Row(make='amc', Model='amc rebel sst', MPG=16, Cylinders=8, Engine Disp=304, Horsepower=150, Weight=3433, Accelerate=12.0, Year=70, Origin='American'),
 Row(make='buick', Model='buick estate wagon (sw)', MPG=14, Cylinders=8, Engine Disp=455, Horsepower=225, Weight=3086, Accelerate=10.0, Year=70, Origin='American'),
 Row(make='buick', Model='buick skylark 320', MPG=15, Cylinders=8, Engine Disp=350, Horsepower=165, Weight=3693, Accelerate=11.5, Year=70, Origin='American'),
 Row(make='chevrolet', Model='chevrolet chevelle malibu', MPG=18, C

In [30]:
americanCars = rdd.filter(lambda row : row["Origin"]=='American')

In [31]:
americanCars.collect()

[Row(make='amc', Model='amc ambassador dpl', MPG=15, Cylinders=8, Engine Disp=390, Horsepower=190, Weight=3850, Accelerate=8.5, Year=70, Origin='American'),
 Row(make='amc', Model='amc gremlin', MPG=21, Cylinders=6, Engine Disp=199, Horsepower=90, Weight=2648, Accelerate=15.0, Year=70, Origin='American'),
 Row(make='amc', Model='amc hornet', MPG=18, Cylinders=6, Engine Disp=199, Horsepower=97, Weight=2774, Accelerate=15.5, Year=70, Origin='American'),
 Row(make='amc', Model='amc rebel sst', MPG=16, Cylinders=8, Engine Disp=304, Horsepower=150, Weight=3433, Accelerate=12.0, Year=70, Origin='American'),
 Row(make='buick', Model='buick estate wagon (sw)', MPG=14, Cylinders=8, Engine Disp=455, Horsepower=225, Weight=3086, Accelerate=10.0, Year=70, Origin='American'),
 Row(make='buick', Model='buick skylark 320', MPG=15, Cylinders=8, Engine Disp=350, Horsepower=165, Weight=3693, Accelerate=11.5, Year=70, Origin='American'),
 Row(make='chevrolet', Model='chevrolet chevelle malibu', MPG=18, C

In [32]:
weightByBrand = americanCars.groupBy(lambda row : row["make"])

In [34]:
weightByBrand.collect()

[('amc', <pyspark.resultiterable.ResultIterable at 0x1d5384a9670>),
 ('buick', <pyspark.resultiterable.ResultIterable at 0x1d5384a9f40>),
 ('chevrolet', <pyspark.resultiterable.ResultIterable at 0x1d5384a9e80>),
 ('chevy', <pyspark.resultiterable.ResultIterable at 0x1d5384a9a90>),
 ('dodge', <pyspark.resultiterable.ResultIterable at 0x1d5384a95e0>),
 ('ford', <pyspark.resultiterable.ResultIterable at 0x1d536c21b80>),
 ('hi', <pyspark.resultiterable.ResultIterable at 0x1d538457cd0>),
 ('plymouth', <pyspark.resultiterable.ResultIterable at 0x1d538457430>),
 ('pontiac', <pyspark.resultiterable.ResultIterable at 0x1d538457820>),
 ('mercury', <pyspark.resultiterable.ResultIterable at 0x1d538457910>)]

In [57]:
from numpy import mean

avgWeightByBrand = weightByBrand.map(lambda x:(x[0], int(mean([y["Weight"] for y in list(x[1])]))))

In [59]:
avgWeightByBrand.collect()

[('amc', 3239),
 ('buick', 3760),
 ('chevrolet', 3460),
 ('chevy', 4376),
 ('dodge', 4300),
 ('ford', 3791),
 ('hi', 4732),
 ('plymouth', 3382),
 ('pontiac', 4327),
 ('mercury', 2220)]

#### ending session

In [60]:
spark.stop()