In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_number

In [None]:
spark = SparkSession.builder.appName("JoinTables").master('local[*]').getOrCreate()

#### Importing Datasets

In [23]:
stocks = spark.read.csv("stocks_HadoopUC.txt", inferSchema=True, header=True, sep='\t')

In [24]:
stocks.show()

+--------+------+----------+-----+-----+-----+-----+-------+--------+----+
|Exchange|Symbol|      Data| Open| High|  Low|Close| Volumn|AdjClose| _c9|
+--------+------+----------+-----+-----+-----+-----+-------+--------+----+
|    LNSE|   CLI|2009-12-31|35.39| 35.7| 34.5|34.57| 890100|   34.12|null|
|    LNSE|   CLI|2009-12-30|35.22|35.46|34.96| 35.4| 516900|   34.94|null|
|    NYSE|   CLI|2009-12-29|35.69|35.95|35.21|35.34| 556500|   34.88|null|
|    NYSE|   CLI|2009-12-28|35.67|36.23|35.49|35.69| 565000|   35.23|null|
|    NYSE|   CLI|2009-12-24|35.38| 35.6|35.19|35.47| 230200|   35.01|null|
|    NYSE|   CLI|2009-12-23|35.13|35.51|35.07|35.21| 520200|   34.75|null|
|    NYSE|   CLI|2009-12-22|34.76|35.04|34.71|35.04| 564600|   34.58|null|
|    NYSE|   CLI|2009-12-21|34.65|34.74|34.41|34.73| 428400|   34.28|null|
|    NYSE|   CLI|2009-12-18|34.11|34.38|33.73|34.22|1152600|   33.77|null|
|    NYSE|   CLI|2009-12-17|34.18|34.53|33.84|34.21|1082600|   33.76|null|
|    NYSE|   CLI|2009-12-

In [25]:
dividends = spark.read.csv("dividends_HadoopUC.txt", inferSchema=True, header=True, sep='\t')

In [26]:
dividends.show()

+--------+------+----------+--------+
|Exchange|Symbol|      Data|Dividend|
+--------+------+----------+--------+
|    NYSE|   CPO|2009-12-30|    0.14|
|    NYSE|   CPO|2009-09-28|    0.14|
|    NYSE|   CPO|2009-06-26|    0.14|
|    NYSE|   CPO|2009-03-27|    0.14|
|    NYSE|   CPO|2009-01-06|    0.14|
|    NYSE|   CCS|2009-10-28|   0.414|
|    NYSE|   CCS|2009-07-29|   0.414|
|    NYSE|   CCS|2009-04-29|   0.414|
|    NYSE|   CCS|2009-01-28|   0.414|
|    NYSE|   CIF|2009-12-09|   0.029|
|    NYSE|   CIF|2009-11-10|   0.019|
|    NYSE|   CIF|2009-10-13|   0.019|
|    NYSE|   CIF|2009-09-10|   0.019|
|    NYSE|   CIF|2009-08-10|    0.02|
|    NYSE|   CIF|2009-07-13|    0.02|
|    NYSE|   CIF|2009-06-10|    0.02|
|    NYSE|   CIF|2009-05-11|   0.021|
|    NYSE|   CIF|2009-04-13|   0.022|
|    NYSE|   CIF|2009-03-09|   0.022|
|    NYSE|   CIF|2009-02-09|   0.022|
+--------+------+----------+--------+
only showing top 20 rows



#### Join two RDDs

In [27]:
df = stocks.join(dividends,"Symbol")

##### Remove repeated columns

In [28]:
repeated_columns = [c for c in stocks.columns if c in dividends.columns]

for col in repeated_columns:
    df = df.drop(dividends[col])

In [29]:
df.show()

+------+--------+----------+-----+-----+-----+-----+------+--------+----+--------+
|Symbol|Exchange|      Data| Open| High|  Low|Close|Volumn|AdjClose| _c9|Dividend|
+------+--------+----------+-----+-----+-----+-----+------+--------+----+--------+
|   CLI|    LNSE|2009-12-31|35.39| 35.7| 34.5|34.57|890100|   34.12|null|    0.64|
|   CLI|    LNSE|2009-12-31|35.39| 35.7| 34.5|34.57|890100|   34.12|null|    0.45|
|   CLI|    LNSE|2009-12-31|35.39| 35.7| 34.5|34.57|890100|   34.12|null|    0.45|
|   CLI|    LNSE|2009-12-31|35.39| 35.7| 34.5|34.57|890100|   34.12|null|    0.45|
|   CLI|    LNSE|2009-12-30|35.22|35.46|34.96| 35.4|516900|   34.94|null|    0.64|
|   CLI|    LNSE|2009-12-30|35.22|35.46|34.96| 35.4|516900|   34.94|null|    0.45|
|   CLI|    LNSE|2009-12-30|35.22|35.46|34.96| 35.4|516900|   34.94|null|    0.45|
|   CLI|    LNSE|2009-12-30|35.22|35.46|34.96| 35.4|516900|   34.94|null|    0.45|
|   CLI|    NYSE|2009-12-29|35.69|35.95|35.21|35.34|556500|   34.88|null|    0.64|
|   

#### Selecting required columns

In [30]:
final = df.select("Exchange" , "Symbol", "Dividend", "Close", "Volumn")

In [31]:
final.show()

+--------+------+--------+-----+------+
|Exchange|Symbol|Dividend|Close|Volumn|
+--------+------+--------+-----+------+
|    LNSE|   CLI|    0.64|34.57|890100|
|    LNSE|   CLI|    0.45|34.57|890100|
|    LNSE|   CLI|    0.45|34.57|890100|
|    LNSE|   CLI|    0.45|34.57|890100|
|    LNSE|   CLI|    0.64| 35.4|516900|
|    LNSE|   CLI|    0.45| 35.4|516900|
|    LNSE|   CLI|    0.45| 35.4|516900|
|    LNSE|   CLI|    0.45| 35.4|516900|
|    NYSE|   CLI|    0.64|35.34|556500|
|    NYSE|   CLI|    0.45|35.34|556500|
|    NYSE|   CLI|    0.45|35.34|556500|
|    NYSE|   CLI|    0.45|35.34|556500|
|    NYSE|   CLI|    0.64|35.69|565000|
|    NYSE|   CLI|    0.45|35.69|565000|
|    NYSE|   CLI|    0.45|35.69|565000|
|    NYSE|   CLI|    0.45|35.69|565000|
|    NYSE|   CLI|    0.64|35.47|230200|
|    NYSE|   CLI|    0.45|35.47|230200|
|    NYSE|   CLI|    0.45|35.47|230200|
|    NYSE|   CLI|    0.45|35.47|230200|
+--------+------+--------+-----+------+
only showing top 20 rows



#### Aggregating and Formatting according to requisition

In [32]:

final_groupped = final.groupBy("Exchange","Symbol").avg("Dividend","Close", "Volumn")

In [33]:
final_group = final_groupped.select(final_groupped["Exchange"],
                                    final_groupped["Symbol"],
                                    format_number(final_groupped["avg(Dividend)"].cast('float'),3).alias("AverageDividend"),
                                    format_number(final_groupped["avg(Close)"].cast('float'),3).alias("AverageClosingPrice"),
                                    format_number(final_groupped["avg(Volumn)"].cast('float'),3).alias("AverageVolume"))

In [34]:
final_group.show()

+--------+------+---------------+-------------------+--------------+
|Exchange|Symbol|AverageDividend|AverageClosingPrice| AverageVolume|
+--------+------+---------------+-------------------+--------------+
|    NYSE|   CWZ|          1.109|             17.655|     3,136.905|
|    NYSE|   CBK|          0.060|              5.925|   234,022.531|
|    NYSE|   CNX|          0.100|             37.358| 3,811,331.500|
|    NYSE|   CLB|          0.287|             88.926|   288,011.469|
|    NYSE|   CAG|          0.192|             19.259| 4,288,458.000|
|    NYSE|   CRT|          0.157|             27.017|    25,298.809|
|    NYSE|   COL|          0.240|             43.215| 1,305,699.250|
|    NYSE|   CMI|          0.175|             36.819| 3,068,422.500|
|    NYSE|  CATO|          0.165|             18.199|   276,529.750|
|    NYSE|   CIM|          0.108|              3.545| 7,106,761.000|
|    NYSE|   CRS|          0.180|             20.336|   650,623.000|
|    NYSE|   CYT|          0.041| 

#### Exporting data to csv

In [36]:
df_panda = final_group.toPandas()

In [37]:
df_panda.to_csv("stocks-dividends.csv", index=False)