In [2]:
from pyspark.sql import SparkSession

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/evivancovid/warehouse"). \
    enableHiveSupport(). \
    appName(f'evivancovid | Python - Data Processing - Overview'). \
    master('yarn'). \
    getOrCreate()

## Problem 1

In [2]:
df = spark.read.csv("/public/data/nyse_years/*", 
                   schema = "stockticker STRING, tradedate STRING, openprice FLOAT, highprice FLOAT, lowprice FLOAT, closeprice FLOAT, volume INT")

In [3]:
count_list =[(df.count(),)]

traded_stocks = spark.createDataFrame(count_list).toDF("traded_stocks")

In [4]:
type(traded_stocks)

pyspark.sql.dataframe.DataFrame

In [34]:
traded_stocks.coalesce(1).write.csv("/user/evivancovid/spark_practice/problem1/data/nyse_count")

AnalysisException: path hdfs://localhost:9000/user/evivancovid/spark_practice/problem1/data/nyse_count already exists.

In [35]:
%%sh

hdfs dfs -ls /user/evivancovid/spark_practice/problem1/data/nyse_count

Found 2 items
-rw-r--r--   1 evivancovid supergroup          0 2022-05-20 00:15 /user/evivancovid/spark_practice/problem1/data/nyse_count/_SUCCESS
-rw-r--r--   1 evivancovid supergroup          8 2022-05-20 00:15 /user/evivancovid/spark_practice/problem1/data/nyse_count/part-00000-3e7efb14-11ea-443d-847c-ddb7877fb5db-c000.csv


## Problem 2

In [36]:
p2DF = spark.read.csv("/public/data/nyse_years/NYSE_2010.txt", schema = "stockticker STRING, tradedate STRING, openprice FLOAT, highprice FLOAT, lowprice FLOAT, closeprice FLOAT, volume INT")

In [51]:
p2DF_stockticker = p2DF.select("stockticker").distinct().orderBy("stockticker")

In [54]:
p2DF_stockticker.coalesce(1).write.csv("/user/evivancovid/spark_practice/problem2/data/unique_stocks")

In [55]:
%%sh

hdfs dfs -ls /user/evivancovid/spark_practice/problem2/data/unique_stocks

Found 2 items
-rw-r--r--   1 evivancovid supergroup          0 2022-05-20 00:29 /user/evivancovid/spark_practice/problem2/data/unique_stocks/_SUCCESS
-rw-r--r--   1 evivancovid supergroup       8055 2022-05-20 00:29 /user/evivancovid/spark_practice/problem2/data/unique_stocks/part-00000-4785bccf-39b1-4895-bcf0-7a29fe11b435-c000.csv


## Problem 3

In [4]:
path = "/public/data/nyse_years/*"

In [7]:
nyse_data_all = spark.read.csv(path, schema = "stockticker STRING, tradedate INT, openprice FLOAT, highprice FLOAT, lowprice FLOAT, closeprice FLOAT, volume LONG")

In [8]:
nyse_data_all.show()

+-----------+---------+---------+---------+--------+----------+------+
|stockticker|tradedate|openprice|highprice|lowprice|closeprice|volume|
+-----------+---------+---------+---------+--------+----------+------+
|          A| 20160101|    41.81|    41.81|   41.81|     41.81|     0|
|         AA| 20160101|    29.61|    29.61|   29.61|     29.61|     0|
|        AAC| 20160101|    19.06|    19.06|   19.06|     19.06|     0|
|        AAN| 20160101|    22.39|    22.39|   22.39|     22.39|     0|
|        AAP| 20160101|   150.51|   150.51|  150.51|    150.51|     0|
|        AAT| 20160101|    38.35|    38.35|   38.35|     38.35|     0|
|        AAV| 20160101|     5.08|     5.08|    5.08|      5.08|     0|
|         AB| 20160101|    23.85|    23.85|   23.85|     23.85|     0|
|        ABB| 20160101|    17.73|    17.73|   17.73|     17.73|     0|
|       ABBV| 20160101|    59.24|    59.24|   59.24|     59.24|     0|
|        ABC| 20160101|   103.71|   103.71|  103.71|    103.71|     0|
|     

In [24]:
nyse_data_close = nyse_data_all.select("stockticker","closeprice")

In [29]:
nyse_data_close.repartition(8).write.json("/user/evivancovid/spark_practice/problem3/data/nyse_data_json")

In [30]:
%%sh

hdfs dfs -ls /user/evivancovid/spark_practice/problem3/data/nyse_data_json

Found 9 items
-rw-r--r--   1 evivancovid supergroup          0 2022-05-20 00:51 /user/evivancovid/spark_practice/problem3/data/nyse_data_json/_SUCCESS
-rw-r--r--   1 evivancovid supergroup   47976393 2022-05-20 00:51 /user/evivancovid/spark_practice/problem3/data/nyse_data_json/part-00000-11ab0989-8cd9-4bf4-bf92-1b82ece77196-c000.json
-rw-r--r--   1 evivancovid supergroup   47974676 2022-05-20 00:51 /user/evivancovid/spark_practice/problem3/data/nyse_data_json/part-00001-11ab0989-8cd9-4bf4-bf92-1b82ece77196-c000.json
-rw-r--r--   1 evivancovid supergroup   47975921 2022-05-20 00:51 /user/evivancovid/spark_practice/problem3/data/nyse_data_json/part-00002-11ab0989-8cd9-4bf4-bf92-1b82ece77196-c000.json
-rw-r--r--   1 evivancovid supergroup   47976439 2022-05-20 00:51 /user/evivancovid/spark_practice/problem3/data/nyse_data_json/part-00003-11ab0989-8cd9-4bf4-bf92-1b82ece77196-c000.json
-rw-r--r--   1 evivancovid supergroup   47975986 2022-05-20 00:51 /user/evivancovid/spark_practice/proble

## Problem 5

In [36]:
path = "/public/data/nyse_years/*"

path2 = "/public/data/nyse_stocks/*"

nyse_data_all = spark.read.csv(path, schema = "stockticker STRING, tradedate INT, openprice FLOAT, highprice FLOAT, lowprice FLOAT, closeprice FLOAT, volume LONG")
nyse_meta = spark.read.csv(path2, sep = "|", header = False, schema = "stockticker_meta STRING, stockname STRING, openprice FLOAT, volume_meta FLOAT, smthg1 FLOAT, smthg2  FLOAT, industry STRING, business STRING, webpage STRING")

In [37]:
nyse_joined = nyse_data_all.join(nyse_meta, on = nyse_data_all.stockticker == nyse_meta.stockticker_meta, how = "outer")

In [38]:
never_traded_stocks = nyse_joined.select("stockname, volume").distinct().where("volume == 0").orderBy("stockname")

In [39]:
never_traded_stocks

stockname
""
3D Systems Corpor...
3M Company
A.H. Belo Corpora...
"A10 Networks, Inc."
AAR Corp.
ABB Ltd
ABM Industries In...
AGCO Corporation
AK Steel Holding ...


In [40]:
never_traded_stocks.repartition(1).write.text("/user/evivancovid/spark_practice/problem5/data/untraded_stocks")

In [41]:
%%sh

hdfs dfs -ls /user/evivancovid/spark_practice/problem5/data/untraded_stocks

Found 2 items
-rw-r--r--   1 evivancovid supergroup          0 2022-05-24 14:41 /user/evivancovid/spark_practice/problem5/data/untraded_stocks/_SUCCESS
-rw-r--r--   1 evivancovid supergroup      54535 2022-05-24 14:41 /user/evivancovid/spark_practice/problem5/data/untraded_stocks/part-00000-68616906-131d-4d41-899e-e45dff38f6ad-c000.txt


## Problem 4

In [6]:
path = "/public/data/nyse_years/*"

path2 = "/public/data/nyse_stocks/*"

nyse_data_all = spark.read.csv(path, schema = "stockticker STRING, tradedate INT, openprice FLOAT, highprice FLOAT, lowprice FLOAT, closeprice FLOAT, volume LONG")
nyse_meta = spark.read.csv(path2, sep = "|", header = False, schema = "stockticker_meta STRING, stockname STRING, openprice FLOAT, volume FLOAT, smthg1 FLOAT, smthg2  FLOAT, industry STRING, business STRING, webpage STRING")

In [7]:
nyse_data_all.show()

In [8]:
nyse_joined = nyse_data_all.join(nyse_meta, on = nyse_data_all.stockticker == nyse_meta.stockticker_meta, how = "outer")

In [9]:
nyse_joined

stockticker,tradedate,openprice,highprice,lowprice,closeprice,volume,stockticker_meta,stockname,openprice.1,volume.1,smthg1,smthg2,industry,business,webpage
BOX,20130101,18.85,18.85,18.85,18.85,0,,,,,,,,,
BOX,20130102,19.22,19.37,19.1,19.24,87700,,,,,,,,,
BOX,20130103,19.27,19.27,19.0,19.12,36100,,,,,,,,,
BOX,20130104,19.12,19.35,19.07,19.16,47600,,,,,,,,,
BOX,20130107,19.14,19.28,19.0,19.04,63600,,,,,,,,,
BOX,20130108,19.15,19.23,19.0,19.17,53600,,,,,,,,,
BOX,20130109,19.28,19.57,19.1,19.42,76100,,,,,,,,,
BOX,20130110,19.58,19.61,19.31,19.48,73100,,,,,,,,,
BOX,20130111,19.52,19.64,19.43,19.55,71500,,,,,,,,,
BOX,20130114,19.55,19.62,19.36,19.48,44800,,,,,,,,,


In [11]:
no_stock_name = nyse_joined.select("stockticker").distinct().where("stockname is null").orderBy("stockticker")

In [12]:
no_stock_name

stockticker
AA-B
AAC
ABR-A
ABR-B
ABR-C
AC
ACV
ADNT
ADPT
ADSW


In [20]:
no_stock_name.repartition(1).write.text("/user/evivancovid/spark_practice/problem4/data/no_stock_names")

In [21]:
%%sh

hdfs dfs -ls /user/evivancovid/spark_practice/problem4/data/no_stock_names

Found 2 items
-rw-r--r--   1 evivancovid supergroup          0 2022-05-20 01:31 /user/evivancovid/spark_practice/problem4/data/no_stock_names/_SUCCESS
-rw-r--r--   1 evivancovid supergroup       4814 2022-05-20 01:31 /user/evivancovid/spark_practice/problem4/data/no_stock_names/part-00000-b31137c7-048f-47ea-9047-ed14f98f8c9b-c000.txt


In [24]:
help(nyse_data_all.distinct)

Help on method distinct in module pyspark.sql.dataframe:

distinct() method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.
    
    .. versionadded:: 1.3.0
    
    Examples
    --------
    >>> df.distinct().count()
    2



## Problem 6

In [20]:
path = "/public/data/nyse_years/*"

path2 = "/public/data/nyse_stocks/*"

nyse_data_all = spark.read.csv(path, schema = "stockticker STRING, tradedate STRING, openprice STRING, highprice STRING, lowprice STRING, closeprice STRING, volume STRING")
nyse_meta = spark.read.csv(path2, sep = "|", header = False, schema = "stockticker_meta STRING, stockname STRING, open_price FLOAT, volume_meta FLOAT, smthg1 FLOAT, smthg2  FLOAT, industry STRING, business STRING, webpage STRING")

In [21]:
nyse_joined = nyse_data_all.join(nyse_meta, on = nyse_data_all.stockticker == nyse_meta.stockticker_meta, how = "outer")

In [57]:
from pyspark.sql.functions import expr, concat, lit

name_na_df = nyse_joined.select("*"). \
    withColumn('stockname', expr("nvl(nullif(stockname, ''), 'Stock Name Not Available')")). \
    drop("stockticker_meta", "open_price", "volume_meta", "smthg1", "smthg2", "industry", "business", "webpage"). \
    orderBy("tradedate", "stockticker")

In [59]:
name_na_df.select(concat("stockticker", lit(";"),"tradedate", lit(";"),"openprice", lit(";"),"highprice", lit(";"),"lowprice", lit(";"),"closeprice", lit(";"),"volume", lit(";"),"stockname")).collect().toDF("name_na")

Py4JJavaError: An error occurred while calling o361.collectToPython.
: java.lang.OutOfMemoryError: GC overhead limit exceeded
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.next(SparkPlan.scala:375)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.next(SparkPlan.scala:369)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.foreach(SparkPlan.scala:369)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollect$1(SparkPlan.scala:391)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollect$1$adapted(SparkPlan.scala:390)
	at org.apache.spark.sql.execution.SparkPlan$$Lambda$3119/21514368.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:390)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3519)
	at org.apache.spark.sql.Dataset$$Lambda$3075/1391935165.apply(Unknown Source)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.Dataset$$Lambda$1641/1433871220.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$1649/1497936359.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$1642/2093425166.apply(Unknown Source)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3516)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)


In [53]:
name_na

AttributeError: 'DataFrameWriter' object has no attribute 'toDF'

In [72]:
name_not_available.repartition(8).write.text("/user/evivancovid/spark_practice/problem6/data/stock_data_with_names", lineSep = ";")

AnalysisException: Text data source supports only a single column, and you have 8 columns.

In [58]:
help(df.orderBy)

Help on method sort in module pyspark.sql.dataframe:

sort(*cols, **kwargs) method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` sorted by the specified column(s).
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : str, list, or :class:`Column`, optional
         list of :class:`Column` or column names to sort by.
    
    Other Parameters
    ----------------
    ascending : bool or list, optional
        boolean or list of boolean (default ``True``).
        Sort ascending vs. descending. Specify list for multiple sort orders.
        If a list is specified, length of the list must equal length of the `cols`.
    
    Examples
    --------
    >>> df.sort(df.age.desc()).collect()
    [Row(age=5, name='Bob'), Row(age=2, name='Alice')]
    >>> df.sort("age", ascending=False).collect()
    [Row(age=5, name='Bob'), Row(age=2, name='Alice')]
    >>> df.orderBy(df.age.desc()).collect()
    [Row(age=5, name='Bob'), Row(age=2, na

## Problem 7

In [3]:
path = "/public/data/nyse_years/*"

nyse_data_all = spark.read.csv(path, schema = "stockticker STRING, tradedate INT, openprice FLOAT, highprice FLOAT, lowprice FLOAT, closeprice FLOAT, volume LONG")

In [4]:
spark.sql("""USE evivancovid_nyvenv""")

In [5]:
from pyspark.sql.functions import col
from pyspark.sql.avro.functions import from_avro, to_avro

sorted_data_nyse = nyse_data_all.orderBy(col("tradedate"), col("volume").desc())

In [6]:
sorted_data_nyse.repartition(4).write.format("avro").saveAsTable("nyse_data_avro", mode = "overwrite")

AnalysisException: Failed to find data source: avro. Avro is built-in but external data source module since Spark 2.4. Please deploy the application as per the deployment section of "Apache Avro Data Source Guide".

In [10]:
help(to_avro)

Help on function to_avro in module pyspark.sql.avro.functions:

to_avro(data, jsonFormatSchema='')
    Converts a column into binary of avro format.
    
    .. versionadded:: 3.0.0
    
    Parameters
    ----------
    data : :class:`~pyspark.sql.Column` or str
        the data column.
    jsonFormatSchema : str, optional
        user-specified output avro schema in JSON string format.
    
    Notes
    -----
    Avro is built-in but external data source module since Spark 2.4. Please deploy the
    application as per the deployment section of "Apache Avro Data Source Guide".
    
    Examples
    --------
    >>> from pyspark.sql import Row
    >>> from pyspark.sql.avro.functions import to_avro
    >>> data = ['SPADES']
    >>> df = spark.createDataFrame(data, "string")
    >>> df.select(to_avro(df.value).alias("suite")).collect()
    [Row(suite=bytearray(b'\x00\x0cSPADES'))]
    
    >>> jsonFormatSchema = '''["null", {"type": "enum", "name": "value",
    ...     "symbols": ["SPAD

Object `cast` not found.
