## Starting a simple spark session

In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('test').getOrCreate()

## Loading the csv file to spark

In [2]:
df = spark.read.csv("walmart_stock.csv", header=True, inferSchema = True)
df.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+----------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



## Count and column names of the dataframe

In [3]:
print ("The number of records in dataframe are:",df.count())
print ("The columns of the dataframe are      :",df.columns)

The number of records in dataframe are: 1258
The columns of the dataframe are      : ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']


## Schema of the dataframe

In [4]:
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



## Displaying first 5 rows and 5 columns

In [5]:
df_cols = df.columns
cols=df_cols[0:5]
df.select(['Date','Open','High','Low','Close']).show(5)

+----------+------------------+---------+---------+------------------+
|      Date|              Open|     High|      Low|             Close|
+----------+------------------+---------+---------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18|
+----------+------------------+---------+---------+------------------+
only showing top 5 rows



In [26]:
df.select(['Date','Open','High','Low','Close']).collect()[0:5]

[Row(Date=datetime.date(2012, 1, 3), Open=59.970001, High=61.060001, Low=59.869999, Close=60.330002),
 Row(Date=datetime.date(2012, 1, 4), Open=60.209998999999996, High=60.349998, Low=59.470001, Close=59.709998999999996),
 Row(Date=datetime.date(2012, 1, 5), Open=59.349998, High=59.619999, Low=58.369999, Close=59.419998),
 Row(Date=datetime.date(2012, 1, 6), Open=59.419998, High=59.450001, Low=58.869999, Close=59.0),
 Row(Date=datetime.date(2012, 1, 9), Open=59.029999, High=59.549999, Low=58.919998, Close=59.18)]

## Type casting to Integer 

In [27]:
df.describe()

DataFrame[summary: string, Open: string, High: string, Low: string, Close: string, Volume: string, Adj Close: string]

In [31]:
from pyspark.sql.functions import format_number

result = df.describe()
result

result.select(result['summary'],
             format_number(result['Open'].cast('float'),2).alias('Open'),
             format_number(result['High'].cast('float'),2).alias('High'),
             format_number(result['Low'].cast('float'),2).alias('Low'),
             format_number(result['Close'].cast('float'),2).alias('Close'),
             format_number(result['Volume'].cast('float'),2).alias('Volume')).show()


+-------+--------+--------+--------+--------+-------------+
|summary|    Open|    High|     Low|   Close|       Volume|
+-------+--------+--------+--------+--------+-------------+
|  count|1,258.00|1,258.00|1,258.00|1,258.00|     1,258.00|
|   mean|   72.36|   72.84|   71.92|   72.39| 8,222,093.50|
| stddev|    6.77|    6.77|    6.74|    6.76| 4,519,781.00|
|    min|   56.39|   57.06|   56.30|   56.42| 2,094,900.00|
|    max|   90.80|   90.97|   89.25|   90.47|80,898,096.00|
+-------+--------+--------+--------+--------+-------------+



## Creating new column HV Ratio that is ratio of High Price vs Volume of stock traded for a day

In [59]:
df_HV_Ratio=df.withColumn("HV Ratio",df['High']/df['Volume'])
df_HV_Ratio.select("HV Ratio").show(5)


+--------------------+
|            HV Ratio|
+--------------------+
|4.819714653321546E-6|
|6.290848613094555E-6|
|4.669412994783916E-6|
|7.367338463826307E-6|
|8.915604778943901E-6|
+--------------------+
only showing top 5 rows



## What day had the high Peak in price

In [78]:
print(df.orderBy(df['High'].desc()).show(1))   # using orderBy and show
print(df.orderBy(df['High'].desc()).head(1))   # using orderBy and head to display the rowset
df.describe().show()                           # using describe() get the max High
df.filter("High = 90.970001").show()           # use the Max High value in the df.filter() functions


+----------+---------+---------+-----+---------+-------+---------+
|      Date|     Open|     High|  Low|    Close| Volume|Adj Close|
+----------+---------+---------+-----+---------+-------+---------+
|2015-01-13|90.800003|90.970001|88.93|89.309998|8215400|83.825448|
+----------+---------+---------+-----+---------+-------+---------+
only showing top 1 row

None
[Row(Date=datetime.date(2015, 1, 13), Open=90.800003, High=90.970001, Low=88.93, Close=89.309998, Volume=8215400, Adj Close=83.825448)]
+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean| 72.35785375357

## Mean of Close column

In [84]:
df.describe().show()                         # Using describe() we can get the mean data

from pyspark.sql.functions import mean       # Other way is import mean function and use mean() for the column
df.select(mean("Close")).show()

+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean| 72.35785375357709|72.83938807631165| 71.9186009594594|72.38844998012726|8222093.481717011|67.23883848728146|
| stddev|  6.76809024470826|6.768186808159218|6.744075756255496|6.756859163732991|  4519780.8431556|6.722609449996857|
|    min|56.389998999999996|        57.060001|        56.299999|        56.419998|          2094900|        50.363689|
|    max|         90.800003|        90.970001|            89.25|        90.470001|         80898100|84.91421600000001|
+-------+------------------+-----------------+--

In [89]:
## MAX and MIN of Volumne Column

df.describe().show()

from pyspark.sql.functions import min, max
df.select(min('Volume')).show()
df.select(max('Volume')).show()

+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean| 72.35785375357709|72.83938807631165| 71.9186009594594|72.38844998012726|8222093.481717011|67.23883848728146|
| stddev|  6.76809024470826|6.768186808159218|6.744075756255496|6.756859163732991|  4519780.8431556|6.722609449996857|
|    min|56.389998999999996|        57.060001|        56.299999|        56.419998|          2094900|        50.363689|
|    max|         90.800003|        90.970001|            89.25|        90.470001|         80898100|84.91421600000001|
+-------+------------------+-----------------+--

In [108]:
## How many days Close less than 60 dollars

from pyspark.sql.functions import count
print(df.filter('Close < 60').count())

print(df.filter(df['Close']<60).count())

result = df.filter(df['Close']<60)
result.select(count('Close')).show()

81
81
+------------+
|count(Close)|
+------------+
|          81|
+------------+



## What % of High greater than 80 dollars
## (Number of days > 80) / (Total days)

In [109]:
df.filter('High > 80').count()

115

In [110]:
df.count()

1258

In [112]:
result = (df.filter('High > 80').count())/ (df.count())
result*100

9.141494435612083

## Correlation between HIgh and Volume

In [115]:
from pyspark.sql.functions import corr
df.select(corr('High','Volume')).show()

+-------------------+
| corr(High, Volume)|
+-------------------+
|-0.3384326061737161|
+-------------------+



## Max high per year

In [130]:
from pyspark.sql.functions import year
yearDf=df.withColumn("Year",year(df['Date']))
yearDf.show(5)


+----------+------------------+---------+---------+------------------+--------+------------------+----+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|Year|
+----------+------------------+---------+---------+------------------+--------+------------------+----+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|2012|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|2012|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|2012|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|2012|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|2012|
+----------+------------------+---------+---------+------------------+--------+------------------+----+
only showing top 5 rows



In [131]:
max_df = yearDf.groupBy("Year").max()
max_df.select('Year','max(High)').show()

+----+---------+
|Year|max(High)|
+----+---------+
|2015|90.970001|
|2013|81.370003|
|2014|88.089996|
|2012|77.599998|
|2016|75.190002|
+----+---------+



## Avg close for each month
## Get the Avg Close value for Jan, Feb, Mar etc.. 

In [136]:
from pyspark.sql.functions import month

monthdf=df.withColumn("Month",month(df['Date']))
monthdf.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+-----+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|Month|
+----------+------------------+---------+---------+------------------+--------+------------------+-----+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|    1|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|    1|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|    1|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|    1|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|    1|
+----------+------------------+---------+---------+------------------+--------+------------------+-----+
only showing top 5 rows



In [142]:
mdf = monthdf.groupBy("Month").avg()
mdf.select('Month','avg(Close)').orderBy('Month').show()

+-----+-----------------+
|Month|       avg(Close)|
+-----+-----------------+
|    1|71.44801958415842|
|    2|  71.306804443299|
|    3|71.77794377570092|
|    4|72.97361900952382|
|    5|72.30971688679247|
|    6| 72.4953774245283|
|    7|74.43971943925233|
|    8|73.02981855454546|
|    9|72.18411785294116|
|   10|71.57854545454543|
|   11| 72.1110893069307|
|   12|72.84792478301885|
+-----+-----------------+



In [158]:
mdf2 = monthdf.select(['Month','Close']).groupBy("Month").mean()     ## Get the mean/average of the Monthwise data
mdf2 = mdf2.withColumn('avg(Close)',format_number('avg(Close)',2))   ## Format the number to 2 digits
mdf2.select('Month','Avg(Close)').orderBy('Month').show()            ## Select the columns to display with orderBy

+-----+----------+
|Month|Avg(Close)|
+-----+----------+
|    1|     71.45|
|    2|     71.31|
|    3|     71.78|
|    4|     72.97|
|    5|     72.31|
|    6|     72.50|
|    7|     74.44|
|    8|     73.03|
|    9|     72.18|
|   10|     71.58|
|   11|     72.11|
|   12|     72.85|
+-----+----------+

