In [1]:
import findspark 
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Walmart Analysis1").getOrCreate()

In [9]:
sc = spark.sparkContext

In [20]:
stock_rdd = sc.textFile("walmart_stock.csv")

In [21]:
header = stock_rdd.first()

In [23]:
stocks = stock_rdd.filter(lambda line: line!=header)

In [26]:
stocks.take(5)

['2012-01-03,59.970001,61.060001,59.869999,60.330002,12668800,52.619234999999996',
 '2012-01-04,60.209998999999996,60.349998,59.470001,59.709998999999996,9593300,52.078475',
 '2012-01-05,59.349998,59.619999,58.369999,59.419998,12768200,51.825539',
 '2012-01-06,59.419998,59.450001,58.869999,59.0,8069400,51.45922',
 '2012-01-09,59.029999,59.549999,58.919998,59.18,6679300,51.616215000000004']

### Q1. Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day.

In [27]:
stocks_split = stocks.map(lambda x : x.split(','))

In [28]:
stocks_split.take(3)

[['2012-01-03',
  '59.970001',
  '61.060001',
  '59.869999',
  '60.330002',
  '12668800',
  '52.619234999999996'],
 ['2012-01-04',
  '60.209998999999996',
  '60.349998',
  '59.470001',
  '59.709998999999996',
  '9593300',
  '52.078475'],
 ['2012-01-05',
  '59.349998',
  '59.619999',
  '58.369999',
  '59.419998',
  '12768200',
  '51.825539']]

In [71]:
hv_ratio = stocks_split.map(lambda x : (x[0],round(int(x[5])/float(x[2]),2)))

In [73]:
hv_ratio.take(5)

[('2012-01-03', 207481.16),
 ('2012-01-04', 158961.07),
 ('2012-01-05', 214159.68),
 ('2012-01-06', 135734.23),
 ('2012-01-09', 112162.89)]

### Which 10 day had the Peak High in Price?

In [78]:
stock_high = stocks_split.map(lambda x:(x[0], round(float(x[2]),2)))

In [79]:
stock_high.take(5)

[('2012-01-03', 61.06),
 ('2012-01-04', 60.35),
 ('2012-01-05', 59.62),
 ('2012-01-06', 59.45),
 ('2012-01-09', 59.55)]

In [82]:
top_10_day = stock_high.sortBy((lambda x:x[1]),False)

In [138]:
top_10 = sc.parallelize(top_10_day.take(10))

[('2015-01-13', 90.97),
 ('2015-01-08', 90.67),
 ('2015-01-09', 90.39),
 ('2015-01-12', 90.31),
 ('2015-01-23', 89.26),
 ('2015-01-26', 89.16),
 ('2015-01-07', 88.68),
 ('2015-01-14', 88.52),
 ('2015-01-27', 88.46),
 ('2015-01-22', 88.4)]

### Q3.What is the max and min of the Volume column?

In [143]:
volume_col = stocks_split.map(lambda x: int(x[5]))

In [132]:
volume = (volume_col.max(), volume_col.min())

In [135]:
 volume

(80898100, 2094900)

###  Q4. How many days was the Close lower than 60 dollars?


In [162]:
day_count = stocks_split.map(lambda x: (x[0],float(x[4])) if (float(x[4]) < 60) else (x[0],0) )

In [163]:
day_count.take(5)

[('2012-01-03', 0),
 ('2012-01-04', 59.709998999999996),
 ('2012-01-05', 59.419998),
 ('2012-01-06', 59.0),
 ('2012-01-09', 59.18)]

In [177]:
day_count.filter(lambda x: x[1]).count()

81

### Q5. What percentage of the time was the High greater than 80 dollars In other words, (Number of Days High>80)/(Total Days in the dataset)


In [174]:
high_than_80 = stocks_split.map(lambda x: [x[0], int(float(x[2])>80)])

In [186]:
dataset_count = high_than_80.count()

In [185]:
high_count = high_than_80.filter(lambda x: x[1]).count()

In [187]:
percent_times = high_count/dataset_count *100

In [188]:
percent_times

9.141494435612083

### Q6. What is the Pearson correlation between High and Volume?

In [183]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import corr


stock_data = spark.read.csv("walmart_stock.csv", header=True, inferSchema=True)

# Calculate the Pearson correlation between 'High' and 'Volume'
correlation = stock_data.select(corr("High", "Volume")).collect()[0][0]

# Print the correlati1on coefficient
print("Pearson correlation coefficient: ", correlation)

Pearson correlation coefficient:  -0.3384326061737161


### Q7. What is the max High per year?

In [202]:
year_count = stocks_split.map(lambda x: (x[0][0:4],round(float(x[4]),2)))

In [203]:
year_count.take(5)

[('2012', 60.33),
 ('2012', 59.71),
 ('2012', 59.42),
 ('2012', 59.0),
 ('2012', 59.18)]

In [210]:
max_high = year_count.reduceByKey(max)

In [211]:
max_high.take(5)

[('2012', 77.15),
 ('2013', 81.21),
 ('2014', 87.54),
 ('2015', 90.47),
 ('2016', 74.3)]

### Q8. What is the average Close for each Calendar Month?

In [238]:
avg_d = stocks_split.map(lambda x: (int(x[0].split('-')[1]),round(float(x[2]),2))) 

In [239]:
avg_d.take(5)

[(1, 61.06), (1, 60.35), (1, 59.62), (1, 59.45), (1, 59.55)]

In [248]:
avg_group = avg_d.groupByKey()

In [249]:
avg_monthWise =avg_group.mapValues(lambda x: sum(x)/len(x))