# Check Data Before UPSERT Operation

In [2]:
df = spark.sql("SELECT * FROM Atlys_Lakehouse.top10companies WHERE Company = 'ICICIBANK.BSE' ORDER BY Date DESC LIMIT 5")
display(df)

StatementMeta(, 9c65d216-7eb3-4c81-8b70-aa4193d0883b, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 0d3bb1aa-198c-43a3-b5ba-61a70a67df0a)

# Initiated The Spark Session
# Created the Schema
# Fetched daily data for Top 10 Companies
# Converted the data type for the fetched Data
# Loaded the Delta Table
# Made the UPSERT Query 
# Matching the data for each Company in the Delta Table
# If the date from fetched data for the company is not present in Delta table then Insert the data
# Else Do Nothing

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import datetime
import json
import urllib.request
from delta.tables import *


spark = SparkSession.builder.appName("DailyStockDataAppend").getOrCreate()

api_key = 'M38UN0AZ1TT9LTLT'

tickers = ['RELIANCE.BSE', 'TCS.BSE', 'HDFCBANK.BSE', 'INFY.BSE', 'ICICIBANK.BSE', 'HINDUNILVR.BSE', 'KOTAKBANK.BSE', 'ITC.BSE', 'BAJFINANCE.BSE', 'BHARTIARTL.BSE']

schema = StructType([
    StructField("Date", DateType(), True),
    StructField("Open", FloatType(), True),
    StructField("High", FloatType(), True),
    StructField("Low", FloatType(), True),
    StructField("Close", FloatType(), True),
    StructField("Volume", IntegerType(), True),
    StructField("Company", StringType(), True)
])

yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
yesterday_str = yesterday.strftime('%Y-%m-%d')

def fetch_data(ticker):
    url = f'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={ticker}&outputsize=compact&apikey={api_key}'
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read().decode())
    
    if 'Time Series (Daily)' not in data:
        print(f"No data found for {ticker}")
        return None

    time_series = data['Time Series (Daily)']
    
    if yesterday_str in time_series:
        metrics = time_series[yesterday_str]
        return [(yesterday_str,  (metrics['1. open']), (metrics['2. high']), (metrics['3. low']), (metrics['4. close']), (metrics['5. volume']), ticker)]
    else:
        print(f"No data for {ticker} on {yesterday_str}")
        return None

for ticker in tickers:
    
    print(f"Fetching data for {ticker} for {yesterday_str}")
    data = fetch_data(ticker)
    print(data)
    data_converted = [
        (
            datetime.datetime.strptime(row[0], '%Y-%m-%d').date(), 
            float(row[1]), 
            float(row[2]), 
            float(row[3]), 
            float(row[4]), 
            int(row[5]),
            row[6]
        ) 
        for row in data
    ]
    df = spark.createDataFrame(data_converted, schema)
    df.show()
    df.printSchema()
    deltaTable = DeltaTable.forPath(spark, 'Tables/top10companies')
    dfUpdates = df
    deltaTable.alias('top10') \
        .merge(
            dfUpdates.alias('updates'),
            'top10.Company = updates.Company and top10.Date = updates.Date'
        ) \
        .whenMatchedUpdate(set =
            {
                
            }
        ) \
        .whenNotMatchedInsert(values =
            {
            "Date": "updates.Date",
            "Open": "updates.Open",
            "High": "updates.High",
            "Low": "updates.Low",
            "Close": "updates.Close",
            "Volume": "updates.Volume",
            "Company": "updates.Company"
            }
        ) \
        .execute()


StatementMeta(, 9c65d216-7eb3-4c81-8b70-aa4193d0883b, 6, Finished, Available, Finished)

Fetching data for RELIANCE.BSE for 2024-07-04
[('2024-07-04', '3115.8000', '3134.2000', '3102.3500', '3106.4000', '132470', 'RELIANCE.BSE')]
+----------+------+------+-------+------+------+------------+
|      Date|  Open|  High|    Low| Close|Volume|     Company|
+----------+------+------+-------+------+------+------------+
|2024-07-04|3115.8|3134.2|3102.35|3106.4|132470|RELIANCE.BSE|
+----------+------+------+-------+------+------+------------+

root
 |-- Date: date (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Company: string (nullable = true)

Fetching data for TCS.BSE for 2024-07-04
[('2024-07-04', '4004.9500', '4047.7500', '3979.3501', '4021.2500', '56861', 'TCS.BSE')]
+----------+-------+-------+-------+-------+------+-------+
|      Date|   Open|   High|    Low|  Close|Volume|Company|
+----------+-------+-------+-------+----

TypeError: 'NoneType' object is not iterable

# Checking if UPSERT Method Worked

In [6]:
df = spark.sql("SELECT * FROM Atlys_Lakehouse.top10companies WHERE Company = 'RELIANCE.BSE' ORDER BY Date DESC LIMIT 5")
display(df)

StatementMeta(, 9c65d216-7eb3-4c81-8b70-aa4193d0883b, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, cc3fac91-06c6-49d9-b18c-efd7912b8234)

# WORKED !!!

In [2]:
df = spark.sql("SELECT Company,COUNT(*) FROM Atlys_Lakehouse.top10companies GROUP BY Company")
display(df)

StatementMeta(, a8a307a7-2162-4a20-8e55-7e035a142442, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, eb94129a-85b4-43a4-aaa3-43723625f730)