In [13]:
# Import the required library to process data:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib as plt
# Import the required library to process dates:
from datetime import datetime, timedelta, date

In [14]:
### Question 1: Create a function to gather 2 years' worth of stock data for AAPL, TSLA, AMZN:
def fetch_data(tickers=["AAPL", "TSLA", "AMZN"], timeFrame_in_years = 2) -> pd.DataFrame:
    """Download daily price history for each ticker and stack it into one frame.

    Args:
        tickers: Iterable of ticker symbols. The default list is never mutated.
        timeFrame_in_years: Length of the look-back window, ending today.

    Returns:
        A DataFrame with the columns Date, Open, High, Low, Close, Adj Close
        and Ticker, holding the rows of every ticker that could be downloaded.
        When nothing could be downloaded (or ``tickers`` is empty) an empty
        DataFrame with those same columns is returned instead of raising.
    """
    # Features kept from every download, in output order:
    price_columns = ["Date", "Open", "High", "Low", "Close", "Adj Close"]

    # 1) Define the time interval to fetch data:
    endDate = date.today()
    startDate = (endDate - pd.DateOffset(years=timeFrame_in_years)).date()
    # 2) Define a list to store every DataFrame generated through the iteration:
    data = []

    # 3) Iterate through the tickers list:
    for ticker in tickers:
        try:
            # 3.1) Attempt to gather the data for the requested window.
            # auto_adjust=False is passed explicitly because the "Adj Close"
            # column is only present for unadjusted downloads.
            # NOTE(review): recent yfinance releases reportedly changed the
            # default of auto_adjust to True - confirm against the installed version.
            df = yf.download(tickers=ticker, start=startDate, end=endDate,
                             progress=False, auto_adjust=False)
            # 3.2) If the columns are a MultiIndex, keep only the first level:
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.get_level_values(0)
            # 3.3) Move the date index into a column, keep the wanted features
            # and tag the rows with their ticker. assign() builds a new frame,
            # which avoids writing into a slice (SettingWithCopyWarning).
            df = df.reset_index()[price_columns].assign(Ticker=ticker)
            # 3.4) Append the DataFrame to the data list:
            data.append(df)
        except Exception as e:
            # Best effort: report the failing ticker and carry on with the rest.
            print(f"Couldn't download data for {ticker}: {str(e)}")
            continue

    # 4) pd.concat raises ValueError on an empty list, so handle the
    # "nothing downloaded" case explicitly:
    if not data:
        return pd.DataFrame(columns=price_columns + ["Ticker"])

    # 5) Concatenate all DataFrames in data into a single DataFrame:
    return pd.concat(data, ignore_index=True)

# Fetch the default tickers (AAPL, TSLA, AMZN) over the default 2-year window
# and print the combined DataFrame:
raw_df = fetch_data()
print(raw_df)

Price       Date        Open        High         Low       Close   Adj Close  \
0     2022-12-09  142.339996  145.570007  140.899994  142.160004  140.680618   
1     2022-12-12  142.699997  144.500000  141.059998  144.490005  142.986343   
2     2022-12-13  149.500000  149.970001  144.240005  145.470001  143.956161   
3     2022-12-14  145.350006  146.660004  141.160004  143.210007  141.719696   
4     2022-12-15  141.110001  141.800003  136.029999  136.500000  135.079498   
...          ...         ...         ...         ...         ...         ...   
1498  2024-12-02  209.960007  212.990005  209.509995  210.710007  210.710007   
1499  2024-12-03  210.309998  214.020004  209.649994  213.440002  213.440002   
1500  2024-12-04  215.960007  220.000000  215.750000  218.160004  218.160004   
1501  2024-12-05  218.029999  222.149994  217.300003  220.550003  220.550003   
1502  2024-12-06  220.750000  227.149994  220.600006  227.029999  227.029999   

Price Ticker  
0       AAPL  
1       A

In [15]:
### Question 2: Write a script that formats the previous DataFrame as a string, one row per line, in the following format:
### 1 AAPL, 2023-01-03 00:00:00, 130.27999877929688, 130.89999389648438, 124.16999816894531, 125.06999969482422, 123.76846313476562
from io import StringIO
import heapq

def process_data_to_fileFormat(df: pd.DataFrame) -> str:
    """Render the price rows as numbered text lines in chronological order.

    Every row becomes the tuple (Date, Ticker, Open, High, Low, Close,
    Adj Close); the tuples are ordered ascending (so by date first, then by
    ticker) and written one per line, prefixed with a 1-based counter.

    Args:
        df: Frame providing the columns Date, Ticker, Open, High, Low, Close
            and Adj Close.

    Returns:
        The formatted text, each line terminated by a newline; an empty
        string when ``df`` has no rows.
    """
    # 1) Collect one tuple per row and order them; plain tuple comparison
    #    puts the earliest date (then the smallest ticker) first:
    ordered = sorted(
        (
            row["Date"],
            row["Ticker"],
            row["Open"],
            row["High"],
            row["Low"],
            row["Close"],
            row["Adj Close"],
        )
        for _, row in df.iterrows()
    )

    # 2) Format every tuple with its running index, starting at 1:
    lines = [
        f"{index} {ticker}, {date}, {open_price}, {high}, {low}, {close}, {adj_close}"
        for index, (date, ticker, open_price, high, low, close, adj_close)
        in enumerate(ordered, start=1)
    ]

    # 3) Terminate each line with a newline and glue everything together:
    return "".join(line + "\n" for line in lines)

# Convert the downloaded frame into its text representation and print it:
file_format_data = process_data_to_fileFormat(raw_df)
print(file_format_data)

1 AAPL, 2022-12-09 00:00:00, 142.33999633789062, 145.57000732421875, 140.89999389648438, 142.16000366210938, 140.6806182861328
2 AMZN, 2022-12-09 00:00:00, 88.9000015258789, 90.30000305175781, 88.62999725341797, 89.08999633789062, 89.08999633789062
3 TSLA, 2022-12-09 00:00:00, 173.83999633789062, 182.5, 173.36000061035156, 179.0500030517578, 179.0500030517578
4 AAPL, 2022-12-12 00:00:00, 142.6999969482422, 144.5, 141.05999755859375, 144.49000549316406, 142.98634338378906
5 AMZN, 2022-12-12 00:00:00, 89.20999908447266, 90.58000183105469, 87.87000274658203, 90.55000305175781, 90.55000305175781
6 TSLA, 2022-12-12 00:00:00, 176.10000610351562, 177.3699951171875, 167.52000427246094, 167.82000732421875, 167.82000732421875
7 AAPL, 2022-12-13 00:00:00, 149.5, 149.97000122070312, 144.24000549316406, 145.47000122070312, 143.95616149902344
8 AMZN, 2022-12-13 00:00:00, 95.2300033569336, 96.25, 90.5199966430664, 92.48999786376953, 92.48999786376953
9 TSLA, 2022-12-13 00:00:00, 174.8699951171875, 17

In [24]:
### Question 3: Create a function that will convert the file format data back into a DataFrame:
import re

def fileData_to_dataFrame(str_input: str) -> pd.DataFrame:
    """Parse the numbered text format back into a DataFrame.

    Each line is expected to look like
    ``"1 AAPL, 2023-01-03 00:00:00, 130.27, 130.90, 124.17, 125.07, 123.77"``:
    an index and ticker, a date, then Open, High, Low, Close and Adj Close.

    Blank lines are ignored. A line that does not have 7 comma-separated
    fields, lacks the ticker after the index, or holds a non-numeric price is
    reported on standard output and skipped; the remaining lines are still
    parsed.

    Args:
        str_input: The text to parse, one record per line.

    Returns:
        A DataFrame with the columns Ticker, Date, Open, High, Low, Close and
        Adj Close, sorted by ticker then date with a fresh 0-based index.
        It is empty (but keeps those columns) when no line could be parsed.

    Raises:
        ValueError: Propagated from ``pd.to_datetime`` when a date field
            cannot be parsed.
    """
    column_names = ["Ticker", "Date", "Open", "High", "Low", "Close", "Adj Close"]

    # 1) Split the input into lines, dropping leading/trailing white space:
    lines = str_input.strip().split('\n')

    # 2) Collect one [ticker, date, prices...] list per valid line:
    data = []
    for line in lines:
        # 2.1) An empty input or a blank separator line is not an error:
        if not line.strip():
            continue

        # 2.2) Split on commas, swallowing the white space around them:
        # Ex input: line = "1 AAPL, 2023-01-03 00:00:00, 130.27, 130.90, 124.17, 125.07, 123.77"
        # Ex output: ["1 AAPL", "2023-01-03 00:00:00", "130.27", "130.90", "124.17", "125.07", "123.77"]
        parts = re.split(r'\s*,\s*', line.strip())

        try:
            # 2.3) Ensure there are 7 columns:
            if len(parts) != 7:
                raise ValueError("expected 7 comma-separated fields")
            # 2.4) Extract each feature; the ticker follows the row index:
            ticker = parts[0].split()[1]
            row_date = parts[1].strip()
            prices = [float(value) for value in parts[2:]]
        except (ValueError, IndexError):
            # Skip only this row instead of discarding everything parsed so far:
            print(f"Skipping malformed row {line}")
            continue

        data.append([ticker, row_date, *prices])

    # 3) Build the DataFrame; at this point it is not ordered by ticker and date:
    unorganized_df = pd.DataFrame(data, columns=column_names)
    # 3.1) Convert the date column back into datetime values:
    unorganized_df["Date"] = pd.to_datetime(unorganized_df["Date"])
    # 3.2) Sort by ticker and date in ascending order:
    organized_df = unorganized_df.sort_values(by=["Ticker", "Date"]).reset_index(drop=True)
    return organized_df

# Parse the text representation back into a DataFrame and print it:
df = fileData_to_dataFrame(file_format_data)
print(df)
    

     Ticker       Date        Open        High         Low       Close  \
0      AAPL 2022-12-09  142.339996  145.570007  140.899994  142.160004   
1      AAPL 2022-12-12  142.699997  144.500000  141.059998  144.490005   
2      AAPL 2022-12-13  149.500000  149.970001  144.240005  145.470001   
3      AAPL 2022-12-14  145.350006  146.660004  141.160004  143.210007   
4      AAPL 2022-12-15  141.110001  141.800003  136.029999  136.500000   
...     ...        ...         ...         ...         ...         ...   
1498   TSLA 2024-12-02  352.380005  360.000000  351.149994  357.089996   
1499   TSLA 2024-12-03  351.799988  355.690002  348.200012  351.420013   
1500   TSLA 2024-12-04  353.000000  358.100006  348.600006  357.929993   
1501   TSLA 2024-12-05  359.869995  375.429993  359.500000  369.489990   
1502   TSLA 2024-12-06  377.420013  389.489990  370.799988  389.220001   

       Adj Close  
0     140.680618  
1     142.986343  
2     143.956161  
3     141.719696  
4     135.079498

In [28]:
### Question 4: Compute the monthly return:
def compute_monthly_return(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the compounded monthly return of every ticker.

    Daily returns are the percentage change of "Adj Close" within each
    ticker; the daily returns falling in one calendar month are compounded as
    ``prod(1 + r) - 1``. The first row of each ticker has no previous price,
    so it contributes no daily return.

    The input frame is left untouched, so the function can be called
    repeatedly on the same data.

    Args:
        df: Frame with the columns "Ticker", "Date" (datetime values) and
            "Adj Close".

    Returns:
        A DataFrame with the columns "Ticker", "Date" (month-end label) and
        "Monthly Returns", one row per ticker and month.
    """
    # 1) Work on a private, chronologically ordered copy: pct_change only
    #    makes sense on date-ordered prices, and the caller's frame must not
    #    be modified.
    prices = df[["Ticker", "Date", "Adj Close"]].sort_values(["Ticker", "Date"])
    # 2) Compute daily returns for each stock:
    prices["Daily Returns"] = prices.groupby("Ticker")["Adj Close"].pct_change()
    # 3) Remove rows without a daily return and index by date for resampling:
    daily = prices.dropna(axis=0, subset=["Daily Returns"]).set_index("Date")
    # 4) Compound the daily returns per ticker and month. A MonthEnd offset
    #    object is used instead of the 'M' string alias, which newer pandas
    #    versions flag with a FutureWarning.
    monthly_returns = (
        daily.groupby("Ticker")
        .resample(pd.offsets.MonthEnd())["Daily Returns"]
        .apply(lambda x: (1 + x).prod() - 1)
    )
    # 5) Reset the index:
    monthly_returns = monthly_returns.reset_index()
    # 6) Rename the columns for clarity:
    monthly_returns.columns = ["Ticker", "Date", "Monthly Returns"]
    return monthly_returns


# Compute the per-ticker monthly returns from the parsed frame and print them:
df_monthly_returns = compute_monthly_return(df)
print(df_monthly_returns)

   Ticker       Date  Monthly Returns
0    AAPL 2022-12-31        -0.092731
1    AAPL 2023-01-31         0.110521
2    AAPL 2023-02-28         0.023183
3    AAPL 2023-03-31         0.118649
4    AAPL 2023-04-30         0.028987
..    ...        ...              ...
70   TSLA 2024-08-31        -0.077390
71   TSLA 2024-09-30         0.221942
72   TSLA 2024-10-31        -0.045025
73   TSLA 2024-11-30         0.381469
74   TSLA 2024-12-31         0.127651

[75 rows x 3 columns]


  monthly_returns = df.groupby("Ticker").resample('M')["Daily Returns"].apply(lambda x: (1 + x).prod() - 1)
