In [3]:
# Import the required libraries for data processing:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib as plt
# Import the required libraries to process date:
from datetime import datetime, timedelta, date

In [5]:
### Question 1: Get 2 years' worth of data for Apple (AAPL):
def fetch_raw_data(ticker="AAPL", timespan_in_y=2) -> pd.DataFrame:
    """Download daily price data for ``ticker`` over whole calendar years.

    The requested window ends on 1 January of the current year and starts
    ``timespan_in_y`` years before that date.

    Args:
        ticker: Symbol passed to ``yf.download``.
        timespan_in_y: Number of years to go back from 1 January of the
            current year.

    Returns:
        A new DataFrame with the columns ``Date``, ``Open``, ``High``, ``Low``,
        ``Close``, ``Adj Close`` and ``Ticker``. An empty DataFrame is returned
        (after printing a message) when the download raises, yields no rows,
        or lacks one of the expected columns.
    """
    wanted_columns = ["Date", "Open", "High", "Low", "Close", "Adj Close"]

    # 1) Define the time interval, e.g. 2022-01-01 -> 2024-01-01:
    endDate = date.today().replace(month=1, day=1)
    startDate = (endDate - pd.DateOffset(years=timespan_in_y)).date()

    try:  # 2) Attempt to fetch data from the yfinance API:
        # Ask for unadjusted prices explicitly so that "Adj Close" is part of
        # the result (newer yfinance releases adjust by default and omit it).
        df = yf.download(
            tickers=ticker,
            start=startDate,
            end=endDate,
            progress=False,
            auto_adjust=False,
        )
    except Exception as e:
        print(f"Error couldn't fetch data for {ticker}: {str(e)}")
        return pd.DataFrame()

    # A failed lookup can come back as an empty result instead of an exception.
    if df is None or df.empty:
        print(f"No data returned for {ticker}")
        return pd.DataFrame()

    # 3) If the columns are a MultiIndex, keep only its first level:
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)

    # 4) Turn the index into a regular column (new frame, no in-place edit):
    df = df.reset_index()

    # 5) Only keep the necessary features, failing softly if one is missing:
    missing = [name for name in wanted_columns if name not in df.columns]
    if missing:
        print(f"Data for {ticker} is missing expected columns: {missing}")
        return pd.DataFrame()

    # 6) Add the ticker symbol; assign() builds a new frame, so the column is
    #    never written onto a slice of the downloaded data.
    return df[wanted_columns].assign(Ticker=ticker)


# Fetch the data with the function defaults (ticker "AAPL", 2 years) and show it:
raw_df = fetch_raw_data()
print(raw_df)

Price       Date        Open        High         Low       Close   Adj Close  \
0     2022-01-03  177.830002  182.880005  177.710007  182.009995  179.076599   
1     2022-01-04  182.630005  182.940002  179.119995  179.699997  176.803818   
2     2022-01-05  179.610001  180.169998  174.639999  174.919998  172.100876   
3     2022-01-06  172.699997  175.300003  171.639999  172.000000  169.227966   
4     2022-01-07  172.889999  174.139999  171.029999  172.169998  169.395187   
..           ...         ...         ...         ...         ...         ...   
496   2023-12-22  195.179993  195.410004  192.970001  193.600006  192.656174   
497   2023-12-26  193.610001  193.889999  192.830002  193.050003  192.108856   
498   2023-12-27  192.490005  193.500000  191.089996  193.149994  192.208374   
499   2023-12-28  194.139999  194.660004  193.169998  193.580002  192.636276   
500   2023-12-29  193.899994  194.399994  191.729996  192.529999  191.591370   

Price Ticker  
0       AAPL  
1       A

In [9]:
### Question 2: Write a script that formats the previous DataFrame as a string, one line per row, in the following format:
### 1 AAPL, 2023-01-03 00:00:00, 130.27999877929688, 130.89999389648438, 124.16999816894531, 125.06999969482422, 123.76846313476562
from io import StringIO
import heapq

def convert_data_to_string(df: pd.DataFrame) -> str:
    """Render the price rows of ``df`` as numbered text lines.

    Every row is turned into the tuple
    ``(Date, Ticker, Open, High, Low, Close, Adj Close)``. The tuples are
    emitted in ascending order (so primarily by date), one per line, as
    ``"<n> <ticker>, <date>, <open>, <high>, <low>, <close>, <adj close>"``
    where ``n`` counts from 1. Each line ends with a newline character; an
    input without rows yields an empty string.
    """
    field_order = ["Date", "Ticker", "Open", "High", "Low", "Close", "Adj Close"]

    # Collect one tuple per row and order them ascending (date is the first key).
    records = sorted(
        tuple(row[name] for name in field_order) for _, row in df.iterrows()
    )

    # Number the ordered records from 1 and format each as one text line.
    rendered = []
    for position, record in enumerate(records, start=1):
        day, symbol, opening, highest, lowest, closing, adjusted = record
        rendered.append(
            f"{position} {symbol}, {day}, {opening}, {highest}, {lowest}, {closing}, {adjusted}\n"
        )

    return "".join(rendered)

# Serialize the fetched prices to text and show the result:
file_data = convert_data_to_string(raw_df)
print(file_data)
    
        

1 AAPL, 2022-01-03 00:00:00, 177.8300018310547, 182.8800048828125, 177.7100067138672, 182.00999450683594, 179.07659912109375
2 AAPL, 2022-01-04 00:00:00, 182.6300048828125, 182.94000244140625, 179.1199951171875, 179.6999969482422, 176.80381774902344
3 AAPL, 2022-01-05 00:00:00, 179.61000061035156, 180.1699981689453, 174.63999938964844, 174.9199981689453, 172.1008758544922
4 AAPL, 2022-01-06 00:00:00, 172.6999969482422, 175.3000030517578, 171.63999938964844, 172.0, 169.22796630859375
5 AAPL, 2022-01-07 00:00:00, 172.88999938964844, 174.13999938964844, 171.02999877929688, 172.1699981689453, 169.3951873779297
6 AAPL, 2022-01-10 00:00:00, 169.0800018310547, 172.5, 168.1699981689453, 172.19000244140625, 169.4148712158203
7 AAPL, 2022-01-11 00:00:00, 172.32000732421875, 175.17999267578125, 170.82000732421875, 175.0800018310547, 172.25830078125
8 AAPL, 2022-01-12 00:00:00, 176.1199951171875, 177.17999267578125, 174.82000732421875, 175.52999877929688, 172.70103454589844
9 AAPL, 2022-01-13 00:0

In [11]:
### Question 3: Convert the string data back into a pd.DataFrame:
import re

def convert_file_data_to_df(str_input: str) -> pd.DataFrame:
    """Parse numbered price lines back into a DataFrame.

    Each line is expected to look like
    ``"1 AAPL, 2023-01-03 00:00:00, 130.27, 130.90, 124.17, 125.07, 123.77"``:
    a row number and ticker, then the date, then the five prices, separated by
    commas. Blank lines are ignored. A line that does not have 7
    comma-separated fields, has no ticker after the row number, or has a price
    that is not a number is reported with a printed message and skipped; the
    remaining lines are still parsed.

    Args:
        str_input: The text to parse, one row per line.

    Returns:
        A DataFrame with the columns ``Ticker``, ``Date``, ``Open``, ``High``,
        ``Low``, ``Close`` and ``Adj Close``. ``Date`` is converted with
        ``pd.to_datetime`` and the price columns are rounded to 3 decimals.
        When no line could be parsed the frame has these columns and no rows.
    """
    columns = ["Ticker", "Date", "Open", "High", "Low", "Close", "Adj Close"]
    price_columns = columns[2:]

    # 1) Parse line by line, collecting one list of values per valid row:
    data = []
    for line in str_input.strip().split('\n'):
        line = line.strip()
        if not line:
            continue  # blank lines carry no data and are not worth a message

        # Split on commas, swallowing the whitespace around them.
        # Ex input:  "1 AAPL, 2023-01-03 00:00:00, 130.27, 130.90, 124.17, 125.07, 123.77"
        # Ex output: ["1 AAPL", "2023-01-03 00:00:00", "130.27", ..., "123.77"]
        parts = re.split(r'\s*,\s*', line)
        label = parts[0].split()  # "1 AAPL" -> ["1", "AAPL"]

        if len(parts) != 7 or len(label) < 2:
            print(f"Skipping malformed row {line}")
            continue

        try:
            prices = [float(value) for value in parts[2:]]
        except ValueError:
            print(f"Skipping malformed row {line}")
            continue

        data.append([label[1], parts[1], *prices])

    # 2) Nothing usable: return an empty frame that still has the columns.
    if not data:
        return pd.DataFrame(columns=columns)

    # 3) Build the frame, convert the dates and round the prices:
    df = pd.DataFrame(data, columns=columns)
    df["Date"] = pd.to_datetime(df["Date"])
    df[price_columns] = df[price_columns].round(3)
    return df

# Parse the text built for Question 2 back into a DataFrame and show it:
df = convert_file_data_to_df(file_data)
print(df)

    Ticker       Date    Open    High     Low   Close  Adj Close
0     AAPL 2022-01-03  177.83  182.88  177.71  182.01    179.077
1     AAPL 2022-01-04  182.63  182.94  179.12  179.70    176.804
2     AAPL 2022-01-05  179.61  180.17  174.64  174.92    172.101
3     AAPL 2022-01-06  172.70  175.30  171.64  172.00    169.228
4     AAPL 2022-01-07  172.89  174.14  171.03  172.17    169.395
..     ...        ...     ...     ...     ...     ...        ...
496   AAPL 2023-12-22  195.18  195.41  192.97  193.60    192.656
497   AAPL 2023-12-26  193.61  193.89  192.83  193.05    192.109
498   AAPL 2023-12-27  192.49  193.50  191.09  193.15    192.208
499   AAPL 2023-12-28  194.14  194.66  193.17  193.58    192.636
500   AAPL 2023-12-29  193.90  194.40  191.73  192.53    191.591

[501 rows x 7 columns]


In [18]:
### Question 4: Compute the monthly return:
def compute_monthly_return(df: pd.DataFrame) -> pd.DataFrame:
    """Compound daily ``Adj Close`` changes into one return per calendar month.

    The input frame is left untouched, so calling the function repeatedly on
    the same frame gives the same result every time.

    Args:
        df: Frame with a datetime ``Date`` column, an ``Adj Close`` column and
            a ``Ticker`` column. Rows may be in any order; they are sorted by
            ``Date`` before the daily changes are computed.

    Returns:
        A DataFrame with the columns ``Date`` (month-end label),
        ``Monthly Returns`` and ``Ticker`` (taken from the earliest row).
        The frame has no rows when fewer than two prices are available.
    """
    result_columns = ["Date", "Monthly Returns", "Ticker"]
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    # 1) Order chronologically; sort_values returns a new frame, so the
    #    caller's data is never modified.
    prices = df.sort_values("Date", kind="stable")

    # 2) Daily returns, dropping the first row (it has no previous price):
    daily = pd.DataFrame(
        {"Date": prices["Date"], "Daily Returns": prices["Adj Close"].pct_change()}
    ).dropna(subset=["Daily Returns"])
    if daily.empty:
        return pd.DataFrame(columns=result_columns)

    # 3) Group by month end and compound the daily returns. An offset object
    #    is used instead of the 'M' string alias, which pandas has deprecated.
    monthly_returns = (
        daily.resample(pd.offsets.MonthEnd(), on="Date")["Daily Returns"]
        .apply(lambda x: (1 + x).prod() - 1)
        .reset_index()
    )

    # 4) Add the ticker column and set the final column names:
    monthly_returns["Ticker"] = prices["Ticker"].iloc[0]
    monthly_returns.columns = result_columns
    return monthly_returns

# Compute the monthly returns from the parsed prices and show them:
monthly_return_df = compute_monthly_return(df)
print(monthly_return_df)


         Date  Monthly Returns Ticker
0  2022-01-31        -0.001713   AAPL
1  2022-02-28        -0.054064   AAPL
2  2022-03-31         0.057474   AAPL
3  2022-04-30        -0.097131   AAPL
4  2022-05-31        -0.054499   AAPL
5  2022-06-30        -0.081427   AAPL
6  2022-07-31         0.188633   AAPL
7  2022-08-31        -0.031211   AAPL
8  2022-09-30        -0.120978   AAPL
9  2022-10-31         0.109554   AAPL
10 2022-11-30        -0.033025   AAPL
11 2022-12-31        -0.122275   AAPL
12 2023-01-31         0.110517   AAPL
13 2023-02-28         0.023188   AAPL
14 2023-03-31         0.118646   AAPL
15 2023-04-30         0.028990   AAPL
16 2023-05-31         0.046054   AAPL
17 2023-06-30         0.094329   AAPL
18 2023-07-31         0.012789   AAPL
19 2023-08-31        -0.042386   AAPL
20 2023-09-30        -0.088679   AAPL
21 2023-10-31        -0.002568   AAPL
22 2023-11-30         0.113780   AAPL
23 2023-12-31         0.013580   AAPL


  monthly_returns = df.resample('M', on='Date')["Daily Returns"].apply(lambda x: (1+x).prod()-1)


In [19]:
### Question 5) Compute yearly returns:
def compute_yearly_returns(df: pd.DataFrame) -> pd.DataFrame:
    """Compound monthly returns into one return per calendar year.

    Args:
        df: Frame with a datetime ``Date`` column, a ``Monthly Returns``
            column and a ``Ticker`` column.

    Returns:
        A DataFrame with the columns ``Date`` (year-end label),
        ``Yearly Returns`` and ``Ticker`` (taken from the first input row).
        An input without rows yields a frame with these columns and no rows.
    """
    result_columns = ["Date", "Yearly Returns", "Ticker"]
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    # 1) Group by year end and compound the monthly returns. An offset object
    #    is used instead of the 'A' string alias, which pandas has deprecated.
    yearly_returns = (
        df.resample(pd.offsets.YearEnd(), on="Date")["Monthly Returns"]
        .apply(lambda x: (1 + x).prod() - 1)
        .reset_index()
    )

    # 2) Add the ticker column and set the final column names:
    yearly_returns["Ticker"] = df["Ticker"].iloc[0]
    yearly_returns.columns = result_columns
    return yearly_returns

# Compute the yearly returns from the monthly returns and show them:
df_yearly_returns = compute_yearly_returns(monthly_return_df)
print(df_yearly_returns)

AttributeError: 'DataFrame' object has no attribute 'reshape'