In [1]:
# Import the required libraries to process data:
import pandas as pd
import numpy as np
import matplotlib as mp
import yfinance as yf
# Import the required libraries to operate on time series:
from datetime import datetime, timedelta, date

In [12]:
### Question 1:
### Write a script to download 2 year worth of data from AAPL:
def get_raw_data(ticker="AAPL", timeSpan_in_y=1) -> pd.DataFrame:
    # Define the timeframe:
    endDate = date.today().replace(month=1, day=1)  # Format: "2024-01-01"
    startDate = (endDate - pd.DateOffset(years=timeSpan_in_y)).date()  # Format: "2023-01-01"

    try:
        # Attempt to download the data from yfinance API:
        raw_data = yf.download(ticker, start=startDate, end=endDate)
    except Exception as e:
        print(f"Error fetching data for {ticker}: {str(e)}")
        return pd.DataFrame()

    # Add the stock ticker as new feature:
    raw_data["Ticker"] = ticker

    # Flatten the MultiIndex:
    if isinstance(raw_data.columns, pd.MultiIndex):
        raw_data.columns = raw_data.columns.get_level_values(0)  # Only use the first level of the MultiIndex

    # Reset the index so that 'Date' is a column:
    raw_data = raw_data.reset_index()

    # Ensure the required columns are selected and in the correct order:
    required_columns = ['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close']
    
    # Check if required columns exist in the DataFrame:
    available_columns = [col for col in required_columns if col in raw_data.columns]

    if not available_columns:
        raise ValueError(f"Required columns {required_columns} not found in data.")

    # Return the DataFrame with the selected columns:
    raw_data = raw_data[available_columns]
    return raw_data


raw_data = get_raw_data()
print(raw_data)

[*********************100%***********************]  1 of 1 completed

Price       Date Ticker        Open        High         Low       Close  \
0     2023-01-03   AAPL  130.279999  130.899994  124.169998  125.070000   
1     2023-01-04   AAPL  126.889999  128.660004  125.080002  126.360001   
2     2023-01-05   AAPL  127.129997  127.769997  124.760002  125.019997   
3     2023-01-06   AAPL  126.010002  130.289993  124.889999  129.619995   
4     2023-01-09   AAPL  130.470001  133.410004  129.889999  130.149994   
..           ...    ...         ...         ...         ...         ...   
245   2023-12-22   AAPL  195.179993  195.410004  192.970001  193.600006   
246   2023-12-26   AAPL  193.610001  193.889999  192.830002  193.050003   
247   2023-12-27   AAPL  192.490005  193.500000  191.089996  193.149994   
248   2023-12-28   AAPL  194.139999  194.660004  193.169998  193.580002   
249   2023-12-29   AAPL  193.899994  194.399994  191.729996  192.529999   

Price   Adj Close  
0      123.768463  
1      125.045044  
2      123.718971  
3      128.271118  




In [13]:
### Question 2: 
### Write a script to format display the following previous DataFrame as a string in the following format:
### Example output:
from io import StringIO
import heapq

def process_data_to_file(df: pd.DataFrame):
    # 1) Instantiate a min heap in order to process Stocks according to ascending date:
    # Purpose: Ensure stocks are processed in chronological order:
    min_heap = []

    # 2) Iterate through each rows yielded from the iterrows() generator:
    # df.itterows() is generator method that itterates through each rows of the DataFrame:
    # For each row, it returns a tuple: {index of the row, Series object representing the row's data}
    for _, row in df.iterrows():
        # 2.1) Push the row as tupple in the min_heap => will automatically sort the data according to ascending date:
        heapq.heappush(min_heap, (row["Date"], row["Ticker"], row["Open"], row["High"], row["Low"], row["Close"], row["Adj Close"]))

    index = 1

    # 3) Instantiate a StringIO obj to redirect S/O to the object like in a file:
    result = StringIO()

    # 4) Iterrate over the min_heap:
    while min_heap:
        # 4.1) Extract the data from the min_heap according to ascending date (lowest -> highest):
        # heappop(): Removes the smallest value from the min heap and returns it:
        date, ticker, open_price, high, low, close, adj_close = heapq.heappop(min_heap)
        # 4.2) Redirect S/O to the StringIO object:
        print(f"{index} {ticker}, {date}, {open_price}, {high}, {low}, {close}, {adj_close}", file=result)
        # 4.3) Increment the index:
        index += 1

    # 5) Move the pointer back to the start of the StringIO obj:
    result.seek(0)
    # 6) Return the content of the StringIO obj as a string:
    return result.getvalue()


processed_df = process_data_to_file(raw_data)
print(processed_df)

1 AAPL, 2023-01-03 00:00:00, 130.27999877929688, 130.89999389648438, 124.16999816894531, 125.06999969482422, 123.76846313476562
2 AAPL, 2023-01-04 00:00:00, 126.88999938964844, 128.66000366210938, 125.08000183105469, 126.36000061035156, 125.0450439453125
3 AAPL, 2023-01-05 00:00:00, 127.12999725341797, 127.7699966430664, 124.76000213623047, 125.0199966430664, 123.7189712524414
4 AAPL, 2023-01-06 00:00:00, 126.01000213623047, 130.2899932861328, 124.88999938964844, 129.6199951171875, 128.2711181640625
5 AAPL, 2023-01-09 00:00:00, 130.47000122070312, 133.41000366210938, 129.88999938964844, 130.14999389648438, 128.7956085205078
6 AAPL, 2023-01-10 00:00:00, 130.25999450683594, 131.25999450683594, 128.1199951171875, 130.72999572753906, 129.3695526123047
7 AAPL, 2023-01-11 00:00:00, 131.25, 133.50999450683594, 130.4600067138672, 133.49000549316406, 132.10081481933594
8 AAPL, 2023-01-12 00:00:00, 133.8800048828125, 134.25999450683594, 131.44000244140625, 133.41000366210938, 132.02166748046875


In [16]:
### Question 3:
### Convert a string generated into a DataFrame:
import re

def convert_str_to_df(str_input: str) -> pd.DataFrame:
    # 1) Purpose: Split the string into rows ignoring white spaces:
    # - strip(): Remove leading and trailing newlines
    # - .split('\n'): Splits the string into a list of lines wherever a newline character (\n) is found:
    # - lines: Now contains a list of strings, each corresponding to a line of data.
    lines = str_input.strip().split('\n')

    # 2) Instantiate a list that will host many lists of data. each innter list represents a row:
    # data: list that will hold the parsed rows of data as individual lists (e.g., ["AAPL", "2023-01-03", 130.28, ...]):
    data = []

    # 3) Iterate over each lines:
    for line in lines:
        # 3.1) Purpose: split each components of a line:
        # - re.split(): split the line into parts when a certain pattern is met:
        # - r'\s*,\s*': {\s*: zero or more whitespace characters, ,: comma, \s*: Whitespace after the comma}
        # Ex input: line = "1 AAPL, 2023-01-03 00:00:00, 130.27, 130.90, 124.17, 125.07"
        # Ex output: parts: ["1 AAPL", "2023-01-03 00:00:00", "130.27", "130.90", "124.17", "125.07"]
        parts = re.split(r'\s*,\s*', line)

        # 3.2) Ensures there 7 features:
        if len(parts)!=7:
            print(f"skipping malformed row: {line}")
            continue

        # 3.3) Extract each features:
        # parts[0]: Refers to the 1st element of parts which is "1 AAPL".
        # .split(): Splits the "1 AAPL" string into ["1", "AAPL"]
        # [1]: extract ["AAPL"]
        ticker = parts[0].split()[1]
        date = parts[1].strip()
        open_price = float(parts[2].strip())
        high = float(parts[3].strip())
        low = float(parts[4].strip())
        close = float(parts[5].strip())
        adj_close = float(parts[6].strip())

        # Combine the parsed value into a string:
        data.append([ticker, date, open_price, high, low, close, adj_close])

    # Create a DataFrame from the parsed data:
    df = pd.DataFrame(data, columns=["Ticker", "Date", "Open", "High", "Low", "Close", "Adj Close"])
    # Convert the date column into a datetime obj:
    df["Date"] = pd.to_datetime(df["Date"])
    # Round numeral features to 3 decimals:
    df[["Open", "High", "Low", "Close", "Adj Close"]] = df[["Open", "High", "Low", "Close", "Adj Close"]].round(3)
    # Return the DataFrame:
    return df

df = convert_str_to_df(processed_df)
print(df)
        

    Ticker       Date    Open    High     Low   Close  Adj Close
0     AAPL 2023-01-03  130.28  130.90  124.17  125.07    123.768
1     AAPL 2023-01-04  126.89  128.66  125.08  126.36    125.045
2     AAPL 2023-01-05  127.13  127.77  124.76  125.02    123.719
3     AAPL 2023-01-06  126.01  130.29  124.89  129.62    128.271
4     AAPL 2023-01-09  130.47  133.41  129.89  130.15    128.796
..     ...        ...     ...     ...     ...     ...        ...
245   AAPL 2023-12-22  195.18  195.41  192.97  193.60    192.656
246   AAPL 2023-12-26  193.61  193.89  192.83  193.05    192.109
247   AAPL 2023-12-27  192.49  193.50  191.09  193.15    192.208
248   AAPL 2023-12-28  194.14  194.66  193.17  193.58    192.636
249   AAPL 2023-12-29  193.90  194.40  191.73  192.53    191.591

[250 rows x 7 columns]


In [20]:
### Question 4:
### Compute Monthly Returns for the past 2 years:
def compute_monthly_returns(df: pd.DataFrame) -> pd.DataFrame:
    # 1) Compute daily returns:
    df["Daily Return"] = df["Adj Close"].pct_change()
    # 2) Remove Nan values:
    df.dropna(axis=0, subset=["Daily Return"], inplace=True)
    # 3) Reshape the DF by months and compute monthly returns:
    monthly_returns = df.resample('M', on='Date')["Daily Return"].apply(lambda x: (1+x).prod()-1)
    # 4) Convert the series to a DataFrame and reset the index:
    monthly_returns = monthly_returns.reset_index()
    # 5) Add the Ticker column
    monthly_returns['Ticker'] = df['Ticker'].iloc[0]
    monthly_returns.columns=["Date", "Monthly Return", "Ticker"]
    # 5) Return the dataFrame:
    return monthly_returns

df_with_monthly_returns = compute_monthly_returns(df)
print(df_with_monthly_returns)

         Date  Monthly Return Ticker
0  2023-01-31        0.154132   AAPL
1  2023-02-28        0.023188   AAPL
2  2023-03-31        0.118646   AAPL
3  2023-04-30        0.028990   AAPL
4  2023-05-31        0.046054   AAPL
5  2023-06-30        0.094329   AAPL
6  2023-07-31        0.012789   AAPL
7  2023-08-31       -0.042386   AAPL
8  2023-09-30       -0.088679   AAPL
9  2023-10-31       -0.002568   AAPL
10 2023-11-30        0.113780   AAPL
11 2023-12-31        0.013580   AAPL


  monthly_returns = df.resample('M', on='Date')["Daily Return"].apply(lambda x: (1+x).prod()-1)
