In [42]:
# Imports
import pandas as pd
import os
import sys

# Importing Data

A MultiIndexed DataFrame in pandas is a DataFrame that has multiple levels of indexing on one or both axes (rows and/or columns). This is useful for working with higher-dimensional data in a 2D structure.

Here the indices are:
 - Ticker
 - Date

In [43]:
# Path to the folder containing stock files
folder_path = './data'
sys.path.append(folder_path)

# Prepare a list to hold DataFrames
dataframes = []

# Loop through all files in the directory
for file in os.listdir(folder_path):
    if file.endswith('.us.txt'):
        ticker = file.split('.')[0]  # Extract 'aapl' from 'aapl.us.txt'
        file_path = os.path.join(folder_path, file)

        # Read the CSV
        df = pd.read_csv(file_path, parse_dates=['Date'])

        # Set index to Date
        df.set_index('Date', inplace=True)

        # Add a level to the index for the ticker
        df['Ticker'] = ticker
        df.set_index('Ticker', append=True, inplace=True)

        # Reorder MultiIndex: Ticker -> Date
        df = df.reorder_levels(['Ticker', 'Date'])

        dataframes.append(df)

# Combine all into one MultiIndexed DataFrame
combined_df = pd.concat(dataframes).sort_index()
combined_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,OpenInt
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
aapl,1984-09-07,0.42388,0.42902,0.41874,0.42388,23220030,0
aapl,1984-09-10,0.42388,0.42516,0.41366,0.42134,18022532,0
aapl,1984-09-11,0.42516,0.43668,0.42516,0.42902,42498199,0
aapl,1984-09-12,0.42902,0.43157,0.41618,0.41618,37125801,0
aapl,1984-09-13,0.43927,0.44052,0.43927,0.43927,57822062,0


# Data Cleaning

First we identify missing rows per ticker. Then interpolate data to fill them.

In [44]:
# Count missing values per column per ticker
missing_counts = combined_df.isnull().groupby(level='Ticker').sum()
missing_counts

Unnamed: 0_level_0,Open,High,Low,Close,Volume,OpenInt
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aapl,0,0,0,0,0,0
amzn,0,0,0,0,0,0
googl,0,0,0,0,0,0
msft,0,0,0,0,0,0
tsla,0,0,0,0,0,0


As we can see, there is no missing data so no need for cleaning.

Also the Date column is properly parsed when we loaded the data.

So we move forward to sorting and filtering the data to include only the **last 10 years**

In [45]:
combined_df = combined_df.sort_index()

# Define the date 10 years ago from today
cutoff_date = pd.Timestamp.today() - pd.DateOffset(years=10)

# Filter the DataFrame using the Date level
filtered_df = combined_df.loc[
    combined_df.index.get_level_values('Date') >= cutoff_date
]

filtered_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,OpenInt
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
aapl,2015-06-01,125.22,125.41,124.09,124.57,33649578,0
aapl,2015-06-02,123.92,124.68,123.43,124.0,35247319,0
aapl,2015-06-03,124.69,124.96,123.95,124.16,32466240,0
aapl,2015-06-04,123.68,124.61,123.03,123.47,40290158,0
aapl,2015-06-05,123.63,123.79,122.49,122.77,37331747,0


# Data Transformation

We calculate and add important informations from data.

They include:
 - **Daily Return**: Percentage change in the closing price from the previous day.
 - **7-Day Moving Average**: Average of the closing price over the past 7 trading days.
 - **30-Day Moving Average**: Average of the closing price over the past 30 trading days.
 - **Rolling Volatility(30d)**: Standard deviation of daily returns over the past 30 trading days.

Here I define a function to add all indicators at once, and use df.apply to apply them.

In [46]:
# Reset index to columns
temp_df = filtered_df.reset_index()

def add_indicators(df):
    df = df.sort_values('Date')
    df['Daily Return'] = df['Close'].pct_change()
    df['7MA'] = df['Close'].rolling(window=7).mean()
    df['30MA'] = df['Close'].rolling(window=30).mean()
    df['Rolling Volatility'] = df['Daily Return'].rolling(window=30).std()
    return df

# Apply on groups of Ticker
enhanced_temp = temp_df.groupby('Ticker').apply(add_indicators)

# Drop the extra Ticker level created by groupby + apply
enhanced_temp.reset_index(level=0, drop=True, inplace=True)

# Set MultiIndex again
enhanced_df = enhanced_temp.set_index(['Ticker', 'Date']).sort_index()

enhanced_df.head()

  enhanced_temp = temp_df.groupby('Ticker').apply(add_indicators)


Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,OpenInt,Daily Return,7MA,30MA,Rolling Volatility
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
aapl,2015-06-01,125.22,125.41,124.09,124.57,33649578,0,,,,
aapl,2015-06-02,123.92,124.68,123.43,124.0,35247319,0,-0.004576,,,
aapl,2015-06-03,124.69,124.96,123.95,124.16,32466240,0,0.00129,,,
aapl,2015-06-04,123.68,124.61,123.03,123.47,40290158,0,-0.005557,,,
aapl,2015-06-05,123.63,123.79,122.49,122.77,37331747,0,-0.005669,,,


# Analysis

Q1: Which stock had the highest average return over the 10-year period?

🧠 Approach:

    Use the Daily Return column.

    Group by Ticker, calculate the mean return.

In [47]:
average_returns = enhanced_df['Daily Return'].groupby('Ticker').mean()
highest_avg_return_ticker = average_returns.idxmax()
highest_avg_return_value = average_returns.max()

print(f"📈 Stock with highest average return: {highest_avg_return_ticker} ({highest_avg_return_value:.4%})")

📈 Stock with highest average return: amzn (0.1700%)


Q2: Which stock had the most volatile month, and when?

🧠 Approach:

    Use the Daily Return column.

    Group by Ticker and month (use Date level), compute monthly volatility.

    Find the group with highest standard deviation.

In [48]:
# Reset index to make Date accessible as a column
df_reset = enhanced_df.reset_index()

# Add 'YearMonth' column for monthly grouping
df_reset['YearMonth'] = df_reset['Date'].dt.to_period('M')

# Group by Ticker + YearMonth, calculate std of Daily Return
monthly_volatility = df_reset.groupby(['Ticker', 'YearMonth'])['Daily Return'].std()

# Find the max volatility
most_volatile = monthly_volatility.idxmax()
most_volatile_value = monthly_volatility.max()

print(f"⚠️ Most volatile month: {most_volatile[0]} in {most_volatile[1]} with std dev of {most_volatile_value:.4%}")

⚠️ Most volatile month: tsla in 2016-02 with std dev of 4.6379%
