In [13]:
import pandas as pd
import glob
import datetime

# Define the string that the file names must start with
file_filter = 'AAPL'

# Define the path to the directory containing the CSV files
path = '/Users/chrisjackson/OptionsApp/data/' + file_filter

# Use glob to get a list of all CSV files in the directory that start with the specified string
all_files = glob.glob(path + "/" + file_filter + "*.csv")

def add_sample_date(file_name):
    # Read the CSV file into a data frame
    df = pd.read_csv(file_name)
    # Add a column to the data frame containing the date of the sample
    df['sample_date'] = file_name[-16:-6]
    return df

# Combine all CSV files into a single data frame
df = pd.concat((add_sample_date(f) for f in all_files), ignore_index=True)

# Define a function to extract the expiration date from an OCC contract symbol
def extract_expiry_date(occ_symbol):
    year = int(occ_symbol[4:6])
    month = int(occ_symbol[6:8])
    day = int(occ_symbol[8:10])
    expiry_date = datetime.datetime(year + 2000, month, day)
    return expiry_date

# Define a function to convert the timestamp to a datetime object
def convert_timestamp(timestamp):
    if len(timestamp) > 8:
        timestamp = timestamp[-8:]        
    hour = int(timestamp[0:2])
    minute = int(timestamp[3:5])
    second = int(timestamp[6:8])
    return datetime.time(hour, minute, second)

# Add a new column to the DataFrame with the expiry date
df['expiryDate'] = df['contractSymbol'].apply(extract_expiry_date)

# Convert the 'lastTradeDate' and 'timestamp' columns to datetime objects for future calculations
df['lastTradeDate'] = pd.to_datetime(df['lastTradeDate'])

# df['timestamp'] = pd.to_datetime(df['timestamp'], format='%H:%M:%S')
df['timestamp'] = df['timestamp'].apply(convert_timestamp)

# Drop the currency column
df.drop('currency', axis=1, inplace=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1223021 entries, 0 to 1223020
Data columns (total 16 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   contractSymbol     1223021 non-null  object        
 1   strike             1223021 non-null  float64       
 2   lastPrice          1223021 non-null  float64       
 3   change             1223021 non-null  float64       
 4   volume             1223021 non-null  float64       
 5   openInterest       1223021 non-null  float64       
 6   bid                1223021 non-null  float64       
 7   ask                1223021 non-null  float64       
 8   lastTradeDate      1223021 non-null  datetime64[ns]
 9   impliedVolatility  1223021 non-null  float64       
 10  inTheMoney         1223021 non-null  bool          
 11  timestamp          1223021 non-null  object        
 12  sample_date        1223021 non-null  object        
 13  percentChange      529261 n

In [14]:
# Need to sort this into a multi-index dataframe by sample date and contract symbol

# Set the multi-index
df.set_index(['sample_date', 'contractSymbol'], inplace=True)

# Sort by the timestamp
df = df.sort_values(by=['timestamp'])


In [15]:
df.index.levels

FrozenList([['2023-04-19', '2023-04-20', '2023-04-21', '2023-04-24', '2023-04-25', '2023-04-26', '2023-04-27', '2023-04-28', '2023-05-01', '2023-05-03', '2023-05-04', '2023-05-12', '2023-05-15', '2023-05-17', '2023-05-18', '2023-05-24', '2023-05-30', '2023-05-31', '2023-06-01'], ['AAPL230421C00050000', 'AAPL230421C00060000', 'AAPL230421C00065000', 'AAPL230421C00070000', 'AAPL230421C00075000', 'AAPL230421C00080000', 'AAPL230421C00085000', 'AAPL230421C00090000', 'AAPL230421C00095000', 'AAPL230421C00100000', 'AAPL230421C00105000', 'AAPL230421C00110000', 'AAPL230421C00115000', 'AAPL230421C00120000', 'AAPL230421C00125000', 'AAPL230421C00130000', 'AAPL230421C00135000', 'AAPL230421C00137000', 'AAPL230421C00138000', 'AAPL230421C00139000', 'AAPL230421C00140000', 'AAPL230421C00141000', 'AAPL230421C00142000', 'AAPL230421C00143000', 'AAPL230421C00144000', 'AAPL230421C00145000', 'AAPL230421C00146000', 'AAPL230421C00147000', 'AAPL230421C00148000', 'AAPL230421C00149000', 'AAPL230421C00150000', 'AAPL2

In [16]:
#Retreive the timestamp as a datetime object
def get_timestamp_breakdown(timestamp):
    hour = int(timestamp[0:2])
    minute = int(timestamp[3:5])
    second = int(timestamp[6:8])
    return (hour, minute, second)

In [12]:
# Get the value where the in the money column changes from False to True, pass in the first timestamp only for now
def in_the_money(df):
    # Assuming df is your DataFrame and 'column_name' is the column of interest
    df['previous_value'] = df['column_name'].shift(1)
    df['change_to_true'] = (df['previous_value'] == False) & (df['column_name'] == True)

    # Get the index of the first row where the value changed to True
    first_change_index = df['change_to_true'].idxmax()

    # Get the first row where the value changed to True
    first_change_row = df.loc[first_change_index]

    return first_change_row


In [11]:
import matplotlib.pyplot as plt

# Calculate the 80th percentile of volume for each day
volume_threshold = df.groupby(level=0)['volume'].quantile(0.9)

# For each day, select the options with volume greater than or equal to the threshold
top_volume_options = df[df.groupby('sample_date')['volume'].transform(lambda x: x >= volume_threshold[x.name])]

# Sort the DataFrame by sample_date and timestamp in descending order
top_volume_options.sort_values(['sample_date', 'timestamp'], ascending=[True, False], inplace=True)

# Group the options by their contract symbol, and calculate the average and standard deviation of the price changes
grouped = top_volume_options.groupby('contractSymbol').reset_index()
mean_changes = grouped['change'].mean()
std_changes = grouped['change'].std()

# Flag any options whose price change is more than two standard deviations away from the mean
threshold = 2 * std_changes
flagged = (df['change'] - mean_changes[df['expiryDate']].values) > threshold[df['expiryDate']].values

# Create a scatter plot of the price changes, highlighting the flagged options
fig, ax = plt.subplots()
ax.scatter(df['expiryDate'], df['change'], alpha=0.5)
ax.scatter(df.loc[flagged, 'expiryDate'], df.loc[flagged, 'change'], c='red')
ax.set_xlabel('Expiration Date')
ax.set_ylabel('Price Change')
ax.set_title('Flagged Options')
plt.show()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_volume_options.sort_values(['sample_date', 'timestamp'], ascending=[True, False], inplace=True)


AttributeError: 'DataFrameGroupBy' object has no attribute 'reset_index'

In [None]:
# Group the options by their expiration date, and calculate the mean volume for each expiration date
grouped = df.groupby('expiryDate')['volume'].mean()

# Sort the expiration dates by volume in descending order, and select the top 3 expiration dates
top3 = grouped.sort_values(ascending=False).head(3)

# Filter the data frame to only include options with the top 3 expiration dates
df_top3 = df[df['expiryDate'].isin(top3.index)]

# Group the top 3 expiration dates by their expiration date, and calculate the average and standard deviation of the price changes
grouped_top3 = df_top3.groupby('expiryDate')
mean_changes = grouped_top3['change'].mean()
std_changes = grouped_top3['change'].std()

# Flag any options whose price change is more than two standard deviations away from the mean
threshold = 2 * std_changes
flagged = (df_top3['change'] - mean_changes[df_top3['expiryDate']].values) > threshold[df_top3['expiryDate']].values

# Create a scatter plot of the price changes for the top 3 expiration dates, highlighting the flagged options
fig, ax = plt.subplots(figsize=(15, 7.5))
for i, (name, group) in enumerate(df_top3.groupby('expiryDate')):
    ax.scatter(group['timestamp'], group['change'], alpha=0.5, label=name if i == 0 else None)
ax.scatter(df_top3.loc[flagged, 'timestamp'], df_top3.loc[flagged, 'change'], c='red')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Price Change')
ax.set_title('Flagged Options (Top 3 Expiration Dates by Volume)')
ax.legend()
plt.show()
