In [17]:
import pandas as pd
import glob
import datetime

# Define the string that the file names must start with
file_filter = 'AAPL'

# Define the path to the directory containing the CSV files
path = '/Users/chrisjackson/OptionsApp/data/' + file_filter

# Use glob to get a list of all CSV files in the directory that start with the specified string
all_files = glob.glob(path + "/" + file_filter + "*.csv")

def add_sample_date(file_name):
    # Read the CSV file into a data frame
    df = pd.read_csv(file_name)
    # Add a column to the data frame containing the date of the sample
    df['sample_date'] = file_name[-16:-6]
    return df

# Combine all CSV files into a single data frame
df = pd.concat((add_sample_date(f) for f in all_files), ignore_index=True)

# Define a function to extract the expiration date from an OCC contract symbol
def extract_expiry_date(occ_symbol):
    year = int(occ_symbol[4:6])
    month = int(occ_symbol[6:8])
    day = int(occ_symbol[8:10])
    expiry_date = datetime.datetime(year + 2000, month, day)
    return expiry_date

# Define a function to convert the timestamp to a datetime object
def convert_timestamp(timestamp):
    if len(timestamp) > 8:
        timestamp = timestamp[-8:]        
    if timestamp[1] == ':':
        print(timestamp)
    hour = int(timestamp[0:2])
    minute = int(timestamp[3:5])
    second = int(timestamp[6:8])
    return datetime.time(hour, minute, second)

# Add a new column to the DataFrame with the expiry date
df['expiryDate'] = df['contractSymbol'].apply(extract_expiry_date)

# Convert the 'lastTradeDate' and 'timestamp' columns to datetime objects for future calculations
df['lastTradeDate'] = pd.to_datetime(df['lastTradeDate'])
# df['timestamp'] = pd.to_datetime(df['timestamp'], format='%H:%M:%S')
df['timestamp'] = df['timestamp'].apply(convert_timestamp)

# Write the combined data frame to a CSV file and show the number of rows written to the file
df.to_csv(path + "/" + file_filter + "_combined.csv", index=False)
df.info()


  df = pd.read_csv(file_name)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6115105 entries, 0 to 6115104
Data columns (total 17 columns):
 #   Column             Dtype         
---  ------             -----         
 0   contractSymbol     object        
 1   strike             float64       
 2   lastPrice          float64       
 3   change             float64       
 4   volume             float64       
 5   openInterest       float64       
 6   bid                float64       
 7   ask                float64       
 8   lastTradeDate      datetime64[ns]
 9   impliedVolatility  float64       
 10  inTheMoney         bool          
 11  timestamp          object        
 12  sample_date        object        
 13  currency           object        
 14  percentChange      float64       
 15  contractSize       object        
 16  expiryDate         datetime64[ns]
dtypes: bool(1), datetime64[ns](2), float64(9), object(5)
memory usage: 752.3+ MB


In [18]:
df.head()

Unnamed: 0,contractSymbol,strike,lastPrice,change,volume,openInterest,bid,ask,lastTradeDate,impliedVolatility,inTheMoney,timestamp,sample_date,currency,percentChange,contractSize,expiryDate
0,AAPL230428C00050000,50.0,115.05,0.0,2.0,2.0,115.35,115.45,2023-04-21 15:29:23,5.863284,True,10:42:40,2023-04-24,,,,2023-04-28
1,AAPL230428C00070000,70.0,95.78,0.0,0.0,1.0,95.3,95.95,2023-04-18 17:04:52,4.505864,True,10:42:40,2023-04-24,,,,2023-04-28
2,AAPL230428C00080000,80.0,85.85,0.0,0.0,1.0,85.3,85.55,2023-04-18 17:04:52,3.737305,True,10:42:40,2023-04-24,,,,2023-04-28
3,AAPL230428C00090000,90.0,65.68,0.0,2.0,2.0,74.3,76.05,2023-03-17 16:24:01,3.025393,True,10:42:40,2023-04-24,,,,2023-04-28
4,AAPL230428C00095000,95.0,70.0,0.0,5.0,9.0,70.3,70.75,2023-04-21 18:27:35,3.013674,True,10:42:40,2023-04-24,,,,2023-04-28


In [None]:
import matplotlib.pyplot as plt


# Sort the options by volume, and select the top 20%
top20 = df[df['volume'] >= df['volume'].quantile(0.8)]

# Group the options by their expiration date, and calculate the average and standard deviation of the price changes
grouped = top20.groupby('expiryDate')
mean_changes = grouped['change'].mean()
std_changes = grouped['change'].std()

# Flag any options whose price change is more than two standard deviations away from the mean
threshold = 2 * std_changes
flagged = (df['change'] - mean_changes[df['expiryDate']].values) > threshold[df['expiryDate']].values

# Create a scatter plot of the price changes, highlighting the flagged options
fig, ax = plt.subplots()
ax.scatter(df['expiryDate'], df['change'], alpha=0.5)
ax.scatter(df.loc[flagged, 'expiryDate'], df.loc[flagged, 'change'], c='red')
ax.set_xlabel('Expiration Date')
ax.set_ylabel('Price Change')
ax.set_title('Flagged Options')
plt.show()


In [None]:
# Group the options by their expiration date, and calculate the mean volume for each expiration date
grouped = df.groupby('expiryDate')['volume'].mean()

# Sort the expiration dates by volume in descending order, and select the top 3 expiration dates
top3 = grouped.sort_values(ascending=False).head(3)

# Filter the data frame to only include options with the top 3 expiration dates
df_top3 = df[df['expiryDate'].isin(top3.index)]

# Group the top 3 expiration dates by their expiration date, and calculate the average and standard deviation of the price changes
grouped_top3 = df_top3.groupby('expiryDate')
mean_changes = grouped_top3['change'].mean()
std_changes = grouped_top3['change'].std()

# Flag any options whose price change is more than two standard deviations away from the mean
threshold = 2 * std_changes
flagged = (df_top3['change'] - mean_changes[df_top3['expiryDate']].values) > threshold[df_top3['expiryDate']].values

# Create a scatter plot of the price changes for the top 3 expiration dates, highlighting the flagged options
fig, ax = plt.subplots(figsize=(15, 7.5))
for i, (name, group) in enumerate(df_top3.groupby('expiryDate')):
    ax.scatter(group['timestamp'], group['change'], alpha=0.5, label=name if i == 0 else None)
ax.scatter(df_top3.loc[flagged, 'timestamp'], df_top3.loc[flagged, 'change'], c='red')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Price Change')
ax.set_title('Flagged Options (Top 3 Expiration Dates by Volume)')
ax.legend()
plt.show()
