# Goal of the notebook

We need to filter the data so that the model performs better. Initially I planned to collect slightly more than 1 million observations, so that approximately 1 million observations would remain after filtering. I expect this to be sufficient to construct a fairly good model.

In [42]:
import pandas as pd
import numpy as np

In [43]:
# Load the cleaned option data from CSV into a DataFrame.
# NOTE(review): the stray `exec(code_obj, ...)` line in this cell's output looks
# like the tail of a pandas DtypeWarning (mixed-type column; e.g. Volume holds
# both numbers and "-") — confirm, and consider passing an explicit `dtype`.
dataset_full = pd.read_csv('3_Clean_data_02_05.csv')
# Bare expression: displays the frame as the cell output.
dataset_full

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Contract Name,Strike,Last Price,Bid,Ask,Change,% Change,Volume,Open Interest,Implied Volatility,Company,Date_get_data,Date_expiration,Time_to_maturity,r,Stock_price,Stock_volume
0,AAPL240223C00095000,95.0,86.97,87.1,87.6,0.00,-,1,7,1.7734,AAPL,2024-02-16,2024-02-23,0.019178,0.04,182.309998,49701400.0
1,AAPL240223C00100000,100.0,82.27,82.1,82.65,0.06,+0.07%,1,9,1.7344,AAPL,2024-02-16,2024-02-23,0.019178,0.04,182.309998,49701400.0
2,AAPL240223C00110000,110.0,83.50,72.1,72.65,0.00,-,-,1,1.4805,AAPL,2024-02-16,2024-02-23,0.019178,0.04,182.309998,49701400.0
3,AAPL240223C00115000,115.0,66.87,67.1,67.65,-2.18,-3.16%,1,2,1.3594,AAPL,2024-02-16,2024-02-23,0.019178,0.04,182.309998,49701400.0
4,AAPL240223C00120000,120.0,64.40,62.1,62.65,0.00,-,6,3,1.2461,AAPL,2024-02-16,2024-02-23,0.019178,0.04,182.309998,49701400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1223321,INTC261218C00050000,50.0,5.70,5.55,5.8,-0.80,-12.31%,42,1420,0.4266,INTC,2024-04-12,2026-12-18,2.684932,0.04,35.689999,80139400.0
1223322,INTC261218C00055000,55.0,4.80,4.7,4.8,-0.60,-11.11%,73,31727,0.4229,INTC,2024-04-12,2026-12-18,2.684932,0.04,35.689999,80139400.0
1223323,INTC261218C00060000,60.0,3.95,3.8,4.05,-0.57,-12.61%,68,2618,0.4229,INTC,2024-04-12,2026-12-18,2.684932,0.04,35.689999,80139400.0
1223324,INTC261218C00065000,65.0,3.30,2.52,3.35,-0.47,-12.47%,22,304,0.4189,INTC,2024-04-12,2026-12-18,2.684932,0.04,35.689999,80139400.0


In [44]:
def filter_contract_names(k, df=None):
    """Keep only the rows whose contract occurs at least ``k`` times.

    Parameters
    ----------
    k : int
        Minimum number of rows a ``'Contract Name'`` value must have in
        ``df`` for its rows to be kept.
    df : pandas.DataFrame, optional
        Frame containing a ``'Contract Name'`` column. When omitted, the
        notebook-level ``dataset_full`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the qualifying rows in their original order,
        re-indexed from 0 to ``n - 1``. The input frame is not modified.
    """
    if df is None:
        # Look the global up at call time rather than in the signature: a
        # default of ``df=dataset_full`` is evaluated once, when the cell
        # defining the function runs, and would keep pointing at a stale
        # frame if ``dataset_full`` were re-loaded afterwards.
        df = dataset_full

    contract_counts = df['Contract Name'].value_counts()
    valid_contract_names = contract_counts[contract_counts >= k].index
    keep = df['Contract Name'].isin(valid_contract_names)

    # Boolean indexing yields a new frame; drop the old row labels so the
    # result is numbered consecutively from zero.
    return df[keep].reset_index(drop=True)

In [45]:
# Keep only contracts that appear at least 11 times in the dataset;
# 11 rows per contract is what allows 10 lag features per option later on.
dataset_full_2 = filter_contract_names(11)
# Bare expression: displays the filtered frame as the cell output.
dataset_full_2

Unnamed: 0,Contract Name,Strike,Last Price,Bid,Ask,Change,% Change,Volume,Open Interest,Implied Volatility,Company,Date_get_data,Date_expiration,Time_to_maturity,r,Stock_price,Stock_volume
0,AAPL240301C00095000,95.0,93.48,87.2,87.95,0.00,-,-,1,1.5879,AAPL,2024-02-16,2024-03-01,0.038356,0.04,182.309998,49701400.0
1,AAPL240301C00100000,100.0,88.30,82.2,82.75,0.00,-,2,3,1.3789,AAPL,2024-02-16,2024-03-01,0.038356,0.04,182.309998,49701400.0
2,AAPL240301C00110000,110.0,84.86,72.25,72.75,0.00,-,-,4,1.2031,AAPL,2024-02-16,2024-03-01,0.038356,0.04,182.309998,49701400.0
3,AAPL240301C00135000,135.0,52.40,47.3,47.85,0.00,-,1,1,0.8027,AAPL,2024-02-16,2024-03-01,0.038356,0.04,182.309998,49701400.0
4,AAPL240301C00140000,140.0,44.95,42.3,42.85,0.00,-,4,21,0.7188,AAPL,2024-02-16,2024-03-01,0.038356,0.04,182.309998,49701400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1167986,INTC261218C00050000,50.0,5.70,5.55,5.8,-0.80,-12.31%,42,1420,0.4266,INTC,2024-04-12,2026-12-18,2.684932,0.04,35.689999,80139400.0
1167987,INTC261218C00055000,55.0,4.80,4.7,4.8,-0.60,-11.11%,73,31727,0.4229,INTC,2024-04-12,2026-12-18,2.684932,0.04,35.689999,80139400.0
1167988,INTC261218C00060000,60.0,3.95,3.8,4.05,-0.57,-12.61%,68,2618,0.4229,INTC,2024-04-12,2026-12-18,2.684932,0.04,35.689999,80139400.0
1167989,INTC261218C00065000,65.0,3.30,2.52,3.35,-0.47,-12.47%,22,304,0.4189,INTC,2024-04-12,2026-12-18,2.684932,0.04,35.689999,80139400.0


In [46]:
# If we keep only the options that appear at least 11 times in the dataset,
# we lose fewer than 60,000 observations, which is equivalent to two days of collecting data.
# In exchange, we are now able to introduce 10 lag features for each option.
# The expression below is the number of rows removed by the filter.
len(dataset_full) - len(dataset_full_2)

55335

In [47]:
# Save the filtered data for the next step. `index=False` (the documented
# boolean form, rather than the falsy `None`) omits the row labels from the file.
dataset_full_2.to_csv('4_Filtered_data_02_05.csv', index=False)

### Complete