# Frequency of IDs that Decrease $\alpha_R$

In [1]:
import csv
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

import matplotlib.patches as mpatches

from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression

from IPython.core.debugger import Pdb #Pdb().set_trace()
from collections import Counter


# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Regressor columns of the scaling law, in design-matrix order. get_regression
# prepends an intercept, so the exponent of coeffs[k] is params[k + 1].
coeffs = [
    'IP', 'BT', 'NEL', 'PLTH',
    'RGEO', 'KAREA', 'EPS', 'MEFF',
]
# Data folder, relative to the notebook. The trailing slash is required because
# file names are appended by plain string concatenation.
path = "../data/"

In [2]:
# Random-sampling results: later cells read a `seed` column and treat the other
# columns as alpha-R values whose names end in "_<sample size>".
random_sampling = pd.read_csv(
    os.path.join(path, "decreased_dataset_random_sampling_500_decreasing_points.csv")
)

In [3]:
# Summary statistics with one row per column of `random_sampling`.
df = random_sampling.describe().T

# alpha-R level of interest: find the first column (in file order) whose
# minimum falls below it. The sample size is the integer after the last "_"
# in that column's name.
threshold = 0.64
min_subset_size = int(df[df['min'] < threshold].index[0].split("_")[-1])

# NOTE(review): the parsed size is used as a 1-based column position, which
# presumes the k-th column holds the samples of size k -- confirm against the CSV.
amounts = random_sampling.columns[min_subset_size - 1:]

# Largest alpha-R among the kept columns. The `seed` column is excluded by name
# instead of assuming it is the single largest entry and skipping it with
# a positional `iloc[-2]`.
max_alpha_R = round(
    random_sampling[amounts]
    .drop(columns="seed", errors="ignore")
    .max(numeric_only=True)
    .max(),
    4,
)

# NOTE: this rebinding makes the cell non-idempotent; reload `random_sampling`
# before running the cell a second time.
random_sampling = random_sampling[amounts]
print(f"After sampling size of {min_subset_size}, all alpha-R in random sampling will be < {max_alpha_R}")

After sampling size of 1462, all alpha-R in ramdon sampling will be < 0.6718


In [4]:
DB2P8 = pd.read_csv(path+"DB2P8.csv")
DB5 = pd.read_csv(path+"DB5.csv")
# DB2P8 carries more columns than DB5; keep only the ones DB5 has.
DB2 = DB2P8.loc[:, DB5.columns]

# Shots of DB2P8 whose id is absent from DB5 are appended to DB5, so that
# DB5 contains every DB2P8 shot (the author reports two such shots).
missing_shots = DB2.loc[~DB2["id"].isin(DB5["id"].values)].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

decreasing_ds = pd.read_csv(path+"decreasing_dataset_info.csv")

# Subset of DB5 listed in `decreasing_ds` (the shots that decrease alpha_R),
# annotated with the per-id `decreased` and `weights` values from that file.
R_dec = (
    DB5.loc[DB5["id"].isin(decreasing_ds["id"])]
    .reset_index(drop=True)
    .assign(
        decreasing_pts=lambda shots: shots["id"].map(
            dict(zip(decreasing_ds["id"], decreasing_ds["decreased"]))
        ),
        decreasing_weights=lambda shots: shots["id"].map(
            dict(zip(decreasing_ds["id"], decreasing_ds["weights"]))
        ),
    )
)

In [5]:
def get_regression(_R):
    """
    Fit a log-linear OLS model of TAUTH on DB2 extended with the rows of `_R`.

    Data must be given in linear (NOT log) scale: the function itself takes the
    logarithm of TAUTH and of the absolute value of every column in `coeffs`.

    Parameters
    ----------
    _R : pandas.DataFrame
        Rows stacked below the module-level `DB2`. An empty frame fits DB2 alone.

    Returns
    -------
    tuple
        `(data, regression, (n, p))`: the stacked frame, the fitted statsmodels
        OLS results, and the shape of the design matrix (p includes the intercept).
    """
    data = pd.concat([DB2, _R], axis=0, ignore_index=True)

    # Response: log(TAUTH), kept as a single-column 2-D array.
    log_tau = np.log(data[["TAUTH"]]).to_numpy()

    # Design matrix: a leading column of ones (intercept) followed by log|x|
    # for each regressor, in the order given by `coeffs`.
    log_features = np.log(data[coeffs].abs())
    design = np.column_stack((np.ones(len(log_features)), log_features.to_numpy()))

    regression = sm.OLS(log_tau, design).fit()
    return data, regression, design.shape

# Baseline regression on DB2P8 only: an empty slice of R_dec keeps the column
# layout but contributes no rows. `iloc[:0]` is empty by construction, whereas
# filtering on `id == 0` is only empty when no shot happens to have that id.
empty_R = R_dec.iloc[:0]
regression_DB2 = get_regression(empty_R)[1]
#regression_DB2.summary()

In [6]:
# Every column except the last one counts as a sampling column
# (the last column is presumably `seed` -- verify against the CSV layout).
total_num_of_sampling = random_sampling.columns[:-1].size
seeds = random_sampling["seed"].values

```python
# Create the output folder up front so the first `open` cannot fail on a
# missing directory.
os.makedirs(path + "IDs", exist_ok=True)

# The sample size is the integer after the last "_" of each sampling column
# name. It does not depend on the seed, so parse every column once.
sample_sizes = [
    int(column.split("_")[-1])
    for column in random_sampling.columns[:total_num_of_sampling]
]

# Position of the RGEO exponent (alpha_R) among the fitted parameters:
# get_regression puts the intercept first, hence the +1.
alpha_R_position = coeffs.index("RGEO") + 1

for i, s in enumerate(seeds):
    for j, amount in enumerate(sample_sizes):
        # Weighted draw of `amount` shots, reproducible through the stored seed.
        R_sampled = R_dec.sample(n=amount, weights='decreasing_pts', random_state=s)
        alpha_R = get_regression(R_sampled)[1].params[alpha_R_position]
        IDs = R_sampled.id.values

        # One file per (seed index, column index); alpha_R is part of the name.
        with open(path+f'IDs/IDs_alpha_({i}_{j})_{alpha_R}.csv', mode='w', newline='') as csv_file:
            # '|'-delimited CSV with a single "ids" column
            writer = csv.writer(csv_file, delimiter='|')
            writer.writerow(['ids'])
            writer.writerows([id_] for id_ in IDs)
```

In [7]:
# Ids to look for in the sampled-id files, each starting with a count of zero.
search_strings = decreasing_ds["id"].values
string_counts = dict.fromkeys(search_strings, 0)

```Python
# Tally how often every id occurs across all sampled-id files. Ids are matched
# exactly and counted in one pass per file, instead of running one regex
# substring scan per id per file.
ids_dir = path + "IDs"
sampled_id_counts = Counter()

for filename in os.listdir(ids_dir):
    # Ignore anything that is not a sample file (e.g. checkpoint folders).
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(ids_dir, filename)
    # The files are written with '|' as delimiter and a single "ids" column;
    # read ids as text so purely numeric ids are not converted to numbers.
    sampled_ids = pd.read_csv(file_path, sep="|", usecols=["ids"], dtype=str)["ids"]
    sampled_id_counts.update(sampled_ids.dropna())

# Add the tallies to the running counts; ids never sampled keep their value.
for id_ in string_counts:
    string_counts[id_] += sampled_id_counts[id_]

# Built column-wise so `frequency` keeps an integer dtype.
df_ = pd.DataFrame({
    "id": list(string_counts.keys()),
    "frequency": list(string_counts.values()),
})
#df_.to_csv(path+"id_vs_frequency_decreasing_ds.csv", index=False)
```

In [9]:
# Stored id-vs-frequency table (same file name as the commented-out `to_csv`
# in the counting code), loaded so the expensive count need not be repeated.
df_ = pd.read_csv(os.path.join(path, "id_vs_frequency_decreasing_ds.csv"))

In [10]:
# Distinct frequency values below 9000, in order of first appearance.
df_.loc[df_["frequency"] < 9000, "frequency"].unique()

array([8297, 8302, 8320, 8288, 8251, 8312, 8292, 8305, 8298, 8331, 8290,
       8313, 8314, 8285, 8303, 8294, 8296, 8281, 8248, 8304, 8307, 8317,
       8283,    0, 8300, 8229, 8255, 8293, 8319, 8316, 8333, 8252, 8301,
       8287, 8308, 8310, 8311, 8309, 8321, 8318, 8306, 8275, 8274, 8295,
       8338, 8278, 8277, 8291, 8323, 8286, 8299, 8284, 8289, 8280, 8328,
       8262, 8282, 8315, 8268, 8276, 8279, 8324, 8272, 8267, 8271, 8327,
       8326, 8265, 8322, 8266, 8332, 8330, 8329, 8256, 8264, 8260, 8325,
       8258])

In [11]:
# Ids that were sampled at least once.
df_.loc[df_["frequency"].gt(0)]

Unnamed: 0,id,frequency
0,AEWB71,8297
1,UAAXGA,8302
2,53LSPJ,8320
3,FLO8WI,8320
4,TKCDLW,8297
...,...,...
2533,G8OILS,8312
2537,ST5FL8,8317
2538,G8PHEQ,8300
2539,K3O76W,8279


In [12]:
# Highest frequency over all ids (builtin max over the column's values).
max(df_["frequency"])

8338

In [13]:
# Lowest frequency among the ids that were sampled at least once.
min(df_.loc[df_["frequency"] > 0, "frequency"])

8229