# Frequency of IDs that Decrease $\alpha_R$

In [1]:
import csv
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

import matplotlib.patches as mpatches

from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression

from IPython.core.debugger import Pdb #Pdb().set_trace()
from collections import Counter


# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Regressor columns of the scaling-law fit; get_regression takes log(|x|) of each
# and prepends an intercept, so coefficient k of the fit belongs to coeffs[k-1].
coeffs = ['IP', 'BT', 'NEL', 'PLTH', 'RGEO', 'KAREA', 'EPS', 'MEFF']
# Directory prefix (note the trailing slash) used for every CSV read or written below.
path = "../data/"

In [2]:
# Random-sampling results table. Later cells read a `seed` column from it and parse
# the sample size from the suffix after the last "_" of the other column names
# (presumably one alpha_R column per sample size -- TODO confirm against the CSV).
random_sampling = pd.read_csv(path + "decreased_dataset_random_sampling_500_decreasing_points.csv")

In [3]:
# Per-column summary statistics (one row per numeric column of random_sampling).
df = random_sampling.describe().T

threshold = 0.75
# First column whose minimum falls below the threshold; its name ends in "_<sample size>".
first_col_below = df[df['min'] < threshold].index[0]
min_subset_size = int(first_col_below.split("_")[-1])

# Select by LABEL, not by position. The previous positional slice
# (columns[min_subset_size - 1:]) only worked if each column's numeric suffix equalled
# its position + 1, and it sliced the already-trimmed frame again when the cell was
# re-run. A label slice is idempotent: re-running this cell yields the same frame.
amounts = random_sampling.loc[:, first_col_below:].columns
# Second-largest column maximum; presumably this skips the trailing `seed` column,
# whose values would otherwise dominate -- TODO confirm.
max_alpha_R = round((random_sampling[amounts].describe().T)["max"].sort_values().iloc[-2], 4)

random_sampling = random_sampling[amounts]
print(f"After sampling size of {min_subset_size}, all alpha-R in ramdon sampling will be < {max_alpha_R}")

After sampling size of 1061, all alpha-R in ramdon sampling will be < 0.8631


In [4]:
# Raw databases. DB2P8 carries more columns than DB5, so DB2 is DB2P8 narrowed
# to exactly the DB5 columns (same order).
DB5 = pd.read_csv(path+"DB5.csv")
DB2P8 = pd.read_csv(path+"DB2P8.csv")
DB2 = DB2P8.loc[:, DB5.columns]

# Some DB2P8 shots are absent from DB5: append them so DB5 is a superset of DB2.
_already_in_DB5 = DB2["id"].isin(DB5["id"])
missing_shots = DB2.loc[~_already_in_DB5].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

# Re-introduce dataset: the DB5 rows whose id is listed in the decreasing-dataset file.
decreasing_ds = pd.read_csv(path+"decreasing_dataset_info.csv")
R_dec = DB5.loc[DB5["id"].isin(decreasing_ds["id"])].reset_index(drop=True)

# Attach the per-id `decreased` and `weights` values from the info file.
_decreased_by_id = dict(zip(decreasing_ds["id"], decreasing_ds["decreased"]))
_weights_by_id = dict(zip(decreasing_ds["id"], decreasing_ds["weights"]))
R_dec["decreasing_pts"] = R_dec["id"].map(_decreased_by_id)
R_dec["decreasing_weights"] = R_dec["id"].map(_weights_by_id)

In [5]:
def get_regression(_R):
    """
    Fit an OLS regression of log(TAUTH) on log(|coeffs|) for DB2 plus the rows of `_R`.

    Data must be given in linear scale (NOT log scale); the logarithms are taken here.

    Parameters
    ----------
    _R : pandas.DataFrame
        Rows appended below the module-level `DB2` frame before fitting. It must
        provide the `TAUTH` column and every column named in `coeffs`.

    Returns
    -------
    tuple
        `(data, regression, (n, p))`: the concatenated frame, the fitted statsmodels
        OLS results, and the design-matrix shape (rows, columns incl. intercept).
    """
    data = pd.concat([DB2, _R], axis=0, ignore_index=True)

    # Response: log(TAUTH), kept as an (n, 1) column.
    Y_ = np.log(data[["TAUTH"]]).to_numpy()

    # Design matrix: a leading column of ones (intercept) followed by log|x|
    # for each regressor, in the order given by `coeffs`.
    log_regressors = np.log(data[coeffs].abs())
    X_ = np.column_stack([np.ones(len(log_regressors)), log_regressors.to_numpy()])

    regression = sm.OLS(Y_, X_).fit()
    return data, regression, X_.shape

# Getting regression of DB2P8 only: append zero extra rows to DB2.
# iloc[0:0] is empty by construction (same columns/dtypes as R_dec); the previous
# filter `R_dec.id.isin([0])` was only empty as long as no shot had id 0.
empty_R = R_dec.iloc[0:0]
regression_DB2 = get_regression(empty_R)[1]
#regression_DB2.summary()

In [6]:
# Seeds, one per row of the sampling table.
seeds = random_sampling["seed"].to_numpy()
# Every column except the last one counts as a sample-size column
# (assumes `seed` is the last column -- TODO confirm).
total_num_of_sampling = random_sampling.columns[:-1].size

In [7]:
# Position of the RGEO exponent (alpha_R) in the fitted parameters:
# get_regression puts the intercept at index 0, then one coefficient per entry of `coeffs`.
alpha_R_idx = coeffs.index("RGEO") + 1

for i, s in enumerate(seeds):
    for j in range(total_num_of_sampling):
        # The sample size is encoded as the suffix after the last "_" of the column name.
        amount = int(random_sampling.columns[j].split("_")[-1])
        R_sampled = R_dec.sample(n=amount, weights='decreasing_pts', random_state=s)

        # Fit ONCE per sample and reuse both outputs; the previous version called
        # get_regression twice per iteration, so every OLS was fitted twice.
        sampled_data, sampled_fit = get_regression(R_sampled)[:2]
        alpha_R = sampled_fit.params[alpha_R_idx]
        # ids of every row that entered the fit (DB2 rows plus the sampled rows).
        IDs = sampled_data.id.values

        # NOTE(review): the "(i+j)" tag is not unique per (seed, size) pair -- e.g. (0, 1)
        # and (1, 0) both give 1 -- so files are only told apart by the alpha value.
        # Kept as-is so existing file names do not change.
        with open(path+f'IDs/IDs_alpha_({i+j})_{alpha_R}.csv', mode='w', newline='') as csv_file:
            # create a CSV writer object with the '|' delimiter
            writer = csv.writer(csv_file, delimiter='|')
            # write the header row, then one id per row
            writer.writerow(['ids'])
            writer.writerows([id_] for id_ in IDs)

```python
# Write one row per (seed, sample size): the fitted alpha_R and the ids used in the fit.
with open(path+'alphas_with_ids.csv', mode='w', newline='') as csv_file:
    # create a CSV writer object with the '|' delimiter
    writer = csv.writer(csv_file, delimiter='|')
    # write the header row
    writer.writerow(['alpha', 'ids'])
    for i, s in enumerate(seeds):
        for j in range(total_num_of_sampling):
            amount = int(random_sampling.columns[j].split("_")[-1])
            R_sampled = R_dec.sample(n=amount, weights='decreasing_pts', random_state=s)
            # Getting diverse samples for alpha_R below 0.9.
            # Fit once and reuse both outputs (previously the OLS was fitted twice).
            sampled_data, sampled_fit = get_regression(R_sampled)[:2]
            # Index 5 = intercept (0) + position of 'RGEO' in `coeffs` (4) + 1.
            ALPHAs = sampled_fit.params[5]
            IDs = sampled_data.id.values
            # Store the ids as an explicit comma-separated string. Writing the array
            # itself stores its numpy repr, which is abbreviated with "..." once the
            # array exceeds numpy's print threshold (1000 elements by default), so
            # most ids were silently lost.
            writer.writerow([ALPHAs, ",".join(map(str, IDs))])
```

Diosito ya por favor :((((((((((((((

```python
df = pd.read_csv(path+"alphas_with_ids.csv", delimiter="|")
# Turn each `ids` cell into a list of id strings. Any run of characters that is not
# whitespace, a comma, a bracket or a single quote is one id, so both a plain
# comma-separated cell and a numpy-repr cell ("['a' 'b'\n 'c']") parse cleanly.
# The previous version turned every single whitespace character into a comma and
# therefore produced empty-string "ids" wherever two whitespace characters met.
# NOTE: a numpy-repr cell that was abbreviated with "..." cannot be recovered here.
df.ids = df.ids.str.findall(r"[^\s,\[\]']+")

# How many samples each id appears in (labels keep first-seen order).
id_counts = Counter(id_ for ids in df.ids for id_ in ids)
labels, values = zip(*id_counts.items())
indexes = np.arange(len(labels))
width = 1

plt.bar(indexes, values, width)
# Bars are centre-aligned on `indexes`, so the ticks go there too
# (the former half-width offset put every label on a bar edge).
plt.xticks(indexes, labels, rotation=90)
plt.xlabel("id")
plt.ylabel("number of samples containing the id")
plt.show()
```