# Frequency of IDs that Decrease $\alpha_R$

In [1]:
import csv
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

import matplotlib.patches as mpatches

from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression

from IPython.core.debugger import Pdb #Pdb().set_trace()
from collections import Counter


# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Column names used as regressors in `get_regression`; they are log-transformed there.
coeffs = ['IP', 'BT', 'NEL', 'PLTH', 'RGEO', 'KAREA', 'EPS', 'MEFF']
# Prefix for the data files read and written below.
path = "../data/"

In [2]:
# Table of sampled alpha_R values (one column per subset size, plus the seed).
random_sampling = pd.read_csv(
    f"{path}decreased_dataset_random_sampling_500_decreasing_points.csv"
)

In [3]:
# Per-column summary statistics (one row per column of `random_sampling`).
df = random_sampling.describe().T

threshold = 2
# First column whose minimum sampled value drops below the threshold.
first_col_below = df[df['min'] < threshold].index[0]
min_subset_size = int(first_col_below.split("_")[-1])

# Slice by the column's actual position instead of assuming that "subset_k"
# sits at position k-1; this also keeps the cell idempotent when re-run on the
# already-sliced frame.
first_pos = random_sampling.columns.get_loc(first_col_below)
amounts = random_sampling.columns[first_pos:]

# Largest value over the retained subset columns. The "seed" column is
# excluded explicitly rather than skipped by taking the second-largest maximum.
subset_columns = [col for col in amounts if col != "seed"]
max_alpha_R = round(random_sampling[subset_columns].max().max(), 4)

random_sampling = random_sampling[amounts]
print(f"After sampling size of {min_subset_size}, all alpha-R in ramdon sampling will be < {max_alpha_R}")

After sampling size of 36, all alpha-R in ramdon sampling will be < 2.121


In [4]:
DB2P8 = pd.read_csv(f"{path}DB2P8.csv")
DB5 = pd.read_csv(f"{path}DB5.csv")
# DB2P8 has more columns than DB5; keep only the columns DB5 has.
DB2 = DB2P8[DB5.columns]

# Shots present in DB2P8 but missing from DB5 (two, per the original note)
# are appended so that DB5 also covers them.
missing_shots = DB2.loc[~DB2["id"].isin(DB5["id"].values)].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

decreasing_ds = pd.read_csv(f"{path}decreasing_dataset_info.csv")
# Re-introduced dataset: the DB5 rows whose id is listed in `decreasing_ds`
# (what is new in DB5 that decreases alpha_R).
R_dec = DB5.loc[DB5["id"].isin(decreasing_ds["id"])].reset_index(drop=True)

# Attach the per-id "decreased" and "weights" values from `decreasing_ds`.
R_dec = R_dec.assign(
    decreasing_pts=R_dec["id"].map(
        dict(zip(decreasing_ds["id"], decreasing_ds["decreased"]))
    ),
    decreasing_weights=R_dec["id"].map(
        dict(zip(decreasing_ds["id"], decreasing_ds["weights"]))
    ),
)

In [5]:
def get_regression(_R):
    """
    Fit an OLS regression of log(TAUTH) on the log of the `coeffs` columns,
    using the rows of DB2 stacked with the rows of `_R`.

    The input is expected in linear scale (NOT log scale); logarithms are
    taken inside this function.

    Returns a tuple (data, regression, (n, p)): the stacked DataFrame, the
    fitted statsmodels results object, and the shape of the design matrix
    (intercept column included).
    """
    data = pd.concat([DB2, _R], axis=0, ignore_index=True)

    # Response kept as a single-column 2-D array.
    response = data[["TAUTH"]].apply(np.log).to_numpy()

    # Regressors: log of absolute values, with a leading column of ones
    # acting as the intercept.
    log_regressors = data[coeffs].apply(np.abs).apply(np.log)
    log_regressors.insert(0, "intercept", 1.0)
    design = log_regressors.to_numpy()

    regression = sm.OLS(response, design).fit()
    return data, regression, design.shape

# Regression on DB2P8 only: pass a zero-row slice of R_dec so nothing is
# appended to DB2 inside `get_regression`. An explicit empty slice does not
# depend on which id values happen to exist in the data.
empty_R = R_dec.iloc[:0]
regression_DB2 = get_regression(empty_R)[1]
#regression_DB2.summary()

In [6]:
# Keep the first 20 columns together with the seed column.
random_sampling = random_sampling.loc[:, [*random_sampling.columns[:20], "seed"]]
random_sampling

Unnamed: 0,subset_36,subset_37,subset_38,subset_39,subset_40,subset_41,subset_42,subset_43,subset_44,subset_45,subset_46,subset_47,subset_48,subset_49,subset_50,subset_51,subset_52,subset_53,subset_54,subset_55,seed
0,2.028032,2.025356,2.022786,2.013472,2.008839,2.005766,1.995925,1.991798,1.990475,1.984224,1.982490,1.975997,1.975889,1.973791,1.972872,1.968390,1.952411,1.951256,1.947754,1.946707,213
1,2.017517,2.015939,2.014711,2.005658,2.003267,1.999027,1.994245,1.986620,1.990786,1.990227,1.985166,1.978431,1.976444,1.965059,1.971155,1.968623,1.967685,1.965292,1.965124,1.953086,284
2,2.090829,2.088937,2.086592,2.081695,2.078387,2.073037,2.067668,2.061368,2.062682,2.059827,2.052184,2.050565,2.050083,2.048354,2.030702,2.029305,2.027858,2.026159,2.024165,2.019683,355
3,2.065293,2.054109,2.050571,2.044874,2.043964,2.042679,2.040852,2.040840,2.040281,2.039004,2.037557,2.036122,2.035300,2.032666,2.033159,2.027562,2.020492,2.019188,2.017046,2.012359,426
4,2.044564,2.041909,2.034898,2.029114,2.021986,2.016497,2.016938,2.012534,2.008716,2.003959,2.000021,1.999046,1.995700,1.991947,1.990510,1.987422,1.984696,1.983083,1.976380,1.964109,497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2.068017,2.060377,2.057548,2.051040,2.048085,2.042133,2.041262,2.035741,2.030439,2.027146,2.022091,2.020469,2.019316,2.018648,2.013816,2.010584,2.005482,2.002165,1.997429,1.992888,35358
496,2.049111,2.044571,2.043356,2.040397,2.037112,2.033909,2.031872,2.031171,2.019120,2.017292,2.014876,2.010789,2.009802,2.005812,2.001701,1.996424,1.995816,1.992973,1.987802,1.979778,35429
497,2.065951,2.063133,2.059942,2.047397,2.044678,2.037075,2.035657,2.033118,2.027810,2.016863,2.012884,2.012402,2.010263,2.008915,2.004864,1.998029,1.999239,1.993409,1.984331,1.981409,35500
498,2.042431,2.040236,2.021563,2.012045,2.008281,2.005584,2.003353,1.995537,1.992676,1.989637,1.985958,1.977652,1.973851,1.970594,1.969378,1.968328,1.965456,1.961027,1.958134,1.953599,35571


In [7]:
# One seed per row of the sampling table.
seeds = random_sampling["seed"].values
# Every column except the last one (the seed) corresponds to one subset size.
total_num_of_sampling = random_sampling.shape[1] - 1

In [8]:
folder = "IDs/alpha_two/"
# Make sure the output directory exists so the cell also runs on a fresh checkout.
os.makedirs(path + folder, exist_ok=True)

# Position of the RGEO exponent (alpha_R) in the fitted parameter vector:
# index 0 is the intercept, followed by the entries of `coeffs` in order.
alpha_R_index = coeffs.index("RGEO") + 1

for i, s in enumerate(seeds):
    for j in range(total_num_of_sampling):
        # The subset size is encoded in the column name ("subset_<n>").
        amount = int(random_sampling.columns[j].split("_")[-1])
        # Weighted draw from R_dec, reproducible through the stored seed.
        R_sampled = R_dec.sample(n=amount, weights='decreasing_pts', random_state=s)
        alpha_R = get_regression(R_sampled)[1].params[alpha_R_index]
        IDs = R_sampled.id.values

        # One file per (seed index, subset index); the fitted alpha_R is part
        # of the file name.
        with open(path+folder+f'IDs_alpha_({i}_{j})_{alpha_R}.csv', mode='w', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter='|')
            # Header row followed by one id per row.
            writer.writerow(['ids'])
            writer.writerows([id_] for id_ in IDs)

```python
folder = "IDs/alpha_two/"

for i,s in enumerate(seeds):
    for j in range(total_num_of_sampling):
        amount = int(random_sampling.columns[j].split("_")[-1])
        R_sampled = R_dec.sample(n=amount, weights='decreasing_pts', random_state=s)
        # Getting diverse samples for alpha_R below 0.9
        alpha_R = get_regression( R_sampled )[1].params[5]
        IDs =  R_sampled.id.values

        with open(path+folder+f'IDs_alpha_({i}_{j})_{alpha_R}.csv', mode='w', newline='') as csv_file:
            # create a CSV writer object with the '|' delimiter
            writer = csv.writer(csv_file, delimiter='|')
            # write the header row
            writer.writerow(['ids'])

            for id_ in IDs:
                writer.writerow([id_])
```

In [9]:
# IDs whose occurrences are counted, each starting at zero.
search_strings = decreasing_ds["id"].values
string_counts = dict.fromkeys(search_strings, 0)

In [10]:
# Count how often each id appears across the sampled-ID files in the folder.
ids_dir = path + folder
id_counter = Counter()
for filename in os.listdir(ids_dir):
    # Only read the per-sample files. The same folder also receives the
    # id-vs-frequency summary CSV, which has no 'ids' column and would make a
    # re-run fail.
    if not (filename.startswith("IDs_alpha_") and filename.endswith(".csv")):
        continue
    file_path = os.path.join(ids_dir, filename)
    # Read ids as strings so they compare equal to the search strings.
    sampled_ids = pd.read_csv(file_path, dtype=str)
    # Exact-match tally in one pass per file, instead of one regex substring
    # scan per search string per file.
    id_counter.update(sampled_ids['ids'].dropna())

# Rebuild the counts from scratch so re-running this cell does not double count.
string_counts = {string: id_counter[str(string)] for string in search_strings}

In [11]:
# Build the table column-wise so each column gets its own dtype (string ids,
# integer frequencies) instead of the object dtype produced by transposing a
# two-row frame.
df_ = pd.DataFrame({
    "id": list(string_counts.keys()),
    "frequency": list(string_counts.values()),
})
df_.to_csv(path+folder+"id_vs_frequency_decreasing_ds_two.csv", index=False)

```Python
# loop through each file in the folder
for filename in os.listdir(path+folder):
    file_path = os.path.join(path+folder, filename)
    # load the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path)
    # count the occurrences of each search string in the DataFrame
    for string in search_strings:
        count = df[df['ids'].str.contains(string, na=False)].shape[0]
        string_counts[string] += count

#df_ = pd.DataFrame([string_counts.keys(), string_counts.values()]).T.rename(columns={0:"id", 1:"frequency"})
#df_.to_csv(path+folder+"id_vs_frequency_decreasing_ds_ones.csv", index=False)
```

In [13]:
# Reload the saved id-vs-frequency table.
df_ = pd.read_csv(f"{path}{folder}id_vs_frequency_decreasing_ds_two.csv")

In [14]:
# Distinct frequency values below 9000.
df_.loc[df_["frequency"] < 9000, "frequency"].unique()

array([289, 234, 231, 292, 314, 493, 343, 335, 339, 317, 148, 466, 329,
       351, 203, 344, 227, 288, 373, 401, 336, 330, 278, 341, 408, 319,
       356,   0, 299, 280, 321, 236, 380, 275, 417, 272, 326, 178, 323,
       353, 233, 359, 240, 128, 253, 352, 340, 310, 286, 298, 181, 209,
       237, 217, 443, 207, 220, 453, 242, 365, 327, 316, 235, 261, 333,
       267, 290, 229, 247, 251, 364, 271, 376, 303, 306, 302, 412, 442,
       252, 295, 239, 371, 218, 394, 249, 258, 384, 159, 381, 459, 248,
       273, 414, 338, 270, 345, 150, 483, 484, 287, 349, 435, 211, 283,
       254, 320, 418, 250, 350, 164, 308, 262, 195, 156, 141, 232, 468,
       451, 196, 188, 294, 172, 269, 226, 225, 214, 291, 410, 360, 361,
       311, 354, 374, 367, 346, 296, 391, 385, 415, 399, 246, 494, 263,
       213, 313, 212, 281, 210, 265, 304, 428, 427, 177, 200, 282, 322,
       325, 245, 382, 293, 301, 279, 219, 404, 244, 334, 366, 347, 198,
       439, 309, 154, 368, 307, 450, 331, 379, 348, 328, 312, 19

In [15]:
# IDs that were drawn at least once.
df_.loc[df_["frequency"] > 0]

Unnamed: 0,id,frequency
0,AEWB71,289
1,UAAXGA,234
2,53LSPJ,231
3,FLO8WI,292
4,TKCDLW,314
...,...,...
2533,G8OILS,421
2537,ST5FL8,286
2538,G8PHEQ,276
2539,K3O76W,335


In [16]:
# Highest frequency observed for any id.
max(df_.frequency.tolist())

539

In [22]:
# Smallest frequency among the ids drawn more than 400 times.
min(df_.loc[df_["frequency"] > 400, "frequency"].tolist())

401

In [26]:
# Clearly this is neither the lowest alpha_R nor the smallest subset,
# but it is the one the classification algorithm learns best from.

# Regression on DB2 plus the DB5 rows whose id was drawn more than 450 times.
frequent_ids = df_.loc[df_["frequency"] > 450, "id"]
_R = DB5.loc[DB5["id"].isin(frequent_ids)]
get_regression(_R)[1].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.968
Model:,OLS,Adj. R-squared:,0.968
Method:,Least Squares,F-statistic:,5097.0
Date:,"Wed, 29 Mar 2023",Prob (F-statistic):,0.0
Time:,19:48:23,Log-Likelihood:,492.91
No. Observations:,1355,AIC:,-967.8
Df Residuals:,1346,BIC:,-920.9
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.8747,0.048,-59.379,0.000,-2.970,-2.780
x1,0.8583,0.026,32.760,0.000,0.807,0.910
x2,0.2689,0.028,9.663,0.000,0.214,0.324
x3,0.3515,0.020,17.542,0.000,0.312,0.391
x4,-0.6442,0.013,-47.730,0.000,-0.671,-0.618
x5,1.9930,0.050,40.217,0.000,1.896,2.090
x6,0.3335,0.042,7.910,0.000,0.251,0.416
x7,0.4439,0.053,8.396,0.000,0.340,0.548
x8,0.2019,0.031,6.461,0.000,0.141,0.263

0,1,2,3
Omnibus:,49.806,Durbin-Watson:,0.811
Prob(Omnibus):,0.0,Jarque-Bera (JB):,94.533
Skew:,-0.258,Prob(JB):,2.9700000000000003e-21
Kurtosis:,4.187,Cond. No.,49.2


In [28]:
#df_[df_.frequency > 450].to_csv(path+folder+ "R_ids_alpha_1.9930.csv", index=False)