# Frequency of IDs that Decrease $\alpha_R$

In [1]:
import csv
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

import matplotlib.patches as mpatches

from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression

from IPython.core.debugger import Pdb #Pdb().set_trace()
from collections import Counter


# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Regressor columns, in the order they enter the design matrix
# (an intercept column is prepended separately in `get_regression`).
coeffs = [
    "IP", "BT", "NEL", "PLTH",
    "RGEO", "KAREA", "EPS", "MEFF",
]
# Relative location of every CSV read by this notebook.
path = "../data/"

In [2]:
# Random-sampling results: one row per seed, one `subset_<size>` column per
# sample size, plus a `seed` column (see the table displayed further down).
random_sampling = pd.read_csv(f"{path}decreased_dataset_random_sampling_500_decreasing_points.csv")

In [3]:
# Summary statistics, transposed so that each row describes one column
# of `random_sampling`.
df = random_sampling.describe().T

# A subset size is kept from the first column whose minimum (over all seeds)
# drops below this value.
threshold = 2
columns_below_threshold = df.index[df["min"] < threshold]
if columns_below_threshold.empty:
    raise ValueError(f"No column of random_sampling has a minimum below {threshold}")
first_kept_column = columns_below_threshold[0]
# Column names look like "subset_<size>"; the trailing token is the size.
min_subset_size = int(first_kept_column.split("_")[-1])

# Locate the first kept column by label instead of assuming that its position
# equals `min_subset_size - 1`. This keeps the cell correct when it is re-run
# on the already-trimmed frame.
first_kept_position = random_sampling.columns.get_loc(first_kept_column)
amounts = random_sampling.columns[first_kept_position:]
# Largest value among the kept columns. "seed" is dropped explicitly rather
# than skipped by taking the second-largest column maximum.
max_alpha_R = round(
    random_sampling[amounts]
    .drop(columns=["seed"], errors="ignore")
    .max(numeric_only=True)
    .max(),
    4,
)

random_sampling = random_sampling[amounts]
print(f"After sampling size of {min_subset_size}, all alpha-R in ramdon sampling will be < {max_alpha_R}")

After sampling size of 36, all alpha-R in ramdon sampling will be < 2.121


In [4]:
DB2 = pd.read_csv(f"{path}DB2P8.csv")
DB5 = pd.read_csv(f"{path}SELDB5_SVD.csv", low_memory=False)

# Two shots from DB2P8 are missing in DB5: append them so DB5 contains all of DB2.
missing_shots = DB2.loc[~DB2["id"].isin(DB5["id"])].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

decreasing_ds = pd.read_csv(f"{path}decreasing_dataset_info.csv")
# Re-introduced dataset | what is new in DB5 that decreases αR:
# the DB5 rows whose id is listed in `decreasing_ds`.
R_dec = DB5.loc[DB5["id"].isin(decreasing_ds["id"])].reset_index(drop=True)

# Attach the per-id `decreased` and `weights` values from `decreasing_ds`.
id_to_decreased = dict(zip(decreasing_ds["id"], decreasing_ds["decreased"]))
id_to_weights = dict(zip(decreasing_ds["id"], decreasing_ds["weights"]))
R_dec["decreasing_pts"] = R_dec["id"].map(id_to_decreased)
R_dec["decreasing_weights"] = R_dec["id"].map(id_to_weights)

In [5]:
def get_regression(_R):
    """
    Fit an OLS log-log regression of TAUTH on DB2 extended with the rows of `_R`.

    ASSUMING DATA IS ***NOT*** GIVEN IN LOG-SCALE: the logarithm is taken here,
    of TAUTH and of the absolute value of every column listed in `coeffs`.

    Parameters
    ----------
    _R : pandas.DataFrame
        Rows appended below the module-level `DB2` frame before fitting.

    Returns
    -------
    tuple
        `(data, regression, (n_, p_))`: the concatenated frame, the fitted
        statsmodels results object, and the shape of the design matrix
        (rows, columns including the intercept).
    """
    data = pd.concat([DB2, _R], axis=0, ignore_index=True)

    # Response: log(TAUTH), kept two-dimensional (n, 1).
    Y_ = np.log(data[["TAUTH"]]).to_numpy()

    # Regressors: log|x| for every coefficient column, with a leading column
    # of ones acting as the intercept.
    log_regressors = np.log(data[coeffs].abs()).to_numpy()
    X_ = np.column_stack([np.ones(len(log_regressors)), log_regressors])

    n_, p_ = X_.shape
    regression = sm.OLS(Y_, X_).fit()
    return data, regression, (n_, p_)

# Getting regression of DB2P8 only.
# `get_regression` always appends its argument to DB2, so a zero-row slice of
# R_dec (same columns, no rows) gives the DB2-only fit. An explicit empty
# slice is used instead of filtering on a sentinel id that is assumed absent.
empty_R = R_dec.iloc[:0]
regression_DB2 = get_regression(empty_R)[1]
#regression_DB2.summary()

In [6]:
# Restrict the table to its first 20 columns plus the seed column.
kept_columns = [*random_sampling.columns[:20], "seed"]
random_sampling = random_sampling[kept_columns]

Unnamed: 0,subset_36,subset_37,subset_38,subset_39,subset_40,subset_41,subset_42,subset_43,subset_44,subset_45,subset_46,subset_47,subset_48,subset_49,subset_50,subset_51,subset_52,subset_53,subset_54,subset_55,seed
0,2.028032,2.025356,2.022786,2.013472,2.008839,2.005766,1.995925,1.991798,1.990475,1.984224,1.982490,1.975997,1.975889,1.973791,1.972872,1.968390,1.952411,1.951256,1.947754,1.946707,213
1,2.017517,2.015939,2.014711,2.005658,2.003267,1.999027,1.994245,1.986620,1.990786,1.990227,1.985166,1.978431,1.976444,1.965059,1.971155,1.968623,1.967685,1.965292,1.965124,1.953086,284
2,2.090829,2.088937,2.086592,2.081695,2.078387,2.073037,2.067668,2.061368,2.062682,2.059827,2.052184,2.050565,2.050083,2.048354,2.030702,2.029305,2.027858,2.026159,2.024165,2.019683,355
3,2.065293,2.054109,2.050571,2.044874,2.043964,2.042679,2.040852,2.040840,2.040281,2.039004,2.037557,2.036122,2.035300,2.032666,2.033159,2.027562,2.020492,2.019188,2.017046,2.012359,426
4,2.044564,2.041909,2.034898,2.029114,2.021986,2.016497,2.016938,2.012534,2.008716,2.003959,2.000021,1.999046,1.995700,1.991947,1.990510,1.987422,1.984696,1.983083,1.976380,1.964109,497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2.068017,2.060377,2.057548,2.051040,2.048085,2.042133,2.041262,2.035741,2.030439,2.027146,2.022091,2.020469,2.019316,2.018648,2.013816,2.010584,2.005482,2.002165,1.997429,1.992888,35358
496,2.049111,2.044571,2.043356,2.040397,2.037112,2.033909,2.031872,2.031171,2.019120,2.017292,2.014876,2.010789,2.009802,2.005812,2.001701,1.996424,1.995816,1.992973,1.987802,1.979778,35429
497,2.065951,2.063133,2.059942,2.047397,2.044678,2.037075,2.035657,2.033118,2.027810,2.016863,2.012884,2.012402,2.010263,2.008915,2.004864,1.998029,1.999239,1.993409,1.984331,1.981409,35500
498,2.042431,2.040236,2.021563,2.012045,2.008281,2.005584,2.003353,1.995537,1.992676,1.989637,1.985958,1.977652,1.973851,1.970594,1.969378,1.968328,1.965456,1.961027,1.958134,1.953599,35571


In [44]:
# Every column except the last one ("seed") corresponds to one sample size.
total_num_of_sampling = random_sampling.shape[1] - 1
seeds = random_sampling["seed"].to_numpy()

# Sub-folder of `path` that holds the per-sample ID files.
folder = "IDs/alpha_two/"

```python
folder = "IDs/alpha_two/"

# Index of the RGEO exponent (alpha_R) in the fitted parameter vector:
# `get_regression` puts the intercept first, followed by `coeffs` in order.
alpha_R_index = coeffs.index("RGEO") + 1

# Create the output folder up front so that open(..., mode='w') cannot fail
# on a missing directory.
os.makedirs(path + folder, exist_ok=True)

for i, s in enumerate(seeds):
    for j in range(total_num_of_sampling):
        # Column names look like "subset_<size>"; the trailing token is the sample size.
        amount = int(random_sampling.columns[j].split("_")[-1])
        R_sampled = R_dec.sample(n=amount, weights='decreasing_pts', random_state=s)
        # NOTE(review): the original comment here read "Getting diverse samples
        # for alpha_R below 0.9", which does not match the alpha_R values near 2
        # used elsewhere in this notebook -- confirm the intended target.
        alpha_R = get_regression(R_sampled)[1].params[alpha_R_index]
        IDs = R_sampled.id.values

        with open(path+folder+f'IDs_alpha_({i}_{j})_{alpha_R}.csv', mode='w', newline='') as csv_file:
            # create a CSV writer object with the '|' delimiter
            writer = csv.writer(csv_file, delimiter='|')
            # write the header row, then one sampled id per row
            writer.writerow(['ids'])
            writer.writerows([id_] for id_ in IDs)
```

```python
# Accumulate, over every saved ID file, how many rows mention each id.
# NOTE(review): `search_strings` (the ids to look for) and `string_counts`
# (presumably a Counter or dict keyed by id) are not defined in this snippet;
# they must exist before it is run -- confirm where they come from.
ids_dir = path + folder
for filename in os.listdir(ids_dir):
    # Only read the per-sample files written as "IDs_alpha_(i_j)_<alpha>.csv".
    # Anything else stored in this folder (e.g. the summary CSV that the
    # commented-out lines below would write here) has no 'ids' column.
    if not (filename.startswith("IDs_alpha_") and filename.endswith(".csv")):
        continue
    file_path = os.path.join(ids_dir, filename)
    # load the CSV file into a pandas DataFrame
    sampled_ids = pd.read_csv(file_path)
    # count the occurrences of each search string in the DataFrame
    for string in search_strings:
        # regex=False: ids are matched as literal text, not as regular expressions
        matches = sampled_ids['ids'].str.contains(string, na=False, regex=False)
        string_counts[string] += int(matches.sum())

#df_ = pd.DataFrame([string_counts.keys(), string_counts.values()]).T.rename(columns={0:"id", 1:"frequency"})
#df_.to_csv(path+folder+"id_vs_frequency_decreasing_ds_ones.csv", index=False)
```

In [50]:
# Table with one row per id and its `frequency` (columns used further below).
df_ = pd.read_csv(f"{path}id_vs_frequency_decreasing_ds.csv")

In [52]:
# DB2 rows whose id does not appear in the frequency table.
DB2.loc[~DB2["id"].isin(df_["id"])]

Unnamed: 0,ind,id,PHASE,TOK,IP,BT,NEL,PLTH,RGEO,KAREA,EPS,MEFF,TAUTH,DATE,SHOT,TIME,Q95,ZEFF,AMIN,VOL,POHM,PNBI,DWDIA,DWMHD,PICRH,PECRH,PL,PFLOSS,TAV,LCOULOMB,QCYL5,TAUBOHM,RHOSTAR,BETASTAR,NUSTAR,OMEGACYCL,IP_error,BT_error,NEL_error,PLTH_error,POHM_error,PNBI_error,DWDIA_error,DWMHD_error,PICRH_error,PECRH_error,PL_error,PFLOSS_error,RGEO_error,AMIN_error,EPS_error,VOL_error,KAREA_error,MEFF_error
0,12405,HDULEH,HGELM,ASDEX,0.2959,2.205,3.789,1.870,1.694,0.9748,0.242090,1.5,0.05100,19820622,5980,1.205,4.563,0.0,0.4101,5.482,68660.0,2599000.0,212000.00,212000.0,0.0,0.0,2456000.0,585700.0,1086.986716,15.406717,3.606992,0.074970,0.006430,0.601125,0.223030,1.470000,"(0.292941, 0.298859)","(2.18295, 2.22705)","(3.6753299999999998, 3.9026699999999996)","(1.8138999999999998, 1.9261)","(34330.0, 102990.0)","(2.3391e6, 2.8589e6)","(169600.0, 254400.0)","(169600.0, 254400.0)","(0.0, 0.0)","(0.0, 0.0)","(2.20383e6, 2.70749e6)","(409990.0, 761410.0)","(1.67706, 1.71094)","(0.40394850000000004, 0.4162515)","(0.23609740844214294, 0.24820310543451038)","(5.31754, 5.64646)","(0.9087294891707897, 1.0453119485987046)","(1.35, 1.65)"
1,12406,NAC6N1,HGELM,ASDEX,0.2952,2.205,3.734,2.024,1.684,0.9724,0.244240,1.5,0.04902,19820622,5980,1.224,4.671,0.0,0.4113,5.468,22960.0,2598000.0,0.00,0.0,0.0,0.0,2621000.0,596700.0,1150.578521,15.470883,3.647604,0.072059,0.006596,0.627056,0.195418,1.470000,"(0.29224799999999995, 0.298152)","(2.18295, 2.22705)","(3.62198, 3.8460199999999998)","(1.9632800000000001, 2.08472)","(11480.0, 34440.0)","(2.3382e6, 2.8578e6)","(0.0, 0.0)","(0.0, 0.0)","(0.0, 0.0)","(0.0, 0.0)","(2.34968e6, 2.89224e6)","(417690.0, 775710.0)","(1.66716, 1.70084)","(0.4051305, 0.4174695)","(0.2381943627854472, 0.2504075793565105)","(5.30396, 5.63204)","(0.9064785558505044, 1.042722698966958)","(1.35, 1.65)"
2,12411,U2T1C7,HSELM,ASDEX,0.2971,2.205,3.410,1.132,1.693,0.9895,0.238807,1.5,0.06375,19820622,5982,1.188,4.275,0.0,0.4043,5.405,0.0,2590000.0,815400.00,942200.0,0.0,0.0,1690000.0,558000.0,927.006370,15.300207,3.544311,0.093713,0.006023,0.461374,0.274718,1.470000,"(0.294129, 0.300071)","(2.18295, 2.22705)","(3.3076999999999996, 3.5122999999999998)","(1.09804, 1.1659599999999999)","(0.0, 0.0)","(2.331e6, 2.849e6)","(652320.0, 978480.0)","(753760.0, 1.13064e6)","(0.0, 0.0)","(0.0, 0.0)","(1.6110533333333333e6, 1.76908e6)","(390600.0, 725400.0)","(1.6760700000000002, 1.70993)","(0.3982355, 0.41036449999999997)","(0.23289579105577424, 0.2448373277965717)","(5.242850000000001, 5.56715)","(0.9224010580433655, 1.0610383605495637)","(1.35, 1.65)"
3,12412,422XQB,HGELM,ASDEX,0.2959,2.205,3.775,1.376,1.693,0.9744,0.242351,1.5,0.06991,19820622,5982,1.216,4.574,0.0,0.4103,5.482,31630.0,2599000.0,683900.00,662500.0,0.0,0.0,1961000.0,584900.0,1100.512996,15.420935,3.611224,0.102768,0.006467,0.606357,0.216753,1.470000,"(0.292941, 0.298859)","(2.18295, 2.22705)","(3.66175, 3.8882499999999998)","(1.33472, 1.4172799999999999)","(15815.0, 47445.0)","(2.3391e6, 2.8589e6)","(547120.0, 820680.0)","(530000.0, 795000.0)","(0.0, 0.0)","(0.0, 0.0)","(1.8192083333333333e6, 2.102785e6)","(409430.0, 760370.0)","(1.6760700000000002, 1.70993)","(0.4041455, 0.4164545)","(0.23635207289187277, 0.24847082759073305)","(5.31754, 5.64646)","(0.9083800217459707, 1.044909956059492)","(1.35, 1.65)"
4,12413,WZ9FED,HGELM,ASDEX,0.2942,2.204,3.847,2.033,1.685,0.9691,0.244926,1.5,0.05151,19820622,5982,1.244,4.758,0.0,0.4127,5.490,37030.0,2600000.0,0.00,0.0,0.0,0.0,2637000.0,604000.0,1173.621999,15.475806,3.668909,0.075685,0.006642,0.659569,0.193993,1.469333,"(0.29125799999999996, 0.29714199999999996)","(2.18196, 2.2260400000000002)","(3.73159, 3.9624099999999998)","(1.9720099999999998, 2.09399)","(18515.0, 55545.0)","(2.34e6, 2.86e6)","(0.0, 0.0)","(0.0, 0.0)","(0.0, 0.0)","(0.0, 0.0)","(2.358515e6, 2.915545e6)","(422800.0, 785200.0)","(1.66815, 1.70185)","(0.4065095, 0.4188905)","(0.23886329582513147, 0.25111081137787367)","(5.3253, 5.6547)","(0.9034248586052129, 1.03921002962372)","(1.35, 1.65)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,24899,T6MZDY,HGELM,PDX,0.3778,1.104,3.936,1.659,1.400,1.0150,0.285714,2.0,0.02940,19830514,161413,0.440,1.873,0.0,0.4000,4.487,103400.0,2989000.0,54.38,216300.0,0.0,0.0,2876000.0,1217000.0,653.759009,14.879259,1.669964,0.016229,0.011791,1.498193,0.184595,0.552000,"(0.37402199999999997, 0.381578)","(1.0929600000000002, 1.11504)","(3.7392, 4.1328)","(1.60923, 1.7087700000000001)","(82720.0, 124080.0)","(2.6901e6, 3.2879e6)","(48.942, 59.818000000000005)","(194670.0, 237930.0)","(0.0, 0.0)","(0.0, 0.0)","(2.57815e6, 3.17405e6)","(851900.0, 1.5821e6)","(1.3895, 1.4104999999999999)","(0.388, 0.41200000000000003)","(0.2750797589507267, 0.29650953580424616)","(4.26265, 4.71135)","(0.9019498926510963, 1.1410209678269654)","(1.5, 2.5)"
1306,24901,WNR026,HGELM,PDX,0.3545,1.053,4.412,1.710,1.400,1.0150,0.300000,2.0,0.02936,19830514,161423,0.470,2.145,0.0,0.4200,4.949,149500.0,2655000.0,22.54,122700.0,0.0,0.0,2682000.0,972000.0,544.285607,14.638913,1.871413,0.015458,0.010743,1.536876,0.305907,0.526500,"(0.35095499999999996, 0.358045)","(1.04247, 1.0635299999999999)","(4.1914, 4.6326)","(1.6587, 1.7612999999999999)","(119600.0, 179400.0)","(2.3895e6, 2.9205e6)","(20.285999999999998, 24.794)","(110430.0, 134970.0)","(0.0, 0.0)","(0.0, 0.0)","(2.39867e6, 2.96493e6)","(680400.0, 1.2636e6)","(1.3895, 1.4104999999999999)","(0.4074, 0.4326)","(0.288833746898263, 0.31133501259445845)","(4.70155, 5.19645)","(0.9023295857936335, 1.141501302533477)","(1.5, 2.5)"
1307,24903,XKWAHN,HGELM,PDX,0.2767,1.051,4.208,1.442,1.400,1.0130,0.300000,2.0,0.02646,19830514,161426,0.470,2.859,0.0,0.4200,4.940,183000.0,2246000.0,77.93,127500.0,0.0,0.0,2302000.0,860000.0,434.364275,14.436993,2.393047,0.013905,0.009615,1.174242,0.577729,0.525500,"(0.273933, 0.279467)","(1.04049, 1.06151)","(3.9976000000000003, 4.4184)","(1.3987399999999999, 1.48526)","(146400.0, 219600.0)","(2.0214e6, 2.4706e6)","(70.137, 85.72300000000001)","(114750.0, 140250.0)","(0.0, 0.0)","(0.0, 0.0)","(2.05305e6, 2.54995e6)","(602000.0, 1.118e6)","(1.3895, 1.4104999999999999)","(0.4074, 0.4326)","(0.288833746898263, 0.31133501259445845)","(4.6930000000000005, 5.187)","(0.9006886550455748, 1.1394254262508339)","(1.5, 2.5)"
1308,24906,IP549N,HSELM,PDX,0.2543,1.051,3.673,1.132,1.400,1.0130,0.300000,2.0,0.02908,19830514,161430,0.470,3.138,0.0,0.4200,4.938,141700.0,1771000.0,50.68,13030.0,0.0,0.0,1900000.0,767600.0,429.698565,14.494183,2.603838,0.015282,0.009563,1.013941,0.562898,0.525500,"(0.25175699999999995, 0.256843)","(1.04049, 1.06151)","(3.48935, 3.85665)","(1.09804, 1.1659599999999999)","(113360.0, 170040.0)","(1.5939e6, 1.9481e6)","(45.612, 55.748)","(11727.0, 14333.0)","(0.0, 0.0)","(0.0, 0.0)","(1.695533e6, 2.103807e6)","(537320.0, 997880.0)","(1.3895, 1.4104999999999999)","(0.4074, 0.4326)","(0.288833746898263, 0.31133501259445845)","(4.6911, 5.1849)","(0.9003240037682283, 1.1389641204102465)","(1.5, 2.5)"


In [48]:
# Clearly this is neither the lowest alpha_R nor the smallest subset,
# but it is the one from which the classification algorithm learns best.

# Alternative selections, kept for reference:
#_R = DB5[DB5.id.isin( df_[df_.frequency > 450].id )]
#_R = DB5[DB5.id.isin(df_[df_.frequency > 0].id)]
_R = DB5.loc[DB5["id"].isin(df_["id"])]
get_regression(_R)[1].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.968
Model:,OLS,Adj. R-squared:,0.968
Method:,Least Squares,F-statistic:,5097.0
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,0.0
Time:,15:24:13,Log-Likelihood:,492.91
No. Observations:,1355,AIC:,-967.8
Df Residuals:,1346,BIC:,-920.9
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.8747,0.048,-59.379,0.000,-2.970,-2.780
x1,0.8583,0.026,32.760,0.000,0.807,0.910
x2,0.2689,0.028,9.663,0.000,0.214,0.324
x3,0.3515,0.020,17.542,0.000,0.312,0.391
x4,-0.6442,0.013,-47.730,0.000,-0.671,-0.618
x5,1.9930,0.050,40.217,0.000,1.896,2.090
x6,0.3335,0.042,7.910,0.000,0.251,0.416
x7,0.4439,0.053,8.396,0.000,0.340,0.548
x8,0.2019,0.031,6.461,0.000,0.141,0.263

0,1,2,3
Omnibus:,49.806,Durbin-Watson:,0.811
Prob(Omnibus):,0.0,Jarque-Bera (JB):,94.533
Skew:,-0.258,Prob(JB):,2.9700000000000003e-21
Kurtosis:,4.187,Cond. No.,49.2


In [49]:
#df_[df_.frequency > 450].to_csv(path+folder+ "R_ids_alpha_1.9930.csv", index=False)