In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import math
from tqdm import tqdm
from multiprocessing import Process, Pool
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score



In [3]:
def convert_temporal_data(dataframe):
  '''
  Convert temporal variables from seconds to hours.

  This conversion is necessary. Otherwise, large predictor values (e.g., p^6)
  will be irrelevant for OLS.
  '''

  for column in ['p', 'r']:
    dataframe[column] = dataframe[column] / 3600

  return dataframe

def read_score_distribution(csv_file):
  columns = ['p', 'q', 'r', 'score']
  distribution = pd.read_csv(csv_file, names=columns,skiprows=1)
  distribution = convert_temporal_data(distribution)

  return distribution

In [4]:
def add_quadratic_predictors(dataframe):
  dataframe[['p2', 'q2', 'r2']] = dataframe[['p', 'q', 'r']]**2
  dataframe['pq'] = dataframe['p']*dataframe['q']

  return dataframe

def add_cubic_predictors(dataframe):
  dataframe[['p3', 'q3', 'r3']] = dataframe[['p', 'q', 'r']]**3
  dataframe['p2q'] = dataframe['p']**2 * dataframe['q']
  dataframe['pq2'] = dataframe['p'] * dataframe['q']**2

  return dataframe

def add_quartic_predictiors(dataframe):
  dataframe[['p4', 'q4', 'r4']] = dataframe[['p', 'q', 'r']]**4
  dataframe['p3q'] = dataframe['p']**3 * dataframe['q']
  dataframe['p2q2'] = dataframe['p']**2 * dataframe['q']**2
  dataframe['pq3'] = dataframe['p'] * dataframe['q']**3

  return dataframe

def add_quintic_predictors(dataframe):
  dataframe[['p5', 'q5', 'r5']] = dataframe[['p', 'q', 'r']]**5
  dataframe['p4q'] = dataframe['p']**4 * dataframe['q']
  dataframe['p3q2'] = dataframe['p']**3 * dataframe['q']**2
  dataframe['p2q3'] = dataframe['p']**2 * dataframe['q']**3
  dataframe['pq4'] = dataframe['p'] * dataframe['q']**4

  return dataframe

def add_sextic_predictors(dataframe):
  dataframe[['p6', 'q6', 'r6']] = dataframe[['p', 'q', 'r']]**6
  dataframe['p5q'] = dataframe['p']**5 * dataframe['q']
  dataframe['p4q2'] = dataframe['p']**4 * dataframe['q']**2
  dataframe['p3q3'] = dataframe['p']**3 * dataframe['q']**3
  dataframe['p2q4'] = dataframe['p']**2 * dataframe['q']**4
  dataframe['pq5'] = dataframe['p'] * dataframe['q']**5

  return dataframe
def only_att_predictors(dataframe):
    dataframe[['p2', 'q2', 'r2']] = dataframe[['p', 'q', 'r']]**2
    dataframe[['p3', 'q3', 'r3']] = dataframe[['p', 'q', 'r']]**3
    dataframe[['p4', 'q4', 'r4']] = dataframe[['p', 'q', 'r']]**4
    dataframe[['p5', 'q5', 'r5']] = dataframe[['p', 'q', 'r']]**5
    dataframe[['p6', 'q6', 'r6']] = dataframe[['p', 'q', 'r']]**6

    return dataframe
def only_wq_att_predictors(dataframe):
    dataframe[['p2', 'r2']] = dataframe[['p', 'r']]**2
    dataframe[['p3',  'r3']] = dataframe[['p', 'r']]**3
    dataframe[['p4',  'r4']] = dataframe[['p', 'r']]**4
    dataframe[['p5',  'r5']] = dataframe[['p', 'r']]**5
    dataframe[['p6',  'r6']] = dataframe[['p', 'r']]**6

    return dataframe
def only_lin_predictors(dataframe):
    dataframe['pq'] = dataframe['p']*dataframe['q']
    dataframe['p2q'] = dataframe['p']**2 * dataframe['q']
    dataframe['pq2'] = dataframe['p'] * dataframe['q']**2
    dataframe['p3q'] = dataframe['p']**3 * dataframe['q']
    dataframe['p2q2'] = dataframe['p']**2 * dataframe['q']**2
    dataframe['pq3'] = dataframe['p'] * dataframe['q']**3
    dataframe['p4q'] = dataframe['p']**4 * dataframe['q']
    dataframe['p3q2'] = dataframe['p']**3 * dataframe['q']**2
    dataframe['p2q3'] = dataframe['p']**2 * dataframe['q']**3
    dataframe['pq4'] = dataframe['p'] * dataframe['q']**4

    return dataframe
def only_lin_predictors_pr(dataframe):
    dataframe['pr'] = dataframe['p']*dataframe['r']
    dataframe['p2r'] = dataframe['p']**2 * dataframe['r']
    dataframe['pr2'] = dataframe['p'] * dataframe['r']**2
    dataframe['p3r'] = dataframe['p']**3 * dataframe['r']
    dataframe['p2r2'] = dataframe['p']**2 * dataframe['r']**2
    dataframe['p4r'] = dataframe['p']**4 * dataframe['r']
    dataframe['p3r2'] = dataframe['p']**3 * dataframe['r']**2
    dataframe['p2r3'] = dataframe['p']**2 * dataframe['r']**3
    dataframe['pr4'] = dataframe['p'] * dataframe['r']**4

    return dataframe

In [5]:
def create_quadratic_polynomial(dataframe):
  dataframe = add_quadratic_predictors(dataframe)
  return dataframe

def create_cubic_polynomial(dataframe):
  dataframe = add_cubic_predictors(
              add_quadratic_predictors(dataframe))
  return dataframe

def create_quartic_polynomial(dataframe):
  dataframe = add_quartic_predictiors(
              add_cubic_predictors(
              add_quadratic_predictors(dataframe)))
  return dataframe

def create_quintic_polynomial(dataframe):
  dataframe = add_quintic_predictors(
              add_quartic_predictiors(
              add_cubic_predictors(
              add_quadratic_predictors(dataframe))))
  return dataframe

def create_sextic_polynomial(dataframe):
  dataframe = add_sextic_predictors(
              add_quintic_predictors(
              add_quartic_predictiors(
              add_cubic_predictors(
              add_quadratic_predictors(dataframe)))))
  return dataframe

In [6]:
variables = ['p', 'q', 'r']
degre_max = 4

# Générer les combinaisons
possibilites = []
possibilites.append("sqrt(p)")
possibilites.append("sqrt(q)")
possibilites.append("sqrt(r)")
for degre in range(1, degre_max + 1):
    for variable in variables:
        possibilites.append(variable + str(degre))

for degre in range(1, degre_max + 1):
    for i in range(len(variables)):
        for j in range(i + 1, len(variables)):
            possibilites.append(variables[i] + str(degre) + variables[j])
            if degre!=1:
                possibilites.append(variables[j] + str(degre) + variables[i])



# Afficher les combinaisons
print (possibilites)

['sqrt(p)', 'sqrt(q)', 'sqrt(r)', 'p1', 'q1', 'r1', 'p2', 'q2', 'r2', 'p3', 'q3', 'r3', 'p4', 'q4', 'r4', 'p1q', 'p1r', 'q1r', 'p2q', 'q2p', 'p2r', 'r2p', 'q2r', 'r2q', 'p3q', 'q3p', 'p3r', 'r3p', 'q3r', 'r3q', 'p4q', 'q4p', 'p4r', 'r4p', 'q4r', 'r4q']


In [7]:
import copy
import random

history=[]
names=[]
def grid_search (dataframe, n,threshold):
    best_vif=[100000000000000]*20
    best_set=[[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
    comb=list(itertools.combinations(possibilites,n))
    if n>3:
        comb=random.sample(comb, 5000)
    for var_set in tqdm(comb, desc="Progression :" ):
        df_copy = pd.DataFrame()
        
        # Appliquer les transformations pour chaque variable dans l'ensemble
        for var in var_set:
            
            if var.startswith('sqrt'):
                
                root_var = var[-2]  # Récupérer la variable ('p', 'q' ou 'r') à partir de la variable sqrt
                df_copy[var] = dataframe[root_var].apply(math.sqrt)
                
            elif var.startswith(('p', 'q', 'r'))and len(var)==2:
                power = int(var[1:]) if len(var) > 1 else 1  # Récupérer l'exposant du polynôme
                base_var = var[0]  # Récupérer la variable ('p', 'q' ou 'r') à partir de la variable polynomiale
                df_copy[var] = dataframe[base_var] ** power
            elif var in [ 'p1q', 'p1r', 'q1r']:
                var_1, var_2 = var[0], var[2]  # Récupérer les deux variables ('p', 'q' ou 'r') de la combinaison
                df_copy[var] = dataframe[var_1]  * dataframe[var_2]
            elif var in ['p2q', 'q2p', 'p2r', 'r2p', 'q2r', 'r2q']:
                var_1, var_2 = var[0], var[2]  # Récupérer les deux variables ('p', 'q' ou 'r') de la combinaison
                df_copy[var] = (dataframe[var_1] **2) * dataframe[var_2]
            elif var in ['p3q', 'q3p', 'p3r', 'r3p', 'q3r', 'r3q']:
                var_1, var_2 = var[0], var[2]  # Récupérer les trois variables ('p', 'q' ou 'r') de la combinaison
                df_copy[var] = (dataframe[var_1]**3) * dataframe[var_2] 
            elif var in ['p4q', 'q4p', 'p4r', 'r4p', 'q4r', 'r4q']:
                var_1, var_2 = var[0], var[2] # Récupérer les quatre variables ('p', 'q' ou 'r') de la combinaison
                df_copy[var] = (dataframe[var_1]**4) * dataframe[var_2]
        res=compute_vif(df_copy)
        max=np.max(res['VIF'])
        history.append(max)
        if n==1 : 
            names.append(var_set[-1])
        else : 
            res= var_set[0]
            for var in var_set:
                if var != var_set[0]:
                    res += "-"+var
            names.append(res)
        b=False
        idx=0
        for best in best_vif:
            if max<best and max>=threshold:
                b=True
                break
            idx+=1
        if b==True:
            tmp_vif=best_vif.copy()
            tmp_set=best_set.copy()
            for i in range (idx+1,5):
                
                tmp_vif[i]=best_vif[i-1]
                tmp_set[i]=best_set[i-1]
            tmp_vif[idx]=max
            tmp_set[idx]=var_set
            best_vif=tmp_vif
            best_set=tmp_set
        
        
            
    with open("grid.csv", "a+") as res:
        res.write(str(n))
        res.write(',')
        res.write(str(best_vif))
        res.write(',')
        res.write(str(best_set))
        res.write('\n')
    print(best_vif)
    print(best_set)


In [8]:
def compute_vif(features):
    vif = pd.DataFrame()
    vif['VIF'] = [variance_inflation_factor(features.values, i) for i in range(features.shape[1])]
    vif['feature'] = features.columns
    return vif

In [9]:
csv_file = "data/global_training_data_GA.csv"
raw_dist = read_score_distribution(csv_file)

In [10]:
features_label = ['p', 'q', 'r']
target_label = ["score"]
features = raw_dist[features_label]
target = raw_dist[target_label]
features

Unnamed: 0,p,q,r
0,0.009444,2,4.613333
1,0.002222,32,4.627500
2,0.049167,16,4.908333
3,0.014444,1,4.928889
4,2.546944,128,4.931667
...,...,...,...
81339,3.888611,32,19.506111
81340,0.280833,64,19.533889
81341,0.001111,1,83.753056
81342,0.018889,2,83.766111


In [11]:
df_tmp=pd.DataFrame()
df_tmp['pq']=features['p']*features['q']
df_tmp['qp']=features['q']*features['p']
print(df_tmp)
compute_vif(df_tmp)

               pq          qp
0        0.018889    0.018889
1        0.071111    0.071111
2        0.786667    0.786667
3        0.014444    0.014444
4      326.008889  326.008889
...           ...         ...
81339  124.435556  124.435556
81340   17.973333   17.973333
81341    0.001111    0.001111
81342    0.037778    0.037778
81343    0.179444    0.179444

[81344 rows x 2 columns]


  vif = 1. / (1. - r_squared_i)


Unnamed: 0,VIF,feature
0,inf,pq
1,inf,qp


In [12]:
compute_vif(features[['p','q','r']])

Unnamed: 0,VIF,feature
0,1.272781,p
1,1.252365,q
2,1.151622,r


In [13]:
print (target['score'])

0        0.34375
1        0.12500
2        0.43750
3        0.28125
4        0.78125
          ...   
81339    0.87500
81340    0.75000
81341    0.93750
81342    0.96875
81343    1.00000
Name: score, Length: 81344, dtype: float64


In [14]:
def grid_evaluation (dataframe,y,comb):
    hist=[]
    for var_set in comb :
        print(f'_______________________ {var_set} _______________________ ')
        df_copy = pd.DataFrame()
        for var in var_set:
                
                if var.startswith('sqrt'):
                    
                    root_var = var[-2]  # Récupérer la variable ('p', 'q' ou 'r') à partir de la variable sqrt
                    df_copy[var] = dataframe[root_var].apply(math.sqrt)
                    
                elif var.startswith(('p', 'q', 'r'))and len(var)==2:
                    power = int(var[1:]) if len(var) > 1 else 1  # Récupérer l'exposant du polynôme
                    base_var = var[0]  # Récupérer la variable ('p', 'q' ou 'r') à partir de la variable polynomiale
                    df_copy[var] = dataframe[base_var] ** power
                elif var in ['p1q', 'p1r', 'q1r']:
                    var_1, var_2 = var[0], var[2]  # Récupérer les deux variables ('p', 'q' ou 'r') de la combinaison
                    df_copy[var] = dataframe[var_1]  * dataframe[var_2]
                elif var in ['p2q', 'q2p', 'p2r', 'r2p', 'q2r', 'r2q']:
                    var_1, var_2 = var[0], var[2]  # Récupérer les deux variables ('p', 'q' ou 'r') de la combinaison
                    df_copy[var] = (dataframe[var_1] **2) * dataframe[var_2]
                elif var in ['p3q', 'q3p', 'p3r', 'r3p', 'q3r', 'r3q']:
                    var_1, var_2 = var[0], var[2]  # Récupérer les trois variables ('p', 'q' ou 'r') de la combinaison
                    df_copy[var] = (dataframe[var_1]**3) * dataframe[var_2] 
                elif var in ['p4q', 'q4p', 'p4r', 'r4p', 'q4r', 'r4q']:
                    var_1, var_2 = var[0], var[2] # Récupérer les quatre variables ('p', 'q' ou 'r') de la combinaison
                    df_copy[var] = (dataframe[var_1]**4) * dataframe[var_2]
        X_train, X_test, y_train, y_test = train_test_split(df_copy,y, test_size=0.3, random_state=42)
        model=LinearRegression().fit(X_train,y_train)
        scores = cross_val_score(model, X_train, y_train, cv=5)
        print('############ TRAINING MAE SCORE ################')

        print(mean_absolute_error(model.predict(X_train),y_train))
        print('############ TESTING MAE SCORE ################')
        mae_score=mean_absolute_error(model.predict(X_test),y_test)
        print(mean_absolute_error(model.predict(X_test),y_test))
        hist.append(mae_score)
    return hist
        


In [15]:

data=[['p1','q1','r1'],['r3','q2p','p3r'],['sqrt(p)','p1q','q1r'],['sqrt(p)','sqrt(r)','p1'],['q2r','p3q','p4q'],
      ['sqrt(p)', 'sqrt(r)', 'r1', 'q4'],['q1','q1r','q4p','q4r'],['sqrt(p)','q3','p1q','q3p'],['sqrt(p)','sqrt(r)','q1r','r2q'],['r2p','r2q','q4r','r4q']]
grid_evaluation(features,target,data)

_______________________ ['p1', 'q1', 'r1'] _______________________ 
############ TRAINING MAE SCORE ################
0.2164113967364472
############ TESTING MAE SCORE ################
0.21717980982772267
_______________________ ['r3', 'q2p', 'p3r'] _______________________ 
############ TRAINING MAE SCORE ################
0.24021855967960545
############ TESTING MAE SCORE ################
0.2409854268393933
_______________________ ['sqrt(p)', 'p1q', 'q1r'] _______________________ 
############ TRAINING MAE SCORE ################
0.2169714497051536
############ TESTING MAE SCORE ################
0.21693984120016557
_______________________ ['sqrt(p)', 'sqrt(r)', 'p1'] _______________________ 
############ TRAINING MAE SCORE ################
0.21075842925607471
############ TESTING MAE SCORE ################
0.21089234508925758
_______________________ ['q2r', 'p3q', 'p4q'] _______________________ 
############ TRAINING MAE SCORE ################
0.2439476858189707
############ TESTING MAE 

[0.21717980982772267,
 0.2409854268393933,
 0.21693984120016557,
 0.21089234508925758,
 0.243989158857399,
 0.20902869180559244,
 0.23105163006866702,
 0.21469804751448088,
 0.21222638918450404,
 0.24586482431455228]

In [16]:
df=[]

# Lire le fichier CSV
df.append(pd.read_csv('CTC-SP2_ACTUAL_22_10.csv'))
df.append(pd.read_csv('CTC-SP2_ESTIMATED_22_10.csv'))
df.append(pd.read_csv('SDSC-BLUE_ACTUAL_64_10.csv'))
df.append(pd.read_csv('SDSC-BLUE_ESTIMATED_64_10.csv'))
df.append(pd.read_csv('LUBLIN 256_ACTUAL_50_10.csv'))
df.append(pd.read_csv('LUBLIN 256_ESTIMATED_50_10.csv'))
title= ['CTC-SP2_ACTUAL_22','CTC-SP2_ESTIMATED_22','SDSC-BLUE_ACTUAL_64','SDSC-BLUE_ESTIMATED_64','LUBLIN 256_ACTUAL_50','LUBLIN 256_ESTIMATED_50']
names=["LIN","S3_V1_D3","S3_V2_D1","S3_V10_D1","S3_V10_D4","S4_V7_D1","S4_V4_D3","S4_V9_D2","S4_V3_D1","S4_V9_D4"]
# Créer une figure avec des sous-graphiques (2 lignes, 3 colonnes)
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(30, 20))
i=0
# Boucle sur chaque subplot
for ax in axes.flatten():
    ax.boxplot(df[i],labels=names)
    ax.set_title(title[i])
    ax.set_xlabel("Policies")
    ax.set_ylabel("AVGBSLD")
    i=i+1

# Ajuster les espaces entre les subplots
plt.tight_layout()

# Sauvegarder la figure dans les deux formats
#plt.savefig("../../images/Final_Tester_analysis_SER.pdf")
#plt.savefig("../../images/Final_Tester_analysis_SER.png")
# Afficher la figure
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'CTC-SP2_ACTUAL_22_10.csv'

In [13]:
import seaborn as sns
history=[]
names=[]
print("######################1#######################")
grid_search(features,4,1 )




######################1#######################


Progression :: 100%|██████████| 5000/5000 [05:21<00:00, 15.55it/s]

[1.0287870543011561, 1.0312455020718834, 1.031980693477386, 1.0321204244934132, 1.035751963309563, 1.0389485000788778, 1.051179673166523, 1.0536546065645966, 1.057073777164388, 1.0637667931174641, 1.0660656954563514, 1.0676151611546802, 1.074217990662081, 1.076908973557082, 1.090865460512765, 1.092145141306637, 1.0926148636639472, 1.123081305180854, 1.1233503155274416, 1.1385330621673222]
[('sqrt(r)', 'q4', 'p3r', 'r4q'), ('sqrt(r)', 'q3p', 'p4q', 'r4p'), ('sqrt(r)', 'q3p', 'p3r', 'r4p'), ('sqrt(r)', 'q4', 'p4q', 'r4p'), ('sqrt(r)', 'q2p', 'p3r', 'r4q'), ('sqrt(r)', 'q3', 'p4q', 'r4p'), ('sqrt(r)', 'q2p', 'p4q', 'r4q'), ('sqrt(p)', 'r3', 'p4r', 'q4r'), ('sqrt(r)', 'r3p', 'p4r', 'q4r'), ('sqrt(r)', 'p3q', 'q3r', 'r4p'), ('sqrt(p)', 'q3r', 'p4q', 'r4p'), ('sqrt(p)', 'r4', 'p3q', 'q3r'), ('sqrt(r)', 'p1q', 'r3q', 'p4r'), ('sqrt(r)', 'r4', 'q2r', 'p4r'), ('r1', 'p2q', 'p3r', 'r4p'), ('sqrt(r)', 'p2', 'q4', 'r4'), ('sqrt(p)', 'r3', 'q2r', 'p4r'), ('sqrt(r)', 'q3', 'r3', 'p3q'), ('r1', 'q2p'




In [14]:
history=[]
names=[]
print("######################2#######################")
grid_search(features,4,2 )


######################2#######################


Progression :: 100%|██████████| 5000/5000 [05:37<00:00, 14.80it/s]

[2.000059410727451, 2.000071618704905, 2.0013299291605047, 2.0014162432075886, 2.002515132154139, 2.006002127524046, 2.006423156141382, 2.0067932953780296, 2.0114034450178306, 2.012187650708728, 2.01287766250069, 2.012887781225941, 2.0232693479567243, 2.027048089294582, 2.029735969870951, 2.044632207518928, 2.075875261345582, 2.0760610540036697, 2.0869802959265313, 2.0901683300000538]
[('p1q', 'q1r', 'p3q', 'r4q'), ('p2', 'p1r', 'r2q', 'r3p'), ('p2q', 'p2r', 'r2p', 'q4p'), ('q1', 'r1', 'r2q', 'q4p'), ('sqrt(q)', 'p2', 'p1q', 'q1r'), ('q1', 'q3p', 'p4q', 'r4q'), ('p2', 'p2r', 'r2q', 'r4p'), ('p2', 'p2r', 'q4p', 'r4p'), ('sqrt(p)', 'q2', 'p3', 'p3r'), ('sqrt(q)', 'p1q', 'q1r', 'r2p'), ('r4', 'r3q', 'p4q', 'q4r'), ('r4', 'p2r', 'r3q', 'q4r'), ('p1', 'p3', 'p1r', 'r3q'), ('p4', 'r4', 'p4r', 'r4q'), ('p2', 'p2q', 'p3r', 'r4q'), ('q1', 'q2r', 'p3q', 'q4p'), ('sqrt(q)', 'r1', 'r3', 'r2p'), ('q1', 'r3p', 'q3r', 'r4q'), ('p3', 'p2q', 'p3r', 'q4r'), ('q1', 'p1r', 'q3p', 'p3r')]





In [15]:
history=[]
names=[]
print("######################3#######################")
grid_search(features,4,3 )

######################3#######################


Progression ::   0%|          | 0/5000 [00:00<?, ?it/s]

Progression :: 100%|██████████| 5000/5000 [05:58<00:00, 13.93it/s]

[3.003344562357446, 3.007621918971203, 3.007902292118798, 3.0084567214643867, 3.0085601555647004, 3.0190801273276504, 3.0267470963511873, 3.0279010555120562, 3.028379812973821, 3.0367562935729633, 3.0437686230512853, 3.045908482666762, 3.0802682517342768, 3.1168958838698813, 3.139336953730485, 3.140687966522219, 3.155259791346255, 3.160052853924712, 3.177995398850287, 3.25845313641869]
[('sqrt(p)', 'sqrt(r)', 'q1r', 'r2q'), ('sqrt(p)', 'q3', 'r3', 'p1q'), ('sqrt(p)', 'q3', 'p1q', 'q3r'), ('sqrt(p)', 'sqrt(q)', 'q2', 'q1r'), ('sqrt(q)', 'r1', 'p1r', 'p2r'), ('q2', 'p1q', 'p2r', 'r4p'), ('q1', 'p3', 'p1q', 'p4r'), ('q1', 'p3', 'q1r', 'q2p'), ('p3', 'q3', 'q2r', 'r3q'), ('r4', 'q1r', 'p2q', 'q4r'), ('q2', 'p2r', 'r3p', 'q4p'), ('q1', 'p1q', 'p2r', 'r3q'), ('q2p', 'p2r', 'r2q', 'q3r'), ('q2', 'q2r', 'r3q', 'p4r'), ('q1', 'p3', 'p1q', 'q4r'), ('q1', 'r1', 'p1q', 'r2q'), ('p3', 'q3', 'q1r', 'q4r'), ('r2', 'p1q', 'r3p', 'q4p'), ('r1', 'r2', 'q1r', 'r3q'), ('sqrt(q)', 'p2q', 'q2p', 'q3r')]





In [16]:
history=[]
names=[]
print("######################4#######################")
grid_search(features,4,4 )

######################4#######################


Progression :: 100%|██████████| 5000/5000 [06:33<00:00, 12.71it/s]

[4.002218561560183, 4.003369550936127, 4.006050936423689, 4.009117478535489, 4.016728354196551, 4.033750845718566, 4.0714642971831285, 4.08150538220368, 4.098770644493258, 4.101001329414588, 4.1139311892720904, 4.15310954744645, 4.164818978616155, 4.190444703163091, 4.21520426360896, 4.255175019014491, 4.268644235728073, 4.343412991472145, 4.394086524202578, 4.41223967664902]
[('q2', 'r2', 'p1q', 'q4p'), ('p2r', 'q2r', 'p4r', 'r4p'), ('q2', 'q2r', 'q3p', 'p3r'), ('q2', 'p2r', 'q2r', 'q3p'), ('q1', 'p1q', 'p3q', 'q4r'), ('sqrt(r)', 'q4', 'r2q', 'q3r'), ('q1r', 'r2p', 'p3q', 'r4p'), ('q1', 'q1r', 'q4p', 'q4r'), ('p1', 'q1', 'q3', 'q4r'), ('q3', 'p1r', 'p3q', 'q4p'), ('q2', 'q1r', 'q3p', 'p3r'), ('p2', 'p4', 'p4q', 'r4p'), ('r2', 'p1q', 'r2q', 'q3p'), ('q3', 'q1r', 'r2q', 'q4r'), ('r2', 'r4', 'r4p', 'q4r'), ('r2', 'p4', 'p1q', 'q3p'), ('p2', 'p4', 'p3r', 'r3p'), ('q3', 'r4', 'q3r', 'q4p'), ('q2', 'q2r', 'q3p', 'r3q'), ('p2', 'p4', 'p2r', 'r4q')]





In [17]:
print("######################5#######################")
grid_search(features,4,5 )

######################5#######################


Progression ::   0%|          | 0/5000 [00:00<?, ?it/s]

Progression :: 100%|██████████| 5000/5000 [06:24<00:00, 13.01it/s]

[5.002237612567747, 5.005769872036266, 5.014783122081443, 5.02607346171365, 5.027655468723662, 5.036020584907134, 5.043809961001894, 5.051758785544901, 5.141178907629399, 5.149305172655287, 5.184231614809574, 5.2433066357091525, 5.34387552306147, 5.466751174837127, 5.537472440637525, 5.54524923883312, 5.553962356505731, 5.600843607904874, 5.612642948785637, 5.683749634993166]
[('sqrt(q)', 'q1', 'p3', 'p3r'), ('sqrt(q)', 'q1', 'r4', 'p3r'), ('sqrt(p)', 'q1', 'q3', 'q1r'), ('sqrt(q)', 'q1', 'p3', 'r2p'), ('p1q', 'p1r', 'q3p', 'r3p'), ('sqrt(q)', 'q1', 'r2q', 'p4r'), ('q1', 'p2q', 'p2r', 'p3q'), ('p2', 'p4', 'p1r', 'r2q'), ('sqrt(q)', 'q2', 'p2q', 'p3q'), ('sqrt(q)', 'sqrt(r)', 'q1r', 'q3r'), ('sqrt(r)', 'r2', 'r4', 'r2q'), ('sqrt(q)', 'q1', 'p1r', 'p4r'), ('q2', 'p2q', 'r2p', 'q3p'), ('sqrt(q)', 'p1r', 'q1r', 'q3r'), ('p2q', 'p2r', 'r2p', 'r4p'), ('p2r', 'r2p', 'r3q', 'r4p'), ('sqrt(q)', 'q1', 'p2q', 'p4r'), ('q3', 'p2q', 'r3p', 'q4p'), ('sqrt(q)', 'q1', 'p3', 'q1r'), ('p1', 'p1r', 'r2p'




In [18]:
print("######################6#######################")
grid_search(features,4,6 )

######################6#######################


Progression :: 100%|██████████| 5000/5000 [05:30<00:00, 15.12it/s]

[6.031701941921291, 6.0331502146742535, 6.0570764125327585, 6.077922161028182, 6.107292396649223, 6.148434907067311, 6.173450906787373, 6.28435497983934, 6.348434225931716, 6.361620863269356, 6.376649509239091, 6.766794211324725, 6.793561040262742, 6.8153612943206365, 6.989431878416813, 7.154547133391449, 7.2077741761537135, 7.213174209732539, 7.42278641582371, 7.4521305134045255]
[('sqrt(q)', 'r2q', 'p4r', 'r4q'), ('sqrt(q)', 'p1', 'r2q', 'r4q'), ('sqrt(r)', 'r1', 'p1q', 'r2q'), ('sqrt(r)', 'p1', 'r1', 'q3'), ('q1', 'p1q', 'q3p', 'r4p'), ('sqrt(r)', 'r1', 'r3p', 'q4r'), ('p1r', 'r2q', 'r3p', 'r4q'), ('sqrt(q)', 'q1', 'q1r', 'r2q'), ('p1', 'p2q', 'q2p', 'p4q'), ('p1', 'p1q', 'p1r', 'q3p'), ('p1', 'p1r', 'r2p', 'p3r'), ('q4', 'p2r', 'p3r', 'r3q'), ('q1', 'q2', 'r4', 'q1r'), ('q1', 'p2r', 'p3r', 'q4p'), ('q1', 'q3', 'p4q', 'q4p'), ('q1r', 'r2p', 'q2r', 'r4q'), ('q1', 'p3', 'q1r', 'q2r'), ('p1', 'q1', 'q2', 'p4r'), ('q1', 'q2', 'r3', 'q4r'), ('sqrt(p)', 'p1', 'r2p', 'q4p')]





In [19]:
print("######################7#######################")
grid_search(features,4,7 )

######################7#######################


Progression :: 100%|██████████| 5000/5000 [05:43<00:00, 14.54it/s]

[7.026775777692158, 7.04161864285452, 7.048631895035768, 7.0672517543885425, 7.0676317672702975, 7.145511531251649, 7.156450856019236, 7.19374271824105, 7.237984647651886, 7.272132632859394, 7.337942927712147, 7.409376466950774, 7.426283343712163, 7.556677784135376, 7.556749038649533, 7.575737267331288, 7.577266478324633, 7.66930449600513, 7.890647544180766, 7.904312672721984]
[('q1', 'p2r', 'r2q', 'p3r'), ('r4', 'p2r', 'q3p', 'p3r'), ('sqrt(p)', 'sqrt(r)', 'r1', 'q4'), ('sqrt(p)', 'q1r', 'p2q', 'q2r'), ('sqrt(q)', 'p2r', 'p3r', 'r3q'), ('q1', 'q2', 'q3r', 'p4r'), ('q1', 'q4', 'p2q', 'q4p'), ('q1r', 'q2p', 'p2r', 'p3r'), ('r2', 'p2r', 'p3r', 'r4q'), ('p2', 'q2', 'p2r', 'p3r'), ('q1', 'r1', 'q2', 'r4'), ('p2', 'p3', 'q4', 'r3p'), ('p2', 'p3', 'q1r', 'r4p'), ('p2', 'q2', 'r2', 'p3'), ('q4', 'q1r', 'q2p', 'q2r'), ('sqrt(p)', 'p1', 'r2', 'q2p'), ('p1r', 'q1r', 'q2p', 'q2r'), ('p2', 'p3', 'r2q', 'q3p'), ('r2', 'r3', 'p3q', 'q3r'), ('r2', 'p3', 'r3', 'q2p')]





In [20]:
print("######################8#######################")
grid_search(features,4,8 )

######################8#######################


Progression :: 100%|██████████| 5000/5000 [05:30<00:00, 15.13it/s]

[8.002856869713938, 8.006638171407525, 8.008675707154966, 8.010176503600068, 8.01233197803621, 8.020608145616507, 8.040110243689492, 8.064086591307143, 8.128686843248348, 8.264540605794723, 8.33765431183443, 8.364882610193899, 8.471162553772746, 8.49647001399048, 8.50659363865125, 8.530863167663098, 8.56967837211058, 8.66443916586115, 8.815618155590611, 8.873138185975373]
[('sqrt(p)', 'q3', 'p1q', 'q3p'), ('r2', 'r3', 'p2r', 'p4q'), ('r2', 'r3', 'q4', 'p2r'), ('p2', 'p3', 'p2r', 'p3q'), ('sqrt(r)', 'p2', 'p3', 'r2p'), ('sqrt(p)', 'p1', 'q1r', 'q2r'), ('r2', 'r3', 'p1q', 'p2r'), ('q1', 'p2', 'p3', 'r2p'), ('sqrt(p)', 'q1', 'q2', 'r4'), ('r2', 'r3', 'r2q', 'q3r'), ('sqrt(r)', 'r1', 'q3', 'r4'), ('r2p', 'r3p', 'p4q', 'q4r'), ('p4', 'q4', 'r2p', 'r3p'), ('p2q', 'r2p', 'r3p', 'q4r'), ('sqrt(q)', 'p2', 'q1r', 'q2r'), ('r2q', 'r3p', 'q4r', 'r4q'), ('p2q', 'r2p', 'q2r', 'r3p'), ('p4', 'p1q', 'q2p', 'r4q'), ('q1r', 'q2p', 'r2p', 'r3p'), ('p2', 'p3', 'q2p', 'p4q')]





In [21]:
print("######################9#######################")
grid_search(features,4,9 )

######################9#######################


Progression :: 100%|██████████| 5000/5000 [05:31<00:00, 15.06it/s]

[9.003829095595474, 9.007678063880148, 9.024407680393098, 9.039962433019605, 9.040207888457898, 9.055692136709117, 9.05591038334009, 9.056009633408298, 9.06947371257529, 9.09922585488753, 9.099903094359679, 9.17367942217757, 9.216351737309482, 9.243044805728706, 9.414647356459627, 9.601968165133822, 9.714971210594964, 9.821971766523607, 9.824595819625468, 10.09213693000002]
[('p1q', 'q2p', 'p3r', 'p4q'), ('p1q', 'q2p', 'r2q', 'p4q'), ('sqrt(p)', 'p1', 'r3', 'p4r'), ('sqrt(p)', 'p1', 'q2r', 'p4r'), ('sqrt(q)', 'q1', 'p4', 'p1q'), ('p2q', 'p2r', 'p3q', 'q3p'), ('q3', 'p2q', 'q2p', 'r4p'), ('q3', 'r3', 'p2q', 'q2p'), ('r2', 'p3', 'r3', 'r3p'), ('p1', 'q4', 'p2r', 'p3r'), ('r2p', 'r2q', 'q4r', 'r4q'), ('sqrt(q)', 'p2', 'r2p', 'r3p'), ('p2', 'r2p', 'r3p', 'r3q'), ('p2', 'r3', 'r2p', 'r3p'), ('sqrt(p)', 'p1', 'q1', 'p1q'), ('q2', 'r3', 'p1q', 'p2q'), ('r1', 'r2q', 'p3q', 'p4q'), ('sqrt(p)', 'p1', 'r1', 'p4r'), ('r2p', 'p3r', 'r3p', 'p4q'), ('r2', 'q1r', 'p3q', 'p4q')]





In [22]:
print("######################10#######################")
grid_search(features,4,10 )

######################10#######################


Progression :: 100%|██████████| 5000/5000 [06:34<00:00, 12.67it/s]

[10.005968332383697, 10.020791523196658, 10.023161714139219, 10.032180441717477, 10.032302105097525, 10.03307161391912, 10.07887176196019, 10.231299243813217, 10.277805907688053, 10.559857871557048, 10.564186777763151, 10.913509524825292, 10.932194819195765, 11.204511242671323, 11.438876246380005, 11.457207507221822, 11.536361886434005, 11.735088128734642, 11.898676001617496, 11.957539639694714]
[('p2', 'p1q', 'q2p', 'p3r'), ('p2r', 'p3q', 'q3r', 'p4q'), ('p3q', 'p4q', 'q4r', 'r4q'), ('p2', 'p3q', 'r3p', 'p4q'), ('p2', 'r4', 'r2p', 'r3p'), ('r2', 'p1r', 'p3q', 'p4q'), ('q1', 'q2', 'p1r', 'q2p'), ('q2r', 'r2q', 'p3q', 'p4q'), ('sqrt(p)', 'p4', 'p2r', 'p3r'), ('q3', 'p4', 'p2q', 'q2p'), ('q1r', 'r2p', 'p3r', 'r3p'), ('r1', 'r2', 'r4', 'p4r'), ('q1', 'p3q', 'r3q', 'p4q'), ('p4', 'p3q', 'p3r', 'p4q'), ('q1', 'q2', 'r4', 'q3p'), ('sqrt(p)', 'sqrt(r)', 'p1', 'p3q'), ('sqrt(q)', 'p1q', 'q2p', 'p3r'), ('p4', 'r2q', 'r3p', 'r3q'), ('q2', 'q4', 'p2r', 'p3r'), ('p1', 'r2q', 'p3q', 'r3q')]





In [12]:
history=[]
names=[]
grid_search(features,2 )
plt.figure(figsize=(40, 10))
data=pd.DataFrame(zip(history,names),columns=['VIF','NAME'])
data=data.sort_values('VIF')

Progression :: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 630/630 [00:11<00:00, 56.43it/s]

[1.0000000687381108, 1.0000003845348913, 1.0000007016175474, 1.0000009714029523, 1.000001195608339]
[('p4', 'r4q'), ('p4', 'r3q'), ('r4', 'p4q'), ('p4q', 'r4q'), ('p4', 'r4')]





<Figure size 4000x1000 with 0 Axes>

In [29]:
print(9**(1/4))

1.7320508075688772


In [33]:
print(np.max(compute_vif(features)["VIF"]))

1.2727812286448872


In [119]:
compute_vif(features['p']*features['q'])


IndexError: tuple index out of range

In [21]:
p=[]
for i in range (2,3):
    p.append(Process(target=grid_search,args=(features,i)))
    p[i-2].start()
    
for proc in p:
    proc.join()
    
        

Progression: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 435/435 [00:35<00:00, 12.22it/s]


1.2862065211525044
('r3q', 'r4p')


In [None]:
res=only_lin_predictors_pr(features)
vif=compute_vif(res)
print (vif)
print("mean = ",np.mean(vif.VIF))

In [None]:
compute_vif(create_quadratic_polynomial(features))

In [None]:
compute_vif(create_cubic_polynomial(features))

In [None]:
compute_vif(create_quartic_polynomial(features))

In [None]:
compute_vif(create_quintic_polynomial(features))

In [None]:
compute_vif(features)