# Data Analysis

## Prepare data

### Importing Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

In [2]:
# read the csv file into a pandas dataframe
summary_df = pd.read_csv('../data/summary_df.csv', index_col=0)

# check the first 5 rows of the dataframe
summary_df.head()

Unnamed: 0_level_0,random,combinatory,std,complexity-std,loss-std,true-confusion,formula
Equation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
II.11.17,[ 7. 21. 15. 17. 21. 43. 41. 25. 14. 25. 11. 2...,[30. 35. 50. 28. 26. 25. 37. 31. 27. 38. 48. 3...,[22. 13. 17. 12. 13. 16. 12. 11. 14. 13. 17. 2...,[10. 18. 19. 11. 17. 19. 10. 19. 18. 6. 18. 1...,[16. 16. 20. 16. 14. 14. 14. 16. 16. 19. 7. 1...,[10. 10. 10. 8. 10. 11. 8. 9. 12. 7. 12. 1...,n_0*(1+p_d*Ef*cos(theta)/(kb*T))
I.24.6,[ nan 26. 115. nan nan 78. nan nan nan ...,[169. 29. 49. 93. 158. 227. 292. 43. 56. ...,[254. 84. 178. 166. 73. 197. 19. 135. 180. ...,[ 27. 136. 94. 102. 104. 127. 98. 18. 11. ...,[ 95. 92. 95. 95. 93. 99. 95. 163. 55. ...,[ 69. 71. 80. 63. 24. 47. 60. 110. nan ...,1/2*m*(omega**2+omega_0**2)*1/2*x**2
II.11.20,[27. 15. 34. 26. 20. 13. 18. 23. 15. 25. 25. 1...,[16. 16. 15. 15. 9. 18. 21. 13. 21. 13. 15. 1...,[17. 9. 18. 18. 12. 17. 14. 13. 7. 13. 12. 1...,[ 8. 8. 13. 93. 108. 105. 113. 150. 90. ...,[11. 15. 14. 12. 17. 15. 17. 15. 15. 13. 15. 1...,[14. 11. 10. 9. 10. 10. 13. 14. 9. 14. 10. 1...,n_rho*p_d**2*Ef/(3*kb*T)
III.13.18,[15. 30. 8. 7. 29. 10. 10. 11. 11. 13. nan 1...,[ 5. 9. 16. 16. 7. 13. 10. 16. 12. 16. 11. ...,[ 6. 13. 16. 10. 11. 10. 9. 8. 8. 10. 13. 1...,[ 5. 9. 7. 11. 15. 8. 17. 9. 13. 19. 10. 1...,[17. 16. 9. 13. 22. 10. 15. 9. 7. 12. 9. 1...,[ 8. 8. 9. 8. 10. 10. 8. 7. 8. 11. 11. 1...,2*E_n*d**2*k/(h/(2*pi))
I.13.4,[ 39. 17. 59. 205. 32. 51. 112. 265. 280. ...,[ 50. 9. 16. 21. 12. 25. 44. 48. 41. ...,[21. 45. 27. 54. 18. 24. 34. 39. 74. 27. 49. 3...,[ 18. 58. 18. 42. 32. 14. 144. 28. 43. ...,[ 97. 23. 23. 96. 94. 90. 87. 34. 78. ...,[26. 27. 15. 24. 32. 10. 28. 17. 30. 21. 38. 2...,1/2*m*(v**2+u**2+w**2)


Import/Create the trial summary

In [3]:
def trial_summary(save_to_file=True):
    """Creates a dictionary with the number of finished jobs and average last_n for each equation and algorithm"""
    # read the finished jobs file
    with open('../data/finished_jobs.json') as f:
        finished_jobs = json.load(f)
    # create a dictionary with the number of finished jobs and average last_n for each equation and algorithm
    trial_summary = {}
    for job in finished_jobs:
        # job[0] is the job name, job[1] is the parameters
        equation = job[0][0].split(" ")[1]
        algorithm = job[0][1]
        trial = job[0][2]
        last_n = job[1]["last_n"]
        try:
            converged = job[1]["converged"]
        except KeyError:
            if last_n <=499:
                converged = False
            else:
                converged = True
        # if the equation is not in the dictionary, add it
        if equation not in trial_summary:
            trial_summary[equation] = {}
        # if the algorithm is not in the dictionary, add it
        if algorithm not in trial_summary[equation]:
            trial_summary[equation][algorithm] = {}
        # add the last_n to the dictionary for the equation and algorithm if the job converged
        trial_summary[equation][algorithm][trial] = last_n if converged else None
    if save_to_file:
        with open('../data/trial_data.json', "w") as f:
            json.dump(trial_summary, f)
    return trial_summary

In [4]:
trial_summary_dict = trial_summary()

### Data Transformation & Cleaning

Rename the column "true-confusion" to "true-mod" to be consistent with the other dataframes.

In [5]:
# rename the column "true-confusion" to "true-mod"
summary_df.rename(columns={'true-confusion': 'true-mod'}, inplace=True)

Re-transform the dataframes to have the same format as the original dataframes.

In [6]:
def str_to_list(string):
    """Convert a string to a list of floats. Replace 'nan' with np.nan. Return a numpy array."""
    string = str(string).replace('[', '').replace(']', '').replace('\n',' ').split(' ')
    string = [x for x in string if x != '']
    string = [float(i) for i in string]
    # replace nan with np.nan
    string = [np.nan if x == 'nan' else x for x in string]
    # convert to numpy array
    string = np.array(string)
    return string

In [7]:
# convert all columns to list
for col in summary_df.columns[:-1]:
    summary_df[col] = summary_df[col].apply(str_to_list)

Filtering out the data to only include the columns we need for our analysis. The threshold of minimum number of samples is set as `treshold_value`.

In [8]:
threshold_value = 50 # set the minimum number of samples for the approach at a equation

# make a new dataframe with the length of each list
summary_df_len = summary_df.copy()
for col in summary_df_len.columns[:-1]:
    summary_df_len[col] = summary_df_len[col].apply(len)

# Define the columns to check
columns_to_check = ['random', 'combinatory','std','complexity-std','loss-std','true-mod']	

# Filter and print rows where at least one element in the specified columns is less than the threshold
filtered_rows = summary_df_len[(summary_df_len[columns_to_check] < threshold_value).any(axis=1)]
print(filtered_rows)

          random  combinatory  std  complexity-std  loss-std  true-mod  \
Equation                                                                 
I.24.6        42           98  100              44        12        90   
II.35.18      94          100   99              84        48        97   
II.21.32      93           17   36              19        25         1   
I.50.26        1            1    1               1        59        63   

                                             formula  
Equation                                              
I.24.6          1/2*m*(omega**2+omega_0**2)*1/2*x**2  
II.35.18  n_0/(exp(mom*B/(kb*T))+exp(-mom*B/(kb*T)))  
II.21.32                  q/(4*pi*epsilon*r*(1-v/c))  
I.50.26      x1*(cos(omega*t)+alpha*cos(omega*t)**2)  


In [9]:
# create a new dataframe without the filtered rows
summary_df_filtered = summary_df[~summary_df.index.isin(filtered_rows.index)]
# print the shape of the original and filtered dataframe
print(f"Original dataframe shape: {summary_df.shape}")
print(f"Filtered dataframe shape: {summary_df_filtered.shape}")

Original dataframe shape: (42, 7)
Filtered dataframe shape: (38, 7)


## Plot Data

### Correlation Plots

In [10]:
def correlation_plot_of_columns(df, col1, col2):
    """Plot the correlation between two columns of a dataframe."""
    # create a dataframe with the two columns
    df_corr = df[[col1, col2]]
    # drop rows with missing values
    df_corr = df_corr.dropna()
    # compute the correlation matrix
    corr = df_corr.corr()
    # plot the correlation matrix
    sns.heatmap(corr, annot=True, cmap='Blues')
    plt.show()

# plot the correlation between the columns 'random' and 'combinatory'
correlation_plot_of_columns(summary_df_filtered, 'random', 'combinatory')

ValueError: setting an array element with a sequence.