In [1]:
import warnings
# Silence every UserWarning for the rest of the session so library notices
# do not clutter the widget output below.
warnings.simplefilter("ignore", category=UserWarning)
# Execute the companion notebook inside this kernel. Later cells read a
# DataFrame called `df_final` that is never created in this notebook —
# presumably it is defined by this run (TODO confirm).
%run Data_and_catalogs_proyect.ipynb

In [2]:
%matplotlib inline


# Here I show the interactive scatter plot with three parameters (X axis, Y axis and marker size)

In [3]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from ipywidgets import interact

def graficar_interactivo(df, eje_x, eje_y, Size, log_x, log_y, agrupar_alfabeticamente):
    """Show a Plotly scatter plot of two columns of ``df``.

    Parameters
    ----------
    df : DataFrame
        Source table; it is copied, never modified. Must contain a ``Name``
        column, which is used as the hover label.
    eje_x, eje_y : str
        Column names plotted on the X and Y axes.
    Size : str
        Column that drives the marker size, or the literal string ``"None"``
        for uniform markers. Rows lacking a value in that column are dropped.
    log_x, log_y : bool
        Whether the corresponding axis is logarithmic.
    agrupar_alfabeticamente : bool
        When true and ``eje_y`` is ``'SpType_Kim'``, spectral types starting
        with ``O`` or ``B`` are collapsed to that single letter and the
        collapsed column is plotted instead.
    """

    def _collapse_ob(sp_type):
        # Only strings can be collapsed; anything else (e.g. NaN) passes through.
        if isinstance(sp_type, str):
            if sp_type.startswith('O'):
                return 'O'
            if sp_type.startswith('B'):
                return 'B'
        return sp_type

    df_temp = df.copy()

    if agrupar_alfabeticamente and eje_y == 'SpType_Kim':
        df_temp['Spectype_grouped'] = df_temp['SpType_Kim'].apply(_collapse_ob)
        eje_y = 'Spectype_grouped'

    # Arguments common to both the sized and the un-sized variant of the plot.
    scatter_kwargs = dict(
        x=eje_x,
        y=eje_y,
        log_x=log_x,
        log_y=log_y,
        hover_name='Name',
    )

    if Size == "None":
        datos = df_temp
        scatter_kwargs.update(
            title=f"{eje_x} vs {eje_y}",
            labels={eje_x: eje_x, eje_y: eje_y},
        )
    else:
        # Rows with no value in the size column are removed before plotting.
        datos = df_temp.dropna(subset=[Size])
        scatter_kwargs.update(
            size=Size,
            title=f"{eje_x} vs {eje_y} (Size: {Size})",
            labels={eje_x: eje_x, eje_y: eje_y, Size: Size},
        )

    fig = px.scatter(datos, **scatter_kwargs)
    fig.show()

# Axis selectors: both offer every column of df_final; X starts on the first
# column and Y on the second.
eje_x_widget = widgets.Dropdown(
    description='X axis:',
    options=list(df_final.columns),
    value=df_final.columns[0],
)

eje_y_widget = widgets.Dropdown(
    description='Y axis:',
    options=list(df_final.columns),
    value=df_final.columns[1],
)

# Marker size can only be driven by a numeric column; the string "None" is the
# sentinel graficar_interactivo checks for "no size encoding".
opciones_Size = ["None"]
opciones_Size += [col for col in df_final.columns if pd.api.types.is_numeric_dtype(df_final[col])]
Size_widget = widgets.Dropdown(
    description='Size:',
    options=opciones_Size,
    value="None",
)

# Independent toggles for logarithmic axes.
log_x_widget = widgets.Checkbox(description='Logarithmic X axis', value=False)

log_y_widget = widgets.Checkbox(description='Logarithmic Y axis', value=False)

# Toggle that collapses O* / B* spectral types to a single letter.
agrupar_alfabeticamente_widget = widgets.Checkbox(
    description='Group O and B spectral types',
    value=False,
)

# Wire the controls to the plotting function; df_final is passed as a fixed
# (non-interactive) argument.
interact(
    graficar_interactivo,
    df=widgets.fixed(df_final),
    eje_x=eje_x_widget,
    eje_y=eje_y_widget,
    Size=Size_widget,
    log_x=log_x_widget,
    log_y=log_y_widget,
    agrupar_alfabeticamente=agrupar_alfabeticamente_widget,
)


interactive(children=(Dropdown(description='X axis:', options=('Name', 'SpType_Kim', 'M*', 'Period', 'Eccentri…

<function __main__.graficar_interactivo(df, eje_x, eje_y, Size, log_x, log_y, agrupar_alfabeticamente)>

# Here I make the correlation matrix, the positive/negative dendrograms and also a Venn diagram for the selected classes of HMXBs and parameters from the Neumann and Fortin Catalog

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets
import venn
import scipy.cluster.hierarchy as sch
from scipy.spatial import distance as ssd
import warnings

# Size shared by every matplotlib figure created in this cell; the functions
# below pass it as figsize=(width, large), so `large` is the figure height.
width = 8
large = 4

# Name of the df_final column used to filter rows by the selected classes.
class_column = 'Class'

def Venn_diagram(selected_columns, filtered_df):
    """Draw a Venn diagram of which rows have a value in each selected column.

    One set is built per column, holding the index labels of the rows of
    ``filtered_df`` whose value in that column is not null. The diagram is
    drawn with the pyvenn function that matches the number of columns
    (``venn2`` ... ``venn6``) instead of always using ``venn6``, which only
    fits exactly six sets.

    Parameters
    ----------
    selected_columns : sequence of str
        Between 2 and 6 column names of ``filtered_df``.
    filtered_df : DataFrame
        Table whose missing-value pattern is visualised.

    After the figure, the label computed for every region is printed together
    with the columns that make up that region.
    """
    selected_columns = list(selected_columns)
    n_sets = len(selected_columns)

    # Pick the drawing routine for this many sets; anything outside 2..6 has
    # no matching routine.
    draw_venn = getattr(venn, f"venn{n_sets}", None) if 2 <= n_sets <= 6 else None
    if draw_venn is None:
        print("El diagrama de Venn solo es compatible con entre 2 y 6 variables.")
        return

    sets = {col: set(filtered_df.index[filtered_df[col].notna()]) for col in selected_columns}
    labels = venn.get_labels([sets[col] for col in selected_columns], fill=['number'])

    fig, ax = draw_venn(labels, names=selected_columns)
    for text in ax.texts:
        text.set_fontsize(8)
    plt.title(f"Venn Diagram for {n_sets} variables")
    plt.show()

    print("\nIntersecciones de conjuntos:")
    # Each key is a string of 0/1 flags, one per column in selected_columns
    # order; the value is the text placed in that region of the diagram.
    for subset_label, region_text in labels.items():
        members = [col for col, flag in zip(selected_columns, subset_label) if flag == '1']
        print(f"  {subset_label} ({', '.join(members)}): {region_text}")

def correlation_matrix_all(selected_classes, selected_columns):
    """Show the Kendall correlation analysis for the chosen classes and columns.

    For the rows of ``df_final`` whose class is in ``selected_classes`` this
    draws a Kendall correlation heatmap over the rows that have every selected
    column filled in, prints those rows, draws a Venn diagram of the
    missing-value pattern (2 to 6 columns only), the positive/negative
    correlation dendrograms and, when both ``Period`` and ``Spin_period`` are
    selected, the log-log scatter of those two columns.

    Parameters
    ----------
    selected_classes : sequence of str
        Values of the class column to keep.
    selected_columns : sequence of str
        Columns to correlate; non-numeric names are ignored.

    NOTE: a later cell of this notebook defines another function with the same
    name, which replaces this one in the namespace once that cell runs.
    """
    numeric_columns = df_final.select_dtypes(include=[np.number]).columns.tolist()
    selected_columns = [col for col in selected_columns if col in numeric_columns]

    # Validate AFTER dropping non-numeric names, so the dendrogram never
    # receives fewer than two variables.
    if len(selected_columns) == 0:
        print("Por favor selecciona al menos una variable numérica.")
        return

    if len(selected_columns) < 2:
        print("Selecciona al menos dos variables para generar el dendrograma.")
        return

    if len(selected_classes) == 0:
        print("Por favor selecciona al menos una clase.")
        return

    class_mask = df_final[class_column].isin(selected_classes)
    filtered_df_final = df_final.loc[class_mask, selected_columns + ['Name']]
    filtered_complete = filtered_df_final.dropna(subset=selected_columns)
    num_complete = len(filtered_complete)

    venn_supported = 2 <= len(selected_columns) <= 6

    # With fewer than two complete rows every correlation is undefined (NaN),
    # which would break the heatmap and the clustering. The Venn diagram is
    # still shown because it explains where the values are missing.
    if num_complete < 2:
        print(
            f"Solo {num_complete} objetos tienen valores completos para los parámetros "
            "seleccionados; se necesitan al menos dos para calcular la correlación."
        )
        if venn_supported:
            Venn_diagram(selected_columns, filtered_df_final)
        return

    kendall_corr = filtered_complete[selected_columns].corr(method='kendall')

    fig, ax = plt.subplots(figsize=(width, large))
    sns.heatmap(kendall_corr, annot=True, cmap="coolwarm", cbar=True, center=0, ax=ax)
    ax.set_title(f"Matriz de Correlación (Kendall) - {', '.join(selected_classes)} \n({num_complete} objetos con valores completos)")
    plt.tight_layout()
    plt.show()

    print(f"Objetos con valores completos para los parámetros seleccionados ({num_complete} objetos):")
    print(filtered_complete[['Name'] + selected_columns])

    if venn_supported:
        Venn_diagram(selected_columns, filtered_df_final)
    else:
        print("El diagrama de Venn solo es compatible con entre 2 y 6 variables.")

    plot_dendrogram(kendall_corr)

    # The log-log plot reads these two columns; skip it when either one was
    # not selected, since filtered_complete only holds the selected columns.
    if 'Period' in selected_columns and 'Spin_period' in selected_columns:
        plot_log_scale(filtered_complete)

def plot_dendrogram(correlation_matrix):
    """Draw two Ward dendrograms from a square correlation matrix.

    The first clusters variables by positive correlation (distance
    ``1 - r`` where ``r > 0``, otherwise 1); the second by negative correlation
    (distance ``1 - |r|`` where ``r < 0``, otherwise 1). Undefined (NaN)
    correlations are treated as "no correlation" (distance 1).

    Parameters
    ----------
    correlation_matrix : DataFrame
        Symmetric correlation matrix; its column labels name the leaves.
    """
    labels = list(correlation_matrix.columns)
    if len(labels) < 2:
        # Hierarchical clustering needs at least two observations.
        print("Selecciona al menos dos variables para generar el dendrograma.")
        return

    # Work on an independent float array rather than mutating the DataFrame
    # through `.values`, which is not guaranteed to be a writable view.
    corr = correlation_matrix.to_numpy(dtype=float, copy=True)

    positive_similarity = np.where(corr > 0, corr, 0.0)
    _draw_similarity_dendrogram(positive_similarity, labels, 'Dendrogram for Positive Correlation')

    negative_similarity = np.where(corr < 0, np.abs(corr), 0.0)
    _draw_similarity_dendrogram(negative_similarity, labels, 'Dendrogram for Negative Correlation')


def _draw_similarity_dendrogram(similarity, labels, title):
    """Cluster with Ward linkage on ``1 - similarity`` and plot the dendrogram."""
    distance = 1 - similarity
    # squareform() requires an exactly-zero diagonal; enforce it so a NaN or
    # slightly inexact self-correlation cannot make the conversion fail.
    np.fill_diagonal(distance, 0)
    linkage = sch.linkage(ssd.squareform(distance), method='ward')

    plt.figure(figsize=(width, large))
    sch.dendrogram(linkage, labels=labels, leaf_rotation=45, leaf_font_size=10)
    plt.title(title)
    plt.show()

def plot_log_scale(filtered_df):
    """Scatter log10(Spin_period) against log10(Period), one marker style per class.

    Parameters
    ----------
    filtered_df : DataFrame
        Must contain ``Period`` and ``Spin_period``. If it has no ``Class``
        column, the classes are taken from ``df_final`` by index alignment.

    Rows lacking either period, or with a non-positive period (whose logarithm
    is undefined), are left out. Nothing is drawn when a required column is
    absent or no row remains.
    """
    required = ['Period', 'Spin_period']
    if any(col not in filtered_df.columns for col in required):
        print("El gráfico log-log requiere las columnas 'Period' y 'Spin_period'.")
        return

    # Copy so that adding the Class column never writes into a slice of the
    # caller's DataFrame.
    filtered_df = filtered_df.dropna(subset=required).copy()
    filtered_df = filtered_df[(filtered_df['Period'] > 0) & (filtered_df['Spin_period'] > 0)]

    if 'Class' not in filtered_df.columns:
        filtered_df['Class'] = df_final['Class']

    if filtered_df.empty:
        print("No hay objetos con 'Period' y 'Spin_period' conocidos para el gráfico log-log.")
        return

    clases = filtered_df['Class'].unique()
    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib
    # releases no longer provide.
    colors = plt.get_cmap('tab20', len(clases))
    markers = ['o', '^', 'x', '*', 's', 'D', 'p', 'h', 'v', '<', '>', 'X']
    plt.figure(figsize=(width, large))

    for i, cls in enumerate(clases):
        class_data = filtered_df[filtered_df['Class'] == cls]

        plt.scatter(np.log10(class_data['Period']),
                    np.log10(class_data['Spin_period']),
                    alpha=0.7,
                    color=colors(i),
                    marker=markers[i % len(markers)],  # cycle when classes outnumber markers
                    label=cls)

    plt.title('Log-Log Scale: Spin Period vs Period')
    # X holds the Period and Y the Spin_period; the labels were previously swapped.
    plt.xlabel('Log(Period)')
    plt.ylabel('Log(Spin Period)')
    plt.legend(title="Classes")
    plt.grid(True)
    plt.show()

    
try:
    # Both look-ups touch df_final, so an undefined DataFrame surfaces here
    # and is reported by the handler below.
    unique_classes = df_final[class_column].dropna().unique().tolist()
    numeric_columns = df_final.select_dtypes(include=[np.number]).columns.tolist()

    # Every class starts selected.
    classes_selector = widgets.SelectMultiple(
        description='Classes:',
        options=unique_classes,
        value=unique_classes,
        disabled=False,
    )
    # The first eight numeric columns start selected.
    parameters_selector = widgets.SelectMultiple(
        description='Parameters:',
        options=numeric_columns,
        value=numeric_columns[:8],
        disabled=False,
    )

    interact(
        correlation_matrix_all,
        selected_classes=classes_selector,
        selected_columns=parameters_selector,
    )
except NameError:
    print("Por favor define el DataFrame `df_final` antes de ejecutar este código.")


interactive(children=(SelectMultiple(description='Classes:', index=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …

# Now I compute the correlation between two parameters

In [5]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from ipywidgets import interact, widgets

def kendall_correlation(param1, param2):
    """Print Kendall's tau between two columns of ``df_final``.

    Only rows where both columns have a value are used. Besides the
    coefficient, the p-value returned by ``scipy.stats.kendalltau`` and the
    number of paired rows are reported.

    Parameters
    ----------
    param1, param2 : str
        Column names of ``df_final``. They may be the same column.
    """
    if param1 not in df_final.columns or param2 not in df_final.columns:
        print("Error: uno o ambos parámetros seleccionados no existen en df_final.")
        return

    # Select each column on its own: asking for [[param1, param2]] when both
    # names are equal would yield duplicated column labels.
    x = df_final[param1]
    y = df_final[param2]
    paired = x.notna() & y.notna()
    x, y = x[paired], y[paired]

    if x.empty:
        print("No hay registros con valores no nulos en ambas columnas.")
        return

    if len(x) < 2:
        print("Se necesitan al menos dos registros con valores en ambas columnas para calcular la correlación.")
        return

    kendall_coef, p_value = stats.kendalltau(x, y)

    if np.isnan(kendall_coef):
        # kendalltau yields NaN when the coefficient is undefined, e.g. when
        # one of the columns is constant over the paired rows.
        print(f"El coeficiente de Kendall entre {param1} y {param2} no está definido para estos datos.")
        return

    print(f"Coeficiente de correlación de Kendall entre {param1} y {param2}: {kendall_coef:.4f}")
    print(f"Valor p: {p_value:.4g} (n = {len(x)})")

# Only numeric columns can be correlated, so both selectors list just those.
numeric_columns = list(df_final.select_dtypes(include=[np.number]).columns)

interact(
    kendall_correlation,
    param1=widgets.Select(description='Parámetro 1:', options=numeric_columns),
    param2=widgets.Select(description='Parámetro 2:', options=numeric_columns),
)

interactive(children=(Select(description='Parámetro 1:', options=('M*', 'Period', 'Eccentricity', 'Spin_period…

<function __main__.kendall_correlation(param1, param2)>

# Here I list all systems that have missing values in one (or more) selected columns while having known values in the other selected columns.

In [6]:
df_final

Unnamed: 0,Name,SpType_Kim,M*,Period,Eccentricity,Spin_period,Distance,Class,SpType_Fortin,M_X,Teff,N_H,Max_Soft_Flux,Min_Soft_Flux,Max_Hard_Flux,Min_Hard_Flux,Mean_Soft_Flux,Mean_Hard_Flux,Hardness
0,IGR J00370+6122,BN0.7 Ib,22.0,15.66490,0.4800,674.0000,3401.0,sg,BN0.5II-III / BN0.7Ib,,15411.1,11.820,95.5227,0.1253,18.420,8.008,47.82400,13.2140,0.276305
1,gam Cas,B0.5IVpe,13.0,203.37100,0.2600,,,Be,B0.5IVpe,,,,274.0580,157.7170,83.440,83.440,215.88750,83.4400,0.386498
2,2S 0114+650,B1Iae,16.0,11.59830,0.1800,10008.0000,4475.0,sg,B1Iae,,14131.5,,130.8930,101.1000,155.400,155.400,115.99650,155.4000,1.339696
3,IGR J01363+6610,B1Ve,12.5,159.00000,,,5816.0,Be,B1Ve,,,,0.1174,0.1174,,,0.11740,,
4,RX J0146.9+6121,B1IIIe,9.6,330.00000,,1407.4000,2751.0,Be,B1III-Ve,,,3.908,105.5000,25.4422,,,65.47110,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,GRO J2058+42,O9.5-B0IV-Ve,18.0,55.00000,,195.2500,8861.0,Be,O9.5-B0IV-Ve,,31462.3,,167.9000,167.9000,,,167.90000,,
95,SAX J2103.5+4545,B0Ve,17.5,12.66536,0.4055,358.6100,6218.0,Be,B0Ve,,29372.9,28.000,111.6000,0.2226,97.955,68.150,55.91130,83.0525,1.485433
96,Cep X-4,B1-B2Ve,10.8,20.85000,,65.3508,7446.0,Be,B1-B2Ve,,,7.115,470.1000,0.6595,,,235.37975,,
97,1H 2202+501,Be,,,,,1116.0,Be,B3e,,,,,,,,,,


In [7]:
import pandas as pd
import numpy as np
from ipywidgets import interact, widgets

def correlation_matrix_all(selected_classes, selected_columns_nan, selected_columns_non_nan):
    """Display the systems that lack some values while having others.

    Among the rows of ``df_final`` whose class is in ``selected_classes``,
    shows those that are null in EVERY column of ``selected_columns_nan`` and
    non-null in EVERY column of ``selected_columns_non_nan``. An empty list
    imposes no condition.

    Parameters
    ----------
    selected_classes : sequence of str
        Values of the class column to keep.
    selected_columns_nan : sequence of str
        Columns that must be missing; non-numeric names are ignored.
    selected_columns_non_nan : sequence of str
        Columns that must be present; non-numeric names are ignored.

    NOTE: this reuses the name of the correlation function defined in an
    earlier cell and replaces it in the namespace once this cell runs.
    """
    no_columns_message = "Por favor selecciona al menos una columna con valores nulos o valores conocidos."

    if len(selected_columns_nan) == 0 and len(selected_columns_non_nan) == 0:
        print(no_columns_message)
        return

    if len(selected_classes) == 0:
        print("Por favor selecciona al menos una clase.")
        return

    numeric_columns = df_final.select_dtypes(include=[np.number]).columns.tolist()
    selected_columns_nan = [col for col in selected_columns_nan if col in numeric_columns]
    selected_columns_non_nan = [col for col in selected_columns_non_nan if col in numeric_columns]

    if not selected_columns_nan and not selected_columns_non_nan:
        print(no_columns_message)
        return

    filtered_df_final = df_final[df_final[class_column].isin(selected_classes)]

    # Combine the two conditions as boolean masks on the same frame. A
    # self-merge on 'Name' would duplicate every column with suffixes and
    # multiply rows whenever a name appears more than once.
    is_missing = filtered_df_final[selected_columns_nan].isnull().all(axis=1)
    is_complete = filtered_df_final[selected_columns_non_nan].notnull().all(axis=1)
    intersection_data = filtered_df_final[is_missing & is_complete]

    if intersection_data.empty:
        print("No hay sistemas que cumplan ambas condiciones (valores nulos en algunas columnas y completos en otras).")
    else:
        print("Sistemas que tienen valores nulos en las columnas seleccionadas y valores completos en las otras columnas:")
        display(intersection_data)
# Reuse the lists built by the earlier cells, discarding any null entries
# before they are offered as widget options.
unique_classes = [cls for cls in unique_classes if not pd.isna(cls)]
numeric_columns = [name for name in numeric_columns if not pd.isnull(name)]

interact(
    correlation_matrix_all,
    # Every class starts selected.
    selected_classes=widgets.SelectMultiple(
        description='Classes:',
        options=unique_classes,
        value=unique_classes,
        disabled=False,
    ),
    # Columns that must be missing; nothing selected initially.
    selected_columns_nan=widgets.SelectMultiple(
        description='NaNs:',
        options=numeric_columns,
        value=[],
        disabled=False,
    ),
    # Columns that must be present; no initial value is given.
    selected_columns_non_nan=widgets.SelectMultiple(
        description='no NaNs:',
        options=numeric_columns,
        disabled=False,
    ),
)


interactive(children=(SelectMultiple(description='Classes:', index=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …

<function __main__.correlation_matrix_all(selected_classes, selected_columns_nan, selected_columns_non_nan)>

# And now I extrapolate the V_infinity and log_Mdot values of the six prototypical HMXBs to the other systems.

The [Levenshtein distance](https://www.sciencedirect.com/topics/computer-science/levenshtein-distance#:~:text=The%20Levenshtein%20distance%20is%20a%20string%20metric%20for%20measuring%20the,one%20word%20into%20the%20other.) is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (i.e. insertions, deletions or substitutions) required to change one word into the other. For example, the Levenshtein distance between the words "kitten" and "sitting" is 3 because three operations are required:

- Replace 'k' with 's'
- Replace 'e' with 'i'
- Add 'g' at the end

In [8]:
import pandas as pd
import Levenshtein
import re

# Roman-numeral tokens and the Arabic digits substituted for them.
roman_to_arabic = {
    "III": "3", "II": "2", "I": "1",
    "IV": "4", "V": "5",
    "VI": "6", "VII": "7", "VIII": "8",
    "IX": "9", "X": "10",
}


def replace_roman_with_arabic(spectral_type):
    """Replace Roman numerals inside a spectral-type string with Arabic digits.

    Tokens are substituted longest first, so that e.g. ``"III"`` is consumed
    as a whole before the shorter ``"II"`` or ``"I"`` could match part of it.
    The replacement is a plain substring substitution applied to the whole
    string. Values that are not strings are returned unchanged.
    """
    if not isinstance(spectral_type, str):
        return spectral_type

    converted = spectral_type
    for numeral in sorted(roman_to_arabic, key=len, reverse=True):
        converted = converted.replace(numeral, roman_to_arabic[numeral])
    return converted

# Earlier, longer spellings of the six prototype spectral types, kept for reference:
#   "O6Iafpe", "BN0.7Ib", "O8.5Ib-II(f)p", "B0.5Ia", "O9.5Vep", "B0IIIne"
# The list in use is a shortened spelling of those. Roman numerals are kept as
# they are: the conversion through replace_roman_with_arabic is not applied.
prototypicalHMXB = [
    "O6Ia",
    "B0.7Ib",
    "O8.5Ib-II",
    "B0.5Ia",
    "O9.5V",
    "B0III",
]
print(prototypicalHMXB)

['O6Iafpe', 'BN0.7Ib', 'O8.5Ib-II(f)p', 'B0.5Ia', 'O9.5Vep', 'B0IIIne']


In [9]:
import pandas as pd
import Levenshtein


# Terminal wind velocity (v∞ / 10^3 [km s^-1]) and its uncertainties.
# Entry i of every list in this cell belongs to prototypicalHMXB[i]:
# get_velocity_and_mass_loss looks the values up by that position.
# The *_plus / *_minus lists are presumably the upper / lower error bars (TODO confirm).
v_infinity = [1.9, 1.1, 1.9, 0.8, 0.4, 1.5]  
v_plus = [0.1, 0.1, 0.1, 0.2, 0.1, 0.3]      
v_minus = [0.1, 0.2, 0.1, 0.1, 0.1, 0.3]     

# Mass-loss rate log Ṁ [M☉ yr^-1] and its uncertainties
log_Mdot = [-5.6, -7.5, -6.1, -6.5, -7.5, -8.5]  
log_Mdot_plus = [0.2, 0.1, 0.2, 0.2, 0.3, 0.5]   
log_Mdot_minus = [0.3, 0.2, 0.2, 0.2, 0.3, 0.5]   

def find_most_similar_type(companion_type, prototypicalHMXB):
    """Return the prototype spectral type closest to ``companion_type``.

    Closeness is the Levenshtein (edit) distance between the two strings,
    computed after removing spaces from both. On a tie the prototype that
    appears first in the list wins.

    Parameters
    ----------
    companion_type : str or other
        Spectral type to classify. Anything that is not a string (e.g. NaN)
        yields ``None``.
    prototypicalHMXB : sequence of str
        Candidate prototype spectral types.

    Returns
    -------
    str or None
        The matching element of ``prototypicalHMXB`` exactly as it was passed
        in, so that it can be looked up again in that same list; ``None`` when
        ``companion_type`` is not a string or there are no candidates.
    """
    if not isinstance(companion_type, str) or len(prototypicalHMXB) == 0:
        return None

    query = companion_type.replace(" ", "")

    # Spaces are ignored only while measuring the distance; the prototype
    # itself is returned unmodified.
    return min(
        prototypicalHMXB,
        key=lambda prototype: Levenshtein.distance(query, prototype.replace(" ", "")),
    )

# Work on a copy so df_final itself is left untouched.
df_new = df_final.copy()
# The spectral types are matched as written; the Roman-to-Arabic conversion
# (replace_roman_with_arabic) is not applied to them.
df_new['Most_Similar_Type'] = df_new['SpType_Kim'].apply(
    find_most_similar_type, args=(prototypicalHMXB,)
)

def get_velocity_and_mass_loss(type_similar):
    """Look up the wind parameters tabulated for a prototype spectral type.

    Parameters
    ----------
    type_similar : str or None
        A member of ``prototypicalHMXB``.

    Returns
    -------
    pandas.Series
        Six values taken, in this order, from ``v_infinity``, ``v_plus``,
        ``v_minus``, ``log_Mdot``, ``log_Mdot_plus`` and ``log_Mdot_minus`` at
        the position of ``type_similar`` in ``prototypicalHMXB``; six ``None``
        values when ``type_similar`` is not in that list.
    """
    if type_similar not in prototypicalHMXB:
        return pd.Series([None] * 6)

    position = prototypicalHMXB.index(type_similar)
    tables = (v_infinity, v_plus, v_minus, log_Mdot, log_Mdot_plus, log_Mdot_minus)
    return pd.Series([table[position] for table in tables])

# Expand the six looked-up values into six new columns, matched by position.
df_new[[
    'v_infinity', 'v_plus', 'v_minus',
    'log_Mdot', 'log_Mdot_plus', 'log_Mdot_minus',
]] = df_new['Most_Similar_Type'].apply(get_velocity_and_mass_loss)

# Show each system next to its original spectral type and the prototype it was matched to.
df_new[['Name', 'SpType_Kim', 'Most_Similar_Type']]

Unnamed: 0,Name,SpType_Kim,Most_Similar_Type
0,IGR J00370+6122,BN0.7 Ib,BN0.7Ib
1,gam Cas,B0.5IVpe,B0.5Ia
2,2S 0114+650,B1Iae,O6Iafpe
3,IGR J01363+6610,B1Ve,B0.5Ia
4,RX J0146.9+6121,B1IIIe,B0IIIne
...,...,...,...
94,GRO J2058+42,O9.5-B0IV-Ve,O9.5Vep
95,SAX J2103.5+4545,B0Ve,B0.5Ia
96,Cep X-4,B1-B2Ve,B0IIIne
97,1H 2202+501,Be,B0.5Ia
