# Imputation method of spectra

## Preview

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import random
# Quick look at the raw table. By position, the first 53 columns are taken as the
# spectra and the column right after them as the albedo.
csv_file_path = '02-Base.csv'
df = pd.read_csv(csv_file_path)
df_spectra, albedo_column = df.iloc[:, :53], df.iloc[:, 53]
df  # bare expression: shown as the cell output when run in a notebook

## Final method - Spectra

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import random

# Global plot styling: DejaVu Serif with a larger base font size.
plt.rcParams.update({'font.family': 'DejaVu Serif', 'font.size': 20})

# ---------------------------
# Load Data
# ---------------------------
csv_file_path = '02-Base.csv'
df = pd.read_csv(csv_file_path)

# The spectral data are assumed to occupy the first 53 columns; each column
# header is parsed as the wavelength of that channel.
df_spectra = df.iloc[:, :53].copy()
total_cols = len(df_spectra.columns)  # typically 53
wavelengths = np.fromiter(map(float, df_spectra.columns), dtype=float)

# ---------------------------
# Class labels come from column "class_asteroid_sf"; fail early without it.
# ---------------------------
if 'class_asteroid_sf' not in df.columns:
    raise ValueError("No 'class_asteroid_sf' column found in the CSV data.")
classes = df['class_asteroid_sf']

# ---------------------------
# Imputation Parameters
# ---------------------------
overlap_points = 21   # number of observed points used as the overlapping region
slope_weight = 1.0    # weight of the slope term in the candidate error metric

# ---------------------------
# Function to compute error metric for a candidate.
# Uses:
#   - MSE computed with average shift over the overlapping region.
#   - Slope difference computed after aligning using the "i point":
#       * For left-incomplete: use the first overlapping point.
#       * For right-incomplete: use the last overlapping point.
# ---------------------------
def compute_similarity_aligned(target, candidate, indices, slope_weight, side="left"):
    """Score how well ``candidate`` matches ``target`` over ``indices`` (lower is better).

    The score is ``mse + slope_weight * slope_diff`` where

    * ``mse`` is the mean squared residual after shifting the candidate by the
      mean target-minus-candidate offset over ``indices``;
    * ``slope_diff`` is the mean absolute difference between the consecutive
      differences (``np.diff``) of the two curves over ``indices``, or ``0.0``
      when fewer than two indices are given.

    Args:
        target: 1-D NumPy array holding the spectrum being imputed.
        candidate: 1-D NumPy array holding the spectrum being compared.
        indices: integer index array selecting the overlapping region.
        slope_weight: multiplier applied to the slope term.
        side: ``"left"`` anchors the candidate at ``indices[0]``; any other
            value anchors it at ``indices[-1]``.

    Returns:
        The combined error as a float.
    """
    target_window = target[indices]

    # Shape term: remove the average vertical offset, then take the MSE.
    mean_offset = np.mean(target_window - candidate[indices])
    residual = target_window - (candidate[indices] + mean_offset)
    mse = np.mean(residual ** 2)

    # Slope term: pin the candidate to the target at the anchor point first.
    # The shift is a single constant, so it cancels inside np.diff apart from
    # floating-point rounding; it is kept so the arithmetic stays unchanged.
    anchor = indices[0] if side == "left" else indices[-1]
    anchored_window = candidate[indices] + (target[anchor] - candidate[anchor])
    if len(indices) > 1:
        step_gap = np.diff(target_window) - np.diff(anchored_window)
        slope_diff = np.mean(np.abs(step_gap))
    else:
        slope_diff = 0.0

    return mse + slope_weight * slope_diff

# ---------------------------
# Process all spectra and perform imputation for incomplete ones.
# We'll store the final imputed spectra in final_imputed_list.
# ---------------------------
# Weight given to the straight-line extrapolation of the target's own edge slope,
# keyed by distance (in samples) from the nearest observed point. The remaining
# (1 - w) goes to the donor-based estimate; distances not listed get weight 0.
EDGE_BLEND_WEIGHTS = {1: 0.5, 2: 0.35, 3: 0.25, 4: 0.15, 5: 0.1, 6: 0.05}
MAX_DONORS = 10            # donors required, and kept, for every class except "R"
UNRESTRICTED_CLASS = "R"   # this class uses every available donor (at least one)


def _edge_regions(sample, side):
    """Return ``(boundary, missing, overlap)`` for one edge of ``sample``.

    ``boundary`` is the index of the first (``side == "left"``) or last
    (otherwise) non-NaN value, ``missing`` the indices between that point and
    the array end, and ``overlap`` up to ``overlap_points`` indices starting at
    ``boundary`` and running away from the gap. ``sample`` must contain at
    least one non-NaN value. Reads the module-level ``overlap_points``.
    """
    observed = np.flatnonzero(~np.isnan(sample))
    if side == "left":
        boundary = observed[0]
        missing = np.arange(0, boundary)
        overlap = np.arange(boundary, min(boundary + overlap_points, sample.size))
    else:
        boundary = observed[-1]
        missing = np.arange(boundary + 1, sample.size)
        overlap = np.arange(max(0, boundary - overlap_points + 1), boundary + 1)
    return boundary, missing, overlap


def _plot_imputation(i, side, aligned_donors, target, final_imputed, missing, pdf):
    """Append one page to ``pdf`` showing donors, target and imputed points."""
    fig, ax = plt.subplots(figsize=(10, 6))
    for donor in aligned_donors:
        ax.plot(wavelengths, donor, color='lightgray', linewidth=1, zorder=1)
    ax.plot(wavelengths, target, 'ko-', label="Target (Observed)", zorder=3)
    ax.plot(wavelengths, final_imputed, 'b--', label="Final Imputed Spectrum", zorder=4)
    ax.scatter(wavelengths[missing], final_imputed[missing], color='red',
               label="Imputed Points", zorder=5)
    ax.axvspan(wavelengths[missing[0]], wavelengths[missing[-1]],
               color='red', alpha=0.2, label="Missing Region", zorder=2)
    ax.set_xlabel("Wavelength (µm)")
    ax.set_ylabel("ln(Reflectance)")
    ax.legend()
    ax.text(0.5, -0.2, f"Target {i}: {side.capitalize()} Imputation",
            transform=ax.transAxes, ha='center', va='center', fontsize=20)
    pdf.savefig(fig, bbox_inches='tight')
    plt.close(fig)  # close this figure explicitly rather than "the current one"


def _impute_edge(i, side, spectra, class_labels, final_imputed, pdf):
    """Fill the missing ``side`` edge of spectrum ``i`` in place and plot it.

    Donors are other rows of ``spectra`` with the same class label and no NaN
    in either the overlap or the missing region. Class "R" uses every donor
    (at least one needed); other classes need at least ``MAX_DONORS`` and use
    the ``MAX_DONORS`` with the lowest ``compute_similarity_aligned`` error.
    Each donor is shifted to meet the target at the boundary point, the donors
    are averaged with weights ``1 / (error + 1e-6)``, and the points nearest
    the boundary are blended with a linear extrapolation of the target's edge.

    Args:
        i: row index of the target spectrum.
        side: ``"left"`` or ``"right"``.
        spectra: 2-D float array, one spectrum per row (not modified).
        class_labels: 1-D array of class labels, positionally aligned with rows.
        final_imputed: 1-D working copy of row ``i``; modified in place.
        pdf: open ``PdfPages`` receiving the diagnostic page.

    Returns:
        True if the edge was imputed, False if there were too few donors.

    Reads the module-level ``wavelengths``, ``slope_weight`` and
    ``compute_similarity_aligned``.
    """
    target = spectra[i]
    boundary, missing, overlap = _edge_regions(target, side)

    # Same class, complete over both regions, and not the target itself.
    needed = np.concatenate([overlap, missing])
    eligible = (class_labels == class_labels[i]) & ~np.isnan(spectra[:, needed]).any(axis=1)
    eligible[i] = False
    donor_rows = np.flatnonzero(eligible)

    use_all = class_labels[i] == UNRESTRICTED_CLASS
    if donor_rows.size < (1 if use_all else MAX_DONORS):
        return False

    scored = [
        (j, compute_similarity_aligned(target, spectra[j], overlap, slope_weight, side=side))
        for j in donor_rows
    ]
    scored.sort(key=lambda pair: pair[1])  # stable sort: ties keep row order
    best = scored if use_all else scored[:MAX_DONORS]

    # Shift every donor so it passes through the target's boundary point.
    aligned = [spectra[j] + (target[boundary] - spectra[j][boundary]) for j, _ in best]
    errors = np.array([err for _, err in best])
    weights = 1.0 / (errors + 1e-6)
    estimate = np.average(np.array([donor[missing] for donor in aligned]),
                          axis=0, weights=weights)

    # Slope of the target between the boundary point and its observed neighbour.
    if overlap.size >= 2:
        inner = boundary + 1 if side == "left" else boundary - 1
        lo, hi = min(boundary, inner), max(boundary, inner)
        edge_slope = (target[hi] - target[lo]) / (wavelengths[hi] - wavelengths[lo])
    else:
        edge_slope = 0

    # Blend the points closest to the boundary with the extrapolated trend. The
    # anchor wavelength is always read from the boundary index, so it is defined
    # even when the overlap holds a single point.
    blended = estimate.copy()
    for pos, m in enumerate(missing):
        w = EDGE_BLEND_WEIGHTS.get(abs(int(m) - int(boundary)), 0.0)
        if w > 0:
            extrapolated = target[boundary] + edge_slope * (wavelengths[m] - wavelengths[boundary])
            blended[pos] = w * extrapolated + (1 - w) * estimate[pos]

    final_imputed[missing] = blended
    _plot_imputation(i, side, aligned, target, final_imputed, missing, pdf)
    return True


# Convert once instead of re-extracting rows with .iloc inside the donor search.
spectra = df_spectra.to_numpy(dtype=float)
class_labels = classes.to_numpy()  # positional access, independent of the index
final_imputed_list = [None] * len(spectra)

# Export a single PDF with one page per imputed side. The context manager closes
# the file even if processing a spectrum raises part-way through.
with PdfPages("all_imputedA.pdf") as pdf:
    for i, sample_orig in enumerate(spectra):
        final_imputed = sample_orig.copy()
        # A spectrum with no observed value has nothing to align donors to, so it
        # is stored unchanged; complete spectra skip both branches below.
        if not np.isnan(sample_orig).all():
            if np.isnan(sample_orig[0]):
                _impute_edge(i, "left", spectra, class_labels, final_imputed, pdf)
            if np.isnan(sample_orig[-1]):
                _impute_edge(i, "right", spectra, class_labels, final_imputed, pdf)
        final_imputed_list[i] = final_imputed

# ---------------------------
# Write the imputed spectra to CSV, reusing the column headers and row index
# of the spectra table they were derived from.
# ---------------------------
df_imputed = pd.DataFrame(
    data=final_imputed_list,
    index=df_spectra.index,
    columns=df_spectra.columns,
)
df_imputed.to_csv("03-Base-imputedA.csv", index=False)

for message in (
    "PDF of all imputed spectra exported to 'all_imputedA.pdf'.",
    "CSV of final imputed spectra exported to '03-Base-imputedA.csv'.",
):
    print(message)

## Final method - Albedo

In [None]:
import pandas as pd

# Load CSV
csv_file_path = '02-Base.csv'
df = pd.read_csv(csv_file_path)

# For every row, the mean of the known pV values among rows sharing its 'name'.
# transform() returns a Series aligned with df, so one grouped pass replaces a
# full-table scan per name. A name with no known pV yields NaN here, which
# leaves its rows unfilled.
mean_pV_by_name = df.groupby('name')['pV'].transform('mean')

# Only missing entries are replaced; observed pV values are left untouched.
df['pV'] = df['pV'].fillna(mean_pV_by_name)

# Save the updated DataFrame with the same original columns
df.to_csv('03-Base-imputed3.csv', index=False)
print("Missing 'pV' values have been filled with mean values, and the updated CSV file '03-Base-imputed3.csv' has been saved.")

In [None]:
# Re-read the name-imputed file and report how many pV entries are still empty
# (total rows minus the rows that hold a value).
updated_df = pd.read_csv('03-Base-imputed3.csv')
remaining_missing_pV = len(updated_df) - updated_df['pV'].count()
print(f"Total remaining missing 'pV' values: {remaining_missing_pV}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Global plot styling: DejaVu Serif with a larger base font size.
plt.rcParams.update({'font.family': 'DejaVu Serif', 'font.size': 20})

# Start from the file produced by the per-name fill.
csv_file_path = '03-Base-imputed3.csv'
df = pd.read_csv(csv_file_path)

# Report how many pV values are still missing before the per-class fill.
remaining_missing_pV_data = df.loc[df['pV'].isna()]
remaining_missing_pV_count = len(remaining_missing_pV_data)
print(f"Total remaining missing 'pV' values: {remaining_missing_pV_count}")

# Per class: fill missing pV with a weighted mean of the class's known values
# (values outside the 1.5*IQR fences get weight 0.1, the rest 1.0) and add one
# box-plot page to the PDF.
pdf_filename = 'pV_class_analysis.pdf'
with PdfPages(pdf_filename) as pdf:
    for classi in df['class_asteroid_sf'].unique():
        in_class = df['class_asteroid_sf'] == classi
        pV_values_class = df.loc[in_class, 'pV'].dropna()

        # Nothing known for this class: no fill and no plot.
        if pV_values_class.empty:
            continue

        Q1, Q2, Q3 = pV_values_class.quantile([0.25, 0.50, 0.75])  # Q2 is the median
        IQR = Q3 - Q1
        lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

        is_outlier = ~pV_values_class.between(lower_bound, upper_bound)
        weights = np.where(is_outlier, 0.1, 1.0)
        weighted_mean_pV_class = np.average(pV_values_class, weights=weights)

        df.loc[in_class & df['pV'].isna(), 'pV'] = weighted_mean_pV_class

        # Horizontal box plot on a short figure, with a narrow box.
        fig, ax = plt.subplots(figsize=(10, 4))
        box = ax.boxplot(pV_values_class, vert=False, patch_artist=True, showfliers=True,
                         flierprops=dict(marker='o', color='red', alpha=0.5), widths=0.3)

        # Make the median line thicker and orange.
        for median in box['medians']:
            median.set(linewidth=3.5, color='orange')

        # Dashed lines mark the two outlier fences.
        for which, bound in (('Lower', lower_bound), ('Upper', upper_bound)):
            ax.axvline(x=bound, color='blue', linestyle='--',
                       label=f'{which} bound (Outlier): {bound:.2f}', linewidth=3)
        ax.set_title(f'Box plot of "pV" for class {classi} with outliers')
        ax.set_xlabel('log10 pV')
        ax.legend(fontsize=12)
        fig.tight_layout(rect=[0, 0.02, 1, 1])  # leave a small margin at the bottom
        pdf.savefig(fig)
        plt.close(fig)

print(f"The PDF file with results and plots has been saved as '{pdf_filename}'.")

# Save the updated DataFrame with filled pV values
df.to_csv('03-Base-imputed223.csv', index=False)
print("Update complete. Missing 'pV' values have been imputed.")

## Final database (concatenated)

In [None]:
import pandas as pd
df1 = pd.read_csv('03-Base-imputedA.csv')    # imputed spectra
df2 = pd.read_csv('03-Base-imputed223.csv')  # table holding the imputed pV and metadata
extra_columns = ['pV', 'name', 'counts', 'class_bdm', 'class_asteroid_sf']
df2_extra = df2.loc[:, extra_columns]

# The two tables are joined side by side, so their row counts must match.
if df1.shape[0] != df2_extra.shape[0]:
    print("Error: The two datasets have different numbers of rows. Check for missing or extra data.")
else:
    merged_df = pd.concat([df1, df2_extra], axis='columns')
    merged_df.to_csv('05-Base.csv', index=False)
    print("Merging complete. The new file '05-Base.csv' has been created.")