In [1]:
import pandas as pd
import numpy as np

def remove_empty_by_percent(data, remove_by_percent = 90):
    """Drop the columns whose share of missing values exceeds a threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is read only; the caller's frame is never modified.
    remove_by_percent : float, default 90
        Highest percentage (0-100) of missing values a column may have and
        still be kept. Columns strictly above this value are removed.

    Returns
    -------
    filtered_data : pandas.DataFrame
        Independent copy of ``data`` restricted to the kept columns.
    kept_columns : pandas.Index
        Labels of the kept columns, in their original order.
    removed_columns : list
        Labels of the removed columns, in their original order.
    """
    row_count = len(data)
    if row_count == 0:
        # 0 / 0 would yield NaN for every column and ``NaN <= threshold`` is
        # False, which would silently drop every column. A frame without rows
        # has nothing missing, so treat each column as 0% missing.
        none_percentage = pd.Series(0.0, index=data.columns)
    else:
        # Percentage of missing cells per column.
        none_percentage = (data.isna().sum() / row_count) * 100

    # Columns at or below the threshold are kept.
    needed_columns = none_percentage[none_percentage <= remove_by_percent]
    kept_columns = needed_columns.index

    # Everything else is reported back as removed.
    removed_columns = [col for col in data.columns if col not in kept_columns]

    # Select first, then copy: only the kept columns are duplicated instead of
    # the whole input frame, and the result shares nothing with ``data``.
    filtered_data = data[kept_columns].copy()

    return filtered_data, kept_columns, removed_columns


# Load the raw table from the CSV next to the notebook.
df = pd.read_csv("./New_dataframe.csv", low_memory=False)

# Columns to discard before the analysis, one label per line.
not_needed_columns = [
    "Unnamed: 0",
    "Unnamed: 0_x",
    "Unnamed: 0.1",
    "Unnamed: 0_y",
    "Unnamed: 0_1",
    "Secondary Bibliogrpahies",
    "Related Pdb Entries",
    "rcsb_primary_citation_pdbx_database_id_pub_med",
    "citation_pdbx_database_id_pub_med",
    "rcsb_entry_container_identifiers_pubmed_id",
    "rcsb_accession_info_major_revision",
    "rcsb_accession_info_minor_revision",
    "rcsb_primary_citation_journal_id_csd",
    "rcsb_primary_citation_journal_volume",
    "rcsb_primary_citation_year",
    "symmetry_int_tables_number",
    "pdbx_nmr_representative_conformer_id",
    "citation_journal_id_csd",
    "citation_journal_volume",
    "citation_year",
    "diffrn_crystal_id",
    "diffrn_id",
    "diffrn_radiation_diffrn_id",
    "diffrn_radiation_wavelength_id",
    "exptl_crystal_id",
    "bibliography_year",
    "em_experiment_entity_assembly_id",
    "em_experiment_id",
    "diffrn_detector_diffrn_id",
    "diffrn_source_diffrn_id",
    "exptl_crystal_grow_crystal_id",
    "pdbx_reflns_twin_crystal_id",
    "pdbx_reflns_twin_diffrn_id",
    "pdbx_reflns_twin_domain_id",
    "pdbx_sgproject_id",
    "em3d_fitting_id",
    "em3d_reconstruction_id",
    "em3d_reconstruction_image_processing_id",
    "em_ctf_correction_em_image_processing_id",
    "em_ctf_correction_id",
    "em_entity_assembly_id",
    "em_entity_assembly_parent_id",
    "em_image_recording_id",
    "em_image_recording_imaging_id",
    "em_imaging_id",
    "em_imaging_specimen_id",
    "em_particle_selection_id",
    "em_particle_selection_image_processing_id",
    "em_single_particle_entity_id",
    "em_single_particle_entity_image_processing_id",
    "em_software_id",
    "em_software_image_processing_id",
    "em_specimen_experiment_id",
    "em_specimen_id",
    "em_vitrification_id",
    "em_vitrification_specimen_id",
    "pdbx_nmr_exptl_conditions_id",
    "pdbx_nmr_exptl_experiment_id",
    "pdbx_nmr_exptl_solution_id",
    "pdbx_nmr_exptl_spectrometer_id",
    "pdbx_nmr_exptl_sample_conditions_conditions_id",
    "pdbx_nmr_sample_details_solution_id",
    "pdbx_nmr_spectrometer_spectrometer_id",
    "em3d_fitting_list_id",
    "em3d_fitting_list_3d_fitting_id",
    "em_helical_entity_id",
    "em_helical_entity_image_processing_id",
    "pdbx_initial_refinement_model_id",
    "em3d_crystal_entity_id",
    "em3d_crystal_entity_image_processing_id",
    "em_diffraction_id",
    "em_diffraction_imaging_id",
    "em_diffraction_shell_em_diffraction_stats_id",
    "em_diffraction_shell_id",
    "em_diffraction_stats_id",
    "em_diffraction_stats_image_processing_id",
    "em_embedding_id",
    "em_embedding_specimen_id",
    "pdbx_serial_crystallography_sample_delivery_diffrn_id",
    "pdbx_serial_crystallography_sample_delivery_injection_diffrn_id",
    "pdbx_serial_crystallography_sample_delivery_fixed_target_diffrn_id",
    "pdbx_serial_crystallography_data_reduction_diffrn_id",
    "pdbx_serial_crystallography_measurement_diffrn_id",
    "em_staining_id",
    "em_staining_specimen_id",
    "em2d_crystal_entity_id",
    "em2d_crystal_entity_image_processing_id",
    "Unnamed: 0.2",
]

# Discard the listed columns; a label missing from the file raises KeyError.
df = df.drop(columns=not_needed_columns)

# Restrict the frame to its numeric columns.
myList = list(df.select_dtypes(include=["number", "int", "float"]).columns)
df = df[myList]

# Keep only the columns with at most 30% missing values; the kept/removed
# label lists returned alongside the frame are not used here.
new_df, _, _ = remove_empty_by_percent(df, remove_by_percent=30)

In [2]:
# Imputation helpers: each takes a DataFrame and returns a new one with the missing values filled in
from fancyimpute import SoftImpute
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must precede the IterativeImputer import)
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.svm import SVR

def KNN_AL(df):
    """Fill missing values with scikit-learn's ``KNNImputer`` (2 neighbours).

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. It is copied first, so the caller's frame is not
        touched. Presumably all columns are numeric — TODO confirm at the
        call site.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the same column labels and row index as
        ``df``.
    """
    data = df.copy()

    # keep_empty_features=True keeps columns that are entirely missing in the
    # output (filled by the imputer) instead of dropping them; otherwise the
    # returned array would be narrower than ``data.columns`` and rebuilding
    # the frame below would fail.
    imputer = KNNImputer(n_neighbors=2, keep_empty_features=True)

    imputed_data = imputer.fit_transform(data)

    # Re-attach the original labels; passing the index keeps the rows aligned
    # with the input instead of renumbering them from 0.
    return pd.DataFrame(imputed_data, columns=data.columns, index=data.index)


def soft_imputer_regressor(df):
    """Fill missing values with fancyimpute's ``SoftImpute`` (default settings).

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. It is copied first, so the caller's frame is not
        touched. Presumably all columns are numeric — TODO confirm at the
        call site.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the same column labels and row index as
        ``df``.
    """
    data = df.copy()

    # NOTE(review): the data is passed unscaled; check whether the columns
    # should be normalised before matrix completion.
    imputer = SoftImpute()

    imputed_data = imputer.fit_transform(data)

    # Re-attach the original labels; passing the index keeps the rows aligned
    # with the input instead of renumbering them from 0.
    return pd.DataFrame(imputed_data, columns=data.columns, index=data.index)

def iterative_imputer_regressor(df):
    """Fill missing values with scikit-learn's ``IterativeImputer``.

    The imputer runs for at most 10 rounds with ``random_state=0`` so that
    repeated runs give the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. It is copied first, so the caller's frame is not
        touched. Presumably all columns are numeric — TODO confirm at the
        call site.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the same column labels and row index as
        ``df``.
    """
    data = df.copy()

    # keep_empty_features=True keeps columns that are entirely missing in the
    # output instead of dropping them; otherwise the returned array would be
    # narrower than ``data.columns`` and rebuilding the frame below would fail.
    imputer = IterativeImputer(max_iter=10, random_state=0, keep_empty_features=True)

    imputed_data = imputer.fit_transform(data)

    # Re-attach the original labels; passing the index keeps the rows aligned
    # with the input instead of renumbering them from 0.
    return pd.DataFrame(imputed_data, columns=data.columns, index=data.index)

def simple_regressor(df):
    """Fill missing values with the column mean via ``SimpleImputer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. It is copied first, so the caller's frame is not
        touched. The mean strategy requires numeric columns.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the same column labels and row index as
        ``df``.
    """
    data = df.copy()

    # ``np.nan`` is the supported spelling; the ``np.NaN`` alias was removed in
    # NumPy 2.0. keep_empty_features=True keeps entirely-missing columns so the
    # output width matches ``data.columns``.
    imputer = SimpleImputer(strategy='mean', missing_values=np.nan, keep_empty_features=True)

    imputed_data = imputer.fit_transform(data)

    # Re-attach the original labels; passing the index keeps the rows aligned
    # with the input instead of renumbering them from 0.
    return pd.DataFrame(imputed_data, columns=data.columns, index=data.index)

In [3]:
# Apply each of the four imputation helpers to `df`, binding one result per
# helper. The calls run in order, so a failure leaves the later names unbound.
# NOTE(review): the input is `df`, not the `new_df` returned by
# remove_empty_by_percent(df, 30) — confirm the unfiltered frame is intended.
# NOTE(review): the output recorded for this cell ends with an array memory
# error after the SoftImpute log; which call raised it is not visible here.
imputed_data_KNN = KNN_AL(df)
imputed_data_SIR = soft_imputer_regressor(df)
imputed_data_IIR = iterative_imputer_regressor(df)
imputed_data_SR = simple_regressor(df)

[SoftImpute] Max Singular Value of X_init = 157466026.912027
[SoftImpute] Iter 1: observed MAE=41.549037 rank=11
[SoftImpute] Iter 2: observed MAE=40.810950 rank=11
[SoftImpute] Iter 3: observed MAE=40.454756 rank=11
[SoftImpute] Iter 4: observed MAE=40.272637 rank=11
[SoftImpute] Iter 5: observed MAE=40.171378 rank=11
[SoftImpute] Iter 6: observed MAE=40.106544 rank=11
[SoftImpute] Iter 7: observed MAE=40.062444 rank=11
[SoftImpute] Iter 8: observed MAE=40.030198 rank=11
[SoftImpute] Iter 9: observed MAE=40.005059 rank=11
[SoftImpute] Iter 10: observed MAE=39.985078 rank=11
[SoftImpute] Iter 11: observed MAE=39.968883 rank=11
[SoftImpute] Iter 12: observed MAE=39.955204 rank=11
[SoftImpute] Iter 13: observed MAE=39.943483 rank=11
[SoftImpute] Iter 14: observed MAE=39.933626 rank=11
[SoftImpute] Iter 15: observed MAE=39.925514 rank=11
[SoftImpute] Iter 16: observed MAE=39.918669 rank=11
[SoftImpute] Iter 17: observed MAE=39.912739 rank=11
[SoftImpute] Iter 18: observed MAE=39.907552 ra

TypeError: _ArrayMemoryError.__init__() missing 1 required positional argument: 'dtype'

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt

import pandas as pd
import numpy as np

def remove_empty_by_percent(data, remove_by_percent = 90):
    """Drop the columns whose share of missing values exceeds a threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is read only; the caller's frame is never modified.
    remove_by_percent : float, default 90
        Highest percentage (0-100) of missing values a column may have and
        still be kept. Columns strictly above this value are removed.

    Returns
    -------
    filtered_data : pandas.DataFrame
        Independent copy of ``data`` restricted to the kept columns.
    kept_columns : pandas.Index
        Labels of the kept columns, in their original order.
    removed_columns : list
        Labels of the removed columns, in their original order.
    """
    row_count = len(data)
    if row_count == 0:
        # 0 / 0 would yield NaN for every column and ``NaN <= threshold`` is
        # False, which would silently drop every column. A frame without rows
        # has nothing missing, so treat each column as 0% missing.
        none_percentage = pd.Series(0.0, index=data.columns)
    else:
        # Percentage of missing cells per column.
        none_percentage = (data.isna().sum() / row_count) * 100

    # Columns at or below the threshold are kept.
    needed_columns = none_percentage[none_percentage <= remove_by_percent]
    kept_columns = needed_columns.index

    # Everything else is reported back as removed.
    removed_columns = [col for col in data.columns if col not in kept_columns]

    # Select first, then copy: only the kept columns are duplicated instead of
    # the whole input frame, and the result shares nothing with ``data``.
    filtered_data = data[kept_columns].copy()

    return filtered_data, kept_columns, removed_columns


# Load the raw table from the CSV next to the notebook.
df = pd.read_csv("./New_dataframe.csv", low_memory=False)

# Columns to discard before the analysis, one label per line.
not_needed_columns = [
    "Unnamed: 0",
    "Unnamed: 0_x",
    "Unnamed: 0.1",
    "Unnamed: 0_y",
    "Unnamed: 0_1",
    "Secondary Bibliogrpahies",
    "Related Pdb Entries",
    "rcsb_primary_citation_pdbx_database_id_pub_med",
    "citation_pdbx_database_id_pub_med",
    "rcsb_entry_container_identifiers_pubmed_id",
    "rcsb_accession_info_major_revision",
    "rcsb_accession_info_minor_revision",
    "rcsb_primary_citation_journal_id_csd",
    "rcsb_primary_citation_journal_volume",
    "rcsb_primary_citation_year",
    "symmetry_int_tables_number",
    "pdbx_nmr_representative_conformer_id",
    "citation_journal_id_csd",
    "citation_journal_volume",
    "citation_year",
    "diffrn_crystal_id",
    "diffrn_id",
    "diffrn_radiation_diffrn_id",
    "diffrn_radiation_wavelength_id",
    "exptl_crystal_id",
    "bibliography_year",
    "em_experiment_entity_assembly_id",
    "em_experiment_id",
    "diffrn_detector_diffrn_id",
    "diffrn_source_diffrn_id",
    "exptl_crystal_grow_crystal_id",
    "pdbx_reflns_twin_crystal_id",
    "pdbx_reflns_twin_diffrn_id",
    "pdbx_reflns_twin_domain_id",
    "pdbx_sgproject_id",
    "em3d_fitting_id",
    "em3d_reconstruction_id",
    "em3d_reconstruction_image_processing_id",
    "em_ctf_correction_em_image_processing_id",
    "em_ctf_correction_id",
    "em_entity_assembly_id",
    "em_entity_assembly_parent_id",
    "em_image_recording_id",
    "em_image_recording_imaging_id",
    "em_imaging_id",
    "em_imaging_specimen_id",
    "em_particle_selection_id",
    "em_particle_selection_image_processing_id",
    "em_single_particle_entity_id",
    "em_single_particle_entity_image_processing_id",
    "em_software_id",
    "em_software_image_processing_id",
    "em_specimen_experiment_id",
    "em_specimen_id",
    "em_vitrification_id",
    "em_vitrification_specimen_id",
    "pdbx_nmr_exptl_conditions_id",
    "pdbx_nmr_exptl_experiment_id",
    "pdbx_nmr_exptl_solution_id",
    "pdbx_nmr_exptl_spectrometer_id",
    "pdbx_nmr_exptl_sample_conditions_conditions_id",
    "pdbx_nmr_sample_details_solution_id",
    "pdbx_nmr_spectrometer_spectrometer_id",
    "em3d_fitting_list_id",
    "em3d_fitting_list_3d_fitting_id",
    "em_helical_entity_id",
    "em_helical_entity_image_processing_id",
    "pdbx_initial_refinement_model_id",
    "em3d_crystal_entity_id",
    "em3d_crystal_entity_image_processing_id",
    "em_diffraction_id",
    "em_diffraction_imaging_id",
    "em_diffraction_shell_em_diffraction_stats_id",
    "em_diffraction_shell_id",
    "em_diffraction_stats_id",
    "em_diffraction_stats_image_processing_id",
    "em_embedding_id",
    "em_embedding_specimen_id",
    "pdbx_serial_crystallography_sample_delivery_diffrn_id",
    "pdbx_serial_crystallography_sample_delivery_injection_diffrn_id",
    "pdbx_serial_crystallography_sample_delivery_fixed_target_diffrn_id",
    "pdbx_serial_crystallography_data_reduction_diffrn_id",
    "pdbx_serial_crystallography_measurement_diffrn_id",
    "em_staining_id",
    "em_staining_specimen_id",
    "em2d_crystal_entity_id",
    "em2d_crystal_entity_image_processing_id",
    "Unnamed: 0.2",
]

# Discard the listed columns; a label missing from the file raises KeyError.
df = df.drop(columns=not_needed_columns)

# Restrict the frame to its numeric columns.
myList = list(df.select_dtypes(include=["number", "int", "float"]).columns)
df = df[myList]

# Keep only the columns with at most 30% missing values; the kept/removed
# label lists returned alongside the frame are not used here.
new_df, _, _ = remove_empty_by_percent(df, remove_by_percent=30)

# Per-column share of missing cells in `df`, as a fraction between 0 and 1,
# laid out as a two-column table: column label -> 'Feature', share -> 'Percentage'.
missing_data = (
    (df.isnull().sum() / len(df))
    .rename_axis('Feature')
    .reset_index(name='Percentage')
)

# Horizontal bars, one per feature; the '%' axis format renders the 0-1
# fractions as percentages.
bars = (
    alt.Chart(missing_data)
    .mark_bar()
    .encode(
        x=alt.X(
            'Percentage:Q',
            axis=alt.Axis(format='%'),
            title='Percentage of Missing Values',
        ),
        y=alt.Y('Feature:N', title='Feature'),
        color=alt.value('lightgray'),
    )
    .properties(title='Missing Data Summary')
)

# Value labels drawn just to the right of each bar end.
text = (
    bars
    .mark_text(align='left', baseline='middle', dx=3)
    .encode(text=alt.Text('Percentage:Q', format='.0%'))
)

# Layer the labels on top of the bars and set the title size.
plot = (bars + text).configure_title(fontSize=14)

# Write the chart to disk as a PNG rendered at twice the default scale.
plot.save('missing_data_summary.png', format='png', scale_factor=2.0)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [7]:
# Display the column-filtered frame; the recorded output shows it abbreviated
# with "..." rows and columns.
new_df

Unnamed: 0,rcsb_entry_info_assembly_count,rcsb_entry_info_branched_entity_count,rcsb_entry_info_cis_peptide_count,rcsb_entry_info_deposited_atom_count,rcsb_entry_info_deposited_hydrogen_atom_count,rcsb_entry_info_deposited_model_count,rcsb_entry_info_deposited_modeled_polymer_monomer_count,rcsb_entry_info_deposited_nonpolymer_entity_instance_count,rcsb_entry_info_deposited_polymer_entity_instance_count,rcsb_entry_info_deposited_polymer_monomer_count,...,exptl_crystal_grow_method_microbatch-under-oil,exptl_crystal_grow_method_microseeding,exptl_crystal_grow_method_mono-olein cubic phase,exptl_crystal_grow_method_pseudo-batch hanging drop,"exptl_crystal_grow_method_sitting drop, vapor diffusion",exptl_crystal_grow_method_vapour diffusion,exptl_crystal_grow_method_nan,em2d_crystal_entity_space_group_name_hm_P 2 21 21,em2d_crystal_entity_space_group_name_hm_P 4 21 2,em2d_crystal_entity_space_group_name_hm_nan
0,1,0,2,9092,0,1,1108,2,2,1108,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,1,2,9221,0,1,1102,10,2,1152,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1,1,2,9413,0,1,1105,9,2,1160,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,0,2,9521,0,1,1102,17,2,1160,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1,1,4594,0,1,551,4,1,551,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3130,1,0,3,23458,0,1,2277,112,33,2427,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3131,1,0,3,22480,0,1,2291,73,33,2310,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3132,1,0,6,46292,0,1,4534,201,66,4820,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3133,1,0,6,44980,0,1,4614,148,66,4966,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [11]:
# Preview only the first rows of the column-filtered frame.
new_df.head()

Unnamed: 0,rcsb_entry_info_assembly_count,rcsb_entry_info_branched_entity_count,rcsb_entry_info_cis_peptide_count,rcsb_entry_info_deposited_atom_count,rcsb_entry_info_deposited_hydrogen_atom_count,rcsb_entry_info_deposited_model_count,rcsb_entry_info_deposited_modeled_polymer_monomer_count,rcsb_entry_info_deposited_nonpolymer_entity_instance_count,rcsb_entry_info_deposited_polymer_entity_instance_count,rcsb_entry_info_deposited_polymer_monomer_count,...,exptl_crystal_grow_method_microbatch-under-oil,exptl_crystal_grow_method_microseeding,exptl_crystal_grow_method_mono-olein cubic phase,exptl_crystal_grow_method_pseudo-batch hanging drop,"exptl_crystal_grow_method_sitting drop, vapor diffusion",exptl_crystal_grow_method_vapour diffusion,exptl_crystal_grow_method_nan,em2d_crystal_entity_space_group_name_hm_P 2 21 21,em2d_crystal_entity_space_group_name_hm_P 4 21 2,em2d_crystal_entity_space_group_name_hm_nan
0,1,0,2,9092,0,1,1108,2,2,1108,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,1,2,9221,0,1,1102,10,2,1152,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1,1,2,9413,0,1,1105,9,2,1160,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,0,2,9521,0,1,1102,17,2,1160,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1,1,4594,0,1,551,4,1,551,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
