In [1]:
pip install pandas scikit-learn openpyxl


Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from sklearn.impute import KNNImputer
import numpy as np

# Load the dataset
file_path = r"C:\Users\29200\Downloads\Dataset\Cobalt_Data.xlsx"
data = pd.read_excel(file_path)

# Inspect the column names
print("Columns in the dataset:", data.columns.tolist())

# Define the features and targets
features = ['Conc.', 'Cell Viability 24', 'Cell Viability 48', 'Cell Viability 72', 'Cell Viability 96', 'ALP 7', 'ALP 14', 'Surface area m2/g', 'Pore volume cm3/g', 'Pore size nm', 'VEGF']
targets = ['SiO2', 'B2O3', 'CaO', 'Na2O', 'P2O5', 'Co', 'CoO']

# Check if all specified columns exist in the dataset
missing_columns = [col for col in features + targets if col not in data.columns]
if missing_columns:
    print(f"Columns not found in the dataset: {missing_columns}")
else:
    # Filter the dataset to include only the selected features and target variables
    data_filtered = data[features + targets]

    # Initialize KNN imputer
    imputer = KNNImputer(n_neighbors=5)

    # Impute missing values
    data_imputed = imputer.fit_transform(data_filtered)

    # Convert the imputed data back to a DataFrame
    data_imputed = pd.DataFrame(data_imputed, columns=features + targets)

    # Ensure no negative values by replacing them with zero
    data_imputed[data_imputed < 0] = 0

    # Save the imputed dataset to a new Excel file
    data_imputed.to_excel('Cobalt_Data_Imputed.xlsx', index=False)

    print("Imputation completed and saved to 'Cobalt_Data_Imputed.xlsx'")


Columns in the dataset: ['Unnamed: 0', 'RESEARCH PAPER/ ARTICLE ', 'Class', 'SiO2  ', 'B2O3', 'CaO', 'Na2O', 'P2O5', 'K2O', 'MgO', 'Ce', 'Ce2O3', 'CeO2', 'Co', 'CoO', 'Unnamed: 15', 'Conc.', 'Cell Viability 24', 'Cell Viability 48', 'Cell Viability 72', 'Cell Viability 96', 'ALP 7', 'ALP 14', 'Surface area m2/g', 'Pore volume cm3/g', 'Pore size nm', 'Pore to pore distance nm', 'Wall thickness nm', 'VEGF']
Columns not found in the dataset: ['SiO2']


In [10]:
# Check for columns with all missing values and drop them
cols_with_all_missing = [col for col in data_filtered.columns if data_filtered[col].isna().all()]
if cols_with_all_missing:
    print(f"Columns with all missing values: {cols_with_all_missing}")
    data_filtered = data_filtered.drop(columns=cols_with_all_missing)

# Print the shape of the filtered data after dropping columns with all missing values
print("Shape of the filtered dataset after dropping columns with all missing values:", data_filtered.shape)

# Continue with the imputation and saving steps as before
# Initialize KNN imputer
imputer = KNNImputer(n_neighbors=5)

# Impute missing values
data_imputed = imputer.fit_transform(data_filtered)

# Convert the imputed data back to a DataFrame
data_imputed = pd.DataFrame(data_imputed, columns=data_filtered.columns)

# Ensure no negative values by replacing them with zero
data_imputed[data_imputed < 0] = 0

# Save the imputed dataset to a new Excel file
data_imputed.to_excel('Cobalt_Data_Imputed.xlsx', index=False)

print("Imputation completed and saved to 'Cobalt_Data_Imputed.xlsx'")


Shape of the filtered dataset after dropping columns with all missing values: (44, 17)
Imputation completed and saved to 'Cobalt_Data_Imputed.xlsx'


In [11]:
import os

# Print the current working directory
print("Current working directory:", os.getcwd())

# Save the imputed dataset to a new Excel file
output_file_path = os.path.join(os.getcwd(), 'Cobalt_Data_Imputed.xlsx')
data_imputed.to_excel(output_file_path, index=False)

print(f"Imputation completed and saved to '{output_file_path}'")


Current working directory: C:\Users\29200
Imputation completed and saved to 'C:\Users\29200\Cobalt_Data_Imputed.xlsx'


In [13]:
!pip install SMOTERegressor

ERROR: Could not find a version that satisfies the requirement SMOTERegressor (from versions: none)
ERROR: No matching distribution found for SMOTERegressor


In [14]:
!pip install imbalanced-learn




In [28]:
import pandas as pd
import numpy as np

from sklearn.utils import resample

# Load the dataset
file_path = r"C:\Users\29200\Downloads\Dataset\Cobalt_Data_Imputed.xlsx"
data = pd.read_excel(file_path)

# Separate features and target variables
features = data[['Conc.', 'Cell Viability 24', 'Cell Viability 48', 'Cell Viability 72', 'ALP 7', 'ALP 14', 
        'VEGF','Surface area m2/g', 'Pore volume cm3/g', 'Pore size nm']]
targets = data[['SiO2', 'B2O3', 'CaO', 'Na2O', 'P2O5', 'Co', 'CoO']]

# Random Oversampling
X_resampled, y_resampled = resample(features, targets, replace=True, n_samples=len(features) * 3, random_state=42)

# Parameters for Gaussian noise
noise_level = 0.1  # Adjust the noise level as needed
n_samples_to_generate = 50  # Number of new samples to generate

# Generate synthetic samples by adding Gaussian noise
X_resampled_noise = np.vstack([features.values] + [features.values + noise_level * np.random.normal(size=features.values.shape) for _ in range(n_samples_to_generate)])
y_resampled_noise = np.vstack([targets.values] + [targets.values + noise_level * np.random.normal(size=targets.values.shape) for _ in range(n_samples_to_generate)])

# Convert resampled data to DataFrame
features_resampled = pd.DataFrame(X_resampled, columns=features.columns)
targets_resampled = pd.DataFrame(y_resampled, columns=targets.columns)

features_noise = pd.DataFrame(X_resampled_noise, columns=features.columns)
targets_noise = pd.DataFrame(y_resampled_noise, columns=targets.columns)

# Combine features and targets
oversampled_resampled = pd.concat([features_resampled, targets_resampled], axis=1)
oversampled_noise = pd.concat([features_noise, targets_noise], axis=1)

# Save to Excel files
oversampled_resampled.to_excel('C:\\Users\\29200\\Downloads\\Dataset\\Co_oversampled_resampled.xlsx', index=False)
oversampled_noise.to_excel('C:\\Users\\29200\\Downloads\\Dataset\\Co_oversampled_noise.xlsx', index=False)

In [None]:
#Oversampled Data + before/after Visualisation
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import numpy as np

# Load the dataset
file_path = r"C:\Users\29200\Downloads\Dataset\dropkarungaColumn.xlsx"
data = pd.read_excel(file_path)

# Drop columns with too many NaNs and non-numerical columns
data_cleaned = data.drop(columns=['RESEARCH PAPER/ ARTICLE', 'VEGF'])

# Encode the 'Class' column
label_encoder = LabelEncoder()
data_cleaned['Class'] = label_encoder.fit_transform(data_cleaned['Class'])

# Separate features and target variables
#features = data_cleaned.drop(columns=['Cell Viability 24', 'Cell Viability 48', 'Cell Viability 72', 'Cell Viability 96', 'Cell Viability 120', 'ALP 7', 'ALP 14', 'ALP 21'])
#targets = data_cleaned[['Cell Viability 24', 'Cell Viability 48', 'Cell Viability 72', 'Cell Viability 96', 'Cell Viability 120', 'ALP 7', 'ALP 14', 'ALP 21']]

# Separate features and target variables
features = data_cleaned[['Conc.', 'Cell Viability 24', 'Cell Viability 48', 'Cell Viability 72', 
                          'ALP 7', 'ALP 14',
                         'Surface area m2/g', 'Pore volume cm3/g', 'Pore size nm']]
targets = data_cleaned[['SiO2', 'B2O3', 'CaO', 'Na2O', 'P2O5', 'Co', 'CoO']]



# Random Oversampling
X_resampled, y_resampled = resample(features, targets, replace=True, n_samples=len(features) * 2, random_state=42)

# Parameters for Gaussian noise
noise_level = 0.1  # Adjust the noise level as needed
n_samples_to_generate = 500  # Number of new samples to generate

# Generate synthetic samples by adding Gaussian noise
X_resampled_noise = np.vstack([features.values] + [features.values + noise_level * np.random.normal(size=features.values.shape) for _ in range(n_samples_to_generate)])
y_resampled_noise = np.vstack([targets.values] + [targets.values + noise_level * np.random.normal(size=targets.values.shape) for _ in range(n_samples_to_generate)])

# Convert resampled data to DataFrame
features_resampled = pd.DataFrame(X_resampled, columns=features.columns)
targets_resampled = pd.DataFrame(y_resampled, columns=targets.columns)

features_noise = pd.DataFrame(X_resampled_noise, columns=features.columns)
targets_noise = pd.DataFrame(y_resampled_noise, columns=targets.columns)

# Function to plot distributions of the target variables
def plot_distributions(targets_before, targets_after, title):
    fig, axes = plt.subplots(nrows=targets_before.shape[1], ncols=2, figsize=(15, 20))
    for i, column in enumerate(targets_before.columns):
        axes[i, 0].hist(targets_before[column], bins=20, alpha=0.7, label='Before')
        axes[i, 0].set_title(f'{column} Before Oversampling')
        
        axes[i, 1].hist(targets_after[column], bins=20, alpha=0.7, label='After', color='orange')
        axes[i, 1].set_title(f'{column} After Oversampling')

    fig.suptitle(title)
    plt.tight_layout()
    plt.show()

# Plot the distributions
plot_distributions(targets, targets_resampled, "Distributions Before vs After Random Oversampling")
plot_distributions(targets, targets_noise, "Distributions Before vs After Gaussian Noise Addition")

# Function to compare summary statistics
def compare_statistics(targets_before, targets_after, title):
    stats_before = targets_before.describe()
    stats_after = targets_after.describe()

    comparison = pd.DataFrame({
        'Before Mean': stats_before.loc['mean'],
        'After Mean': stats_after.loc['mean'],
        'Before Std': stats_before.loc['std'],
        'After Std': stats_after.loc['std'],
        'Before Median': targets_before.median(),
        'After Median': targets_after.median(),
    })

    print(title)
    print(comparison)

# Compare statistics
compare_statistics(targets, targets_resampled, "Summary Statistics Before vs After Random Oversampling")
compare_statistics(targets, targets_noise, "Summary Statistics Before vs After Gaussian Noise Addition")
