Compile a list of differentially expressed genes (DEGs) from your GEO2R dataset

In [None]:
import os
import pandas as pd

# Folder containing the GEO2R result tables and the destination of the compiled CSV
input_folder = "path/to/filename/datasets for aflatoxinb1"  # Replace with the actual folder path containing GEO2R files
output_file = "path/to/processed_/Compiled_GEO2R.csv"  # Replace with the desired output file path

# Columns a GEO2R table must provide to be included in the compilation
required_columns = ['ID', 'adj.P.Val', 'P.Value','Gene.symbol']

# Collect one frame per file and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously collected row.
collected_frames = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the compiled file reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.tsv'):  # Process only TSV files
        continue

    file_path = os.path.join(input_folder, filename)

    # Only the read itself is guarded; an unreadable or malformed file is
    # reported and skipped so the remaining files are still compiled.
    try:
        data = pd.read_csv(file_path, sep='\t')
    except (OSError, ValueError) as e:  # pandas parser errors derive from ValueError
        print(f"Error processing file {filename}: {e}")
        continue

    # Skip (and report) files that lack any of the required columns instead of
    # relying on a KeyError from the column selection.
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        print(f"Error processing file {filename}: missing columns {missing_columns}")
        continue

    collected_frames.append(data[required_columns])

# Build the compiled table; fall back to an empty, correctly-labelled frame
# when no file could be used.
if collected_frames:
    compiled_data = pd.concat(collected_frames, ignore_index=True)
else:
    compiled_data = pd.DataFrame(columns=required_columns)

# Save the compiled data to a CSV file
compiled_data.to_csv(output_file, index=False)

print(f"Compiled data saved to: {output_file}")


In [None]:
# Load required libraries
import pandas as pd

# Load the CTD dataset (tab-separated export)
file_path = 'path/to/CTD_D016604_diseases_20241212062811.tsv'
ctd_data = pd.read_csv(file_path, sep='\t')

# Step 1: Split the '|'-delimited 'Inference Network' column into a list of
# gene symbols, then explode so every symbol gets its own row.
ctd_data['Inference Network'] = ctd_data['Inference Network'].str.split('|')
exploded_ctd_data = ctd_data.explode('Inference Network')

# Step 2: Standardize gene symbols
# Assuming GEO2R uses uppercase symbols, convert all gene symbols to uppercase
exploded_ctd_data['Inference Network'] = exploded_ctd_data['Inference Network'].str.upper()

# Step 3: Keep only the columns needed for further analysis.
# .copy() makes this an independent frame, so adding a column below does not
# write to a slice of exploded_ctd_data (SettingWithCopyWarning).
filtered_ctd_data = exploded_ctd_data[
    ['Chemical Name', 'Disease Name', 'Inference Network', 'Inference Score']
].copy()

# Step 4: Min-Max scale the 'Inference Score' into [0, 1]
min_score = filtered_ctd_data['Inference Score'].min()
max_score = filtered_ctd_data['Inference Score'].max()
score_range = max_score - min_score
if score_range == 0:
    # All scores are identical, so (x - min) / range would be 0/0. Map present
    # scores to 0.0; multiplying keeps missing scores missing.
    filtered_ctd_data['Normalized Inference Score'] = filtered_ctd_data['Inference Score'] * 0.0
else:
    filtered_ctd_data['Normalized Inference Score'] = (
        filtered_ctd_data['Inference Score'] - min_score
    ) / score_range

# Step 5: Save the preprocessed data for comparison with GEO2R.
# The previous value '/Preprocessed_CTD_Data.csv' pointed at the filesystem
# root; this path matches the one the column-renaming cell reads back.
output_file_path = 'path/to/Preprocessed_CTD_Data.csv'
filtered_ctd_data.to_csv(output_file_path, index=False)

print(f"Preprocessed data saved to: {output_file_path}")



In [None]:
# Recompute the min-max scaled score and assign it through .loc so the column
# is written explicitly on filtered_ctd_data (min_score / max_score come from
# the preprocessing cell).
filtered_ctd_data.loc[:, 'Normalized Inference Score'] = (
    filtered_ctd_data['Inference Score'].sub(min_score).div(max_score - min_score)
)

In [None]:
import pandas as pd

# Location of the preprocessed CTD table; replace with your own path
ctd_file_path = "path/to/Preprocessed_CTD_Data1.csv"

# Read the CSV into a DataFrame
ctd_data = pd.read_csv(filepath_or_buffer=ctd_file_path)

print("CTD data loaded successfully.")


In [None]:
# Location of the compiled GEO2R DEGs table; replace with your own path
geo2r_file_path = "path/to/Compiled_GEO2R.csv"

# Read the CSV into a DataFrame
geo2r_data = pd.read_csv(filepath_or_buffer=geo2r_file_path)

print("GEO2R DEGs data loaded successfully.")


In [None]:
import pandas as pd

# Source CSV and destination for the renamed copy
file_path = "path/to/Preprocessed_CTD_Data.csv"  # Replace with the path to your CSV file
output_file_path ="path/to/Preprocessed_CTD_Data1.csv"  # Replace with the desired output path

# Read the table and rename 'Inference Network' to 'Gene.symbol' in one chain
data = pd.read_csv(file_path).rename(columns={"Inference Network": "Gene.symbol"})

# Write the renamed table without the index column
data.to_csv(output_file_path, index=False)

print(f"Column renamed and data saved to: {output_file_path}")


In [None]:
import pandas as pd

# Step 1: Load GEO2R and CTD datasets
geo2r_path = "path/to/Compiled_GEO2R.csv"  # Replace with your GEO2R file path
ctd_path = "path/to/Preprocessed_CTD_Data1.csv"      # Replace with your CTD file path

geo2r_data = pd.read_csv(geo2r_path)
ctd_data = pd.read_csv(ctd_path)

# Step 2: Inspect the datasets (Optional)
print("GEO2R Dataset Head:")
print(geo2r_data.head())
print("\nCTD Dataset Head:")
print(ctd_data.head())

# Step 3: Rename relevant columns for consistency.
# This is a no-op when the CTD file already uses 'Gene.symbol'.
ctd_data.rename(columns={"Inference Network": "Gene.symbol"}, inplace=True)

# Step 4: Merge datasets (Outer Join on 'Gene.symbol').
# pandas matches missing (NaN) join keys against each other, so rows without
# a gene symbol on both sides would be cross-joined. Such rows cannot take
# part in a gene-level comparison, so they are excluded before merging.
key_column = "Gene.symbol"
geo2r_keyed = geo2r_data.dropna(subset=[key_column])
ctd_keyed = ctd_data.dropna(subset=[key_column])

rows_without_symbol = (len(geo2r_data) - len(geo2r_keyed)) + (len(ctd_data) - len(ctd_keyed))
if rows_without_symbol:
    print(f"\nExcluded {rows_without_symbol} rows without a gene symbol before merging.")

merged_data = pd.merge(geo2r_keyed, ctd_keyed, on=key_column, how="outer")

# Step 5: Save merged dataset
output_file_path = "path_to_output_merged_file.csv"  # Replace with your desired output path
merged_data.to_csv(output_file_path, index=False)

print(f"Merged data saved to: {output_file_path}")

# Step 6: Analyze the merged dataset
# Example Analysis: Check for overlapping genes (missing symbols ignored)
geo2r_genes = set(geo2r_data[key_column].dropna())
ctd_genes = set(ctd_data[key_column].dropna())

overlapping_genes = geo2r_genes & ctd_genes
unique_to_geo2r = geo2r_genes - ctd_genes
unique_to_ctd = ctd_genes - geo2r_genes

print(f"\nNumber of overlapping genes: {len(overlapping_genes)}")
print(f"Number of unique genes in GEO2R: {len(unique_to_geo2r)}")
print(f"Number of unique genes in CTD: {len(unique_to_ctd)}")

# Example Visualization (Optional, Requires matplotlib and seaborn)
import matplotlib.pyplot as plt
import seaborn as sns

# Category sizes shown as a bar chart
venn_labels = {
    "unique_to_geo2r": len(unique_to_geo2r),
    "unique_to_ctd": len(unique_to_ctd),
    "overlapping_genes": len(overlapping_genes),
}

# Apply the seaborn theme before the figure is created so it affects it
sns.set(style="whitegrid")
plt.figure(figsize=(6, 6))
plt.bar(list(venn_labels.keys()), list(venn_labels.values()), color=['blue', 'orange', 'green'])
plt.xlabel("Gene Overlap Categories")
plt.ylabel("Number of Genes")
plt.title("Overlap Between GEO2R and CTD Datasets")
plt.show()


Code for combining the CTD datasets (CTD data 1 and CTD data 2)

In [None]:
import pandas as pd

# Paths of the two CSV inputs and of the combined output
file_1_path = 'path/to/cleaned_data.csv'  # Update to the actual path
file_2_path = 'path/to/CTD_D016604_ixns_20241212063652.csv'  # Update to the actual path
combined_file_path = 'path/to/combined_dataafl.csv'  # Update to the desired save path

data_1, data_2 = pd.read_csv(file_1_path), pd.read_csv(file_2_path)

# Give the gene column of data_2 the name used for merging; 'Chemical Name'
# already has the expected name and is left untouched.
data_2 = data_2.rename(columns={'Gene Symbol': 'Gene.symbol'})

# Keep only rows whose gene symbol AND chemical name appear in both tables
combined_data = data_1.merge(data_2, how='inner', on=['Gene.symbol', 'Chemical Name'])

# Write the result without the index column
combined_data.to_csv(combined_file_path, index=False)

# Report size, destination and a preview of the combined table
print(f"Combined data has {combined_data.shape[0]} rows and {combined_data.shape[1]} columns.")
print(f"Combined dataset saved to: {combined_file_path}")
print(combined_data.head())


In [None]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

from sklearn.model_selection import train_test_split
import joblib

# NOTE(review): `data` is not defined in this cell. It must be the labelled
# table containing every column listed below plus 'Toxicity_Level' — confirm
# which earlier cell is expected to provide it.

# Identify categorical and numerical columns
categorical_columns = ['Chemical Name', 'Gene.symbol', 'Disease Name', 'Interaction Actions']
numerical_columns = ['adj.P.Val', 'P.Value', 'Inference Score', 'Normalized Inference Score', 'Reference Count', 'Organism Count']

# Initialize sparse one-hot encoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)

# Fit and transform the categorical columns
encoded_sparse = encoder.fit_transform(data[categorical_columns])

# Combine sparse encoded features with numerical features
X_sparse = hstack([encoded_sparse, data[numerical_columns]])

# Convert the target variable into numerical format.
# Series.map turns any label missing from the mapping into NaN, which would
# only fail later (and obscurely) in the stratified split — report it here.
label_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
y = data['Toxicity_Level'].map(label_mapping)
unmapped = y.isna()
if unmapped.any():
    unexpected_labels = sorted(data.loc[unmapped, 'Toxicity_Level'].astype(str).unique())
    raise ValueError(
        f"'Toxicity_Level' contains values outside {list(label_mapping)}: {unexpected_labels}"
    )

# Split the data into training (60%), validation (20%), and test (20%) sets,
# stratified on the label so each split keeps the class proportions.
X_train_sparse, X_temp_sparse, y_train, y_temp = train_test_split(
    X_sparse, y, test_size=0.4, random_state=42, stratify=y)

X_val_sparse, X_test_sparse, y_val, y_test = train_test_split(
    X_temp_sparse, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Save the sparse datasets to files
joblib.dump(X_train_sparse,'path/to/X_train_sparse.pkl')
joblib.dump(X_val_sparse,'path/to/X_val_sparse.pkl')
joblib.dump(X_test_sparse,'path/to/X_test_sparse.pkl')
joblib.dump(y_train,'path/to/y_train.pkl')
joblib.dump(y_val,'path/to/y_val.pkl')
joblib.dump(y_test,'path/to/y_test.pkl')

print("Sparse datasets saved successfully:")
print("- Training set: X_train_sparse.pkl, y_train.pkl")
print("- Validation set: X_val_sparse.pkl, y_val.pkl")
print("- Test set: X_test_sparse.pkl, y_test.pkl")


Logistic Regression model 

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Read the pickled train / validation / test splits (features, labels)
X_train_sparse, y_train = (
    joblib.load('path/to/X_train_sparse.pkl'),
    joblib.load('path/to/y_train.pkl'),
)
X_val_sparse, y_val = (
    joblib.load('path/to/X_val_sparse.pkl'),
    joblib.load('path/to/y_val.pkl'),
)
X_test_sparse, y_test = (
    joblib.load('path/to/X_test_sparse.pkl'),
    joblib.load('path/to/y_test.pkl'),
)

# Mean-impute missing values: statistics are learned from the training split
# only and then applied to all three splits.
imputer = SimpleImputer(strategy='mean').fit(X_train_sparse)
X_train_sparse = imputer.transform(X_train_sparse)
X_val_sparse = imputer.transform(X_val_sparse)
X_test_sparse = imputer.transform(X_test_sparse)

# Fit the logistic regression classifier
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_sparse, y_train)

# Class names in the order of their integer codes 0, 1, 2
class_labels = ['Low', 'Medium', 'High']

# Validation split: predictions, per-class report and confusion matrix
y_val_pred = model.predict(X_val_sparse)
print(
    "Validation Set Performance:",
    classification_report(y_val, y_val_pred, target_names=class_labels),
    "Confusion Matrix:",
    confusion_matrix(y_val, y_val_pred),
    sep="\n",
)

# Test split: same metrics
y_test_pred = model.predict(X_test_sparse)
print(
    "\nTest Set Performance:",
    classification_report(y_test, y_test_pred, target_names=class_labels),
    "Confusion Matrix:",
    confusion_matrix(y_test, y_test_pred),
    sep="\n",
)

# Persist the fitted model
joblib.dump(value=model, filename='path/to/logistic_regression_model_imputed.pkl')
print("Model saved successfully: logistic_regression_model_imputed.pkl")


In [None]:

import numpy as np
from sklearn.model_selection import cross_val_score

# Accuracy of `model` under 5-fold cross-validation on the training split
cv_scores = cross_val_score(
    estimator=model,
    X=X_train_sparse,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

print("5-Fold Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())


k-Nearest Neighbors Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Read the pickled train / validation / test splits (features, labels)
X_train_sparse, y_train = (
    joblib.load('path/to/X_train_sparse.pkl'),
    joblib.load('path/to/y_train.pkl'),
)
X_val_sparse, y_val = (
    joblib.load('path/to/X_val_sparse.pkl'),
    joblib.load('path/to/y_val.pkl'),
)
X_test_sparse, y_test = (
    joblib.load('path/to/X_test_sparse.pkl'),
    joblib.load('path/to/y_test.pkl'),
)

# Mean-impute missing values: statistics are learned from the training split
# only and then applied to all three splits.
imputer = SimpleImputer(strategy='mean').fit(X_train_sparse)
X_train_sparse = imputer.transform(X_train_sparse)
X_val_sparse = imputer.transform(X_val_sparse)
X_test_sparse = imputer.transform(X_test_sparse)

# Baseline k-NN classifier with 5 neighbours
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_sparse, y_train)

# Validation predictions, per-class report and confusion matrix
y_val_pred_knn = knn_model.predict(X_val_sparse)
print(
    "k-NN - Validation Performance",
    classification_report(y_val, y_val_pred_knn),
    confusion_matrix(y_val, y_val_pred_knn),
    sep="\n",
)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts to compare
param_grid = {'n_neighbors': [3, 5, 7, 9]}

# Exhaustive search over the grid, scored by accuracy with 3-fold CV
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
knn_grid.fit(X_train_sparse, y_train)

# Report the winning parameter setting
print(f"Best k-value: {knn_grid.best_params_}")


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Build and fit the k-NN classifier with k=3.
# NOTE(review): k is hard-coded here rather than read from a grid-search
# result — confirm it still matches the best value found.
knn_best = KNeighborsClassifier(n_neighbors=3).fit(X_train_sparse, y_train)

# Validation predictions
y_val_pred_knn_best = knn_best.predict(X_val_sparse)

# Per-class report followed by the confusion matrix
print(
    "k-NN (k=3) - Validation Performance",
    classification_report(y_val, y_val_pred_knn_best),
    "Confusion Matrix:",
    confusion_matrix(y_val, y_val_pred_knn_best),
    sep="\n",
)


In [None]:
import joblib

# Reload the pickled test split: feature matrix and target labels
X_test_sparse, y_test = (
    joblib.load('path/to/X_test_sparse.pkl'),
    joblib.load('path/to/y_test.pkl'),
)

# Confirm the load and show the dimensions of both objects
print(
    "Test data loaded successfully!",
    f"X_test_sparse shape: {X_test_sparse.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)


In [None]:
from sklearn.impute import SimpleImputer
from scipy.sparse import csr_matrix

import joblib

# Initialize the imputer (mean strategy) and learn the column means from the
# TRAINING split. Fitting on the test split, as was done before, fills test
# gaps with test-set statistics instead of the values the models were
# trained with. The raw training matrix is reloaded because the in-memory
# X_train_sparse may already have been imputed by an earlier cell.
imputer = SimpleImputer(strategy='mean')
imputer.fit(joblib.load('path/to/X_train_sparse.pkl'))

# Impute missing values in the test matrix and make sure the result is CSR
X_test_sparse_imputed = csr_matrix(imputer.transform(X_test_sparse))

print("Missing values imputed successfully!")


In [None]:
import numpy as np
from scipy.sparse import csr_matrix

# Find rows containing NaN without densifying the matrix: converting a wide
# one-hot matrix with .toarray() can need far more memory than the sparse
# data. Implicit (unstored) entries are zeros, so NaNs can only occur among
# the explicitly stored values.
X_test_csr = csr_matrix(X_test_sparse)
stored_entries = X_test_csr.tocoo()
rows_with_nan = stored_entries.row[np.isnan(stored_entries.data)]

# Boolean mask of rows without NaNs
non_nan_indices = np.ones(X_test_csr.shape[0], dtype=bool)
non_nan_indices[rows_with_nan] = False

# Filter out rows with NaNs (result stays in sparse CSR format)
X_test_sparse_cleaned = X_test_csr[non_nan_indices]

# Ensure corresponding labels are also filtered
y_test_cleaned = y_test[non_nan_indices]

print(f"Rows with NaNs removed. Remaining rows: {X_test_sparse_cleaned.shape[0]}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the NaN-free test rows with the k=3 classifier (knn_best).
# This previously used knn_model, the k=5 baseline, although the report is
# labelled k=3 and knn_best is the model that gets saved.
# NOTE(review): knn_best was fitted on mean-imputed features while this
# evaluation drops rows with NaNs — confirm that difference is intended.
y_test_pred_knn = knn_best.predict(X_test_sparse_cleaned)

# Evaluate performance
print("k-NN (k=3) - Test Performance (After Removing NaNs)")
print(classification_report(y_test_cleaned, y_test_pred_knn))

# Display confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_cleaned, y_test_pred_knn))


In [None]:
import joblib

# Persist the k-NN classifier held in knn_best
joblib.dump(value=knn_best, filename='path/to/knn_best_model.pkl')
print("Optimized k-NN model saved successfully: knn_best_model.pkl")


XGB MODEL

In [None]:
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Read the pickled train / validation / test splits (features, labels)
X_train_sparse, y_train = (
    joblib.load('path/to/X_train_sparse.pkl'),
    joblib.load('path/to/y_train.pkl'),
)
X_val_sparse, y_val = (
    joblib.load('path/to/X_val_sparse.pkl'),
    joblib.load('path/to/y_val.pkl'),
)
X_test_sparse, y_test = (
    joblib.load('path/to/X_test_sparse.pkl'),
    joblib.load('path/to/y_test.pkl'),
)

# Mean-impute missing values: statistics are learned from the training split
# only and then applied to all three splits.
imputer = SimpleImputer(strategy='mean').fit(X_train_sparse)
X_train_sparse = imputer.transform(X_train_sparse)
X_val_sparse = imputer.transform(X_val_sparse)
X_test_sparse = imputer.transform(X_test_sparse)

# Fit the XGBoost classifier
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb_model.fit(X_train_sparse, y_train)

# Validation predictions, per-class report and confusion matrix
y_val_pred_xgb = xgb_model.predict(X_val_sparse)
print(
    "XGBoost - Validation Performance",
    classification_report(y_val, y_val_pred_xgb),
    "Confusion Matrix:",
    confusion_matrix(y_val, y_val_pred_xgb),
    sep="\n",
)


In [None]:
# Score the fitted XGBoost model on the test split
y_test_pred_xgb = xgb_model.predict(X_test_sparse)

print(
    "XGBoost - Test Performance",
    classification_report(y_test, y_test_pred_xgb),
    "Confusion Matrix:",
    confusion_matrix(y_test, y_test_pred_xgb),
    sep="\n",
)


In [None]:
import joblib

# Persist the fitted XGBoost classifier
joblib.dump(value=xgb_model, filename='path/to/xgboost_best_model.pkl')

print("XGBoost model saved successfully: xgboost_best_model.pkl")


LGBM model 

In [None]:
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Read the pickled train / validation / test splits (features, labels)
X_train_sparse, y_train = (
    joblib.load('path/to/X_train_sparse.pkl'),
    joblib.load('path/to/y_train.pkl'),
)
X_val_sparse, y_val = (
    joblib.load('path/to/X_val_sparse.pkl'),
    joblib.load('path/to/y_val.pkl'),
)
X_test_sparse, y_test = (
    joblib.load('path/to/X_test_sparse.pkl'),
    joblib.load('path/to/y_test.pkl'),
)

# Mean-impute missing values: statistics are learned from the training split
# only and then applied to all three splits.
imputer = SimpleImputer(strategy='mean').fit(X_train_sparse)
X_train_sparse = imputer.transform(X_train_sparse)
X_val_sparse = imputer.transform(X_val_sparse)
X_test_sparse = imputer.transform(X_test_sparse)

# Fit the LightGBM classifier
lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_train_sparse, y_train)

# Validation predictions, per-class report and confusion matrix
y_val_pred_lgbm = lgbm_model.predict(X_val_sparse)
print(
    "LightGBM - Validation Performance",
    classification_report(y_val, y_val_pred_lgbm),
    "Confusion Matrix:",
    confusion_matrix(y_val, y_val_pred_lgbm),
    sep="\n",
)


In [None]:
# Score the fitted LightGBM model on the test split
y_test_pred_lgbm = lgbm_model.predict(X_test_sparse)

print(
    "LightGBM - Test Performance",
    classification_report(y_test, y_test_pred_lgbm),
    "Confusion Matrix:",
    confusion_matrix(y_test, y_test_pred_lgbm),
    sep="\n",
)


In [None]:
import joblib

# Persist the fitted LightGBM classifier
joblib.dump(value=lgbm_model, filename='path/to/lightgbm_best_model.pkl')

print("LightGBM model saved successfully: lightgbm_best_model.pkl")


RandomForest Model 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Read the pickled train / validation / test splits (features, labels)
X_train_sparse, y_train = (
    joblib.load('path/to/X_train_sparse.pkl'),
    joblib.load('path/to/y_train.pkl'),
)
X_val_sparse, y_val = (
    joblib.load('path/to/X_val_sparse.pkl'),
    joblib.load('path/to/y_val.pkl'),
)
X_test_sparse, y_test = (
    joblib.load('path/to/X_test_sparse.pkl'),
    joblib.load('path/to/y_test.pkl'),
)

# Mean-impute missing values: statistics are learned from the training split
# only and then applied to all three splits.
imputer = SimpleImputer(strategy='mean').fit(X_train_sparse)
X_train_sparse = imputer.transform(X_train_sparse)
X_val_sparse = imputer.transform(X_val_sparse)
X_test_sparse = imputer.transform(X_test_sparse)

# Fit a random forest of 100 trees
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train_sparse, y_train)

# Validation predictions, per-class report and confusion matrix
y_val_pred_rf = rf_model.predict(X_val_sparse)
print(
    "Random Forest - Validation Performance",
    classification_report(y_val, y_val_pred_rf),
    "Confusion Matrix:",
    confusion_matrix(y_val, y_val_pred_rf),
    sep="\n",
)


In [None]:
# Score the fitted random forest on the test split
y_test_pred_rf = rf_model.predict(X_test_sparse)

print(
    "Random Forest - Test Performance",
    classification_report(y_test, y_test_pred_rf),
    "Confusion Matrix:",
    confusion_matrix(y_test, y_test_pred_rf),
    sep="\n",
)


In [None]:
import joblib

# Persist the fitted random forest classifier
joblib.dump(value=rf_model, filename='path/to/random_forest_best_model.pkl')

print("Random Forest model saved successfully: random_forest_best_model.pkl")


CatBoost Model 

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Read the pickled train / validation / test splits (features, labels)
X_train_sparse, y_train = (
    joblib.load('path/to/X_train_sparse.pkl'),
    joblib.load('path/to/y_train.pkl'),
)
X_val_sparse, y_val = (
    joblib.load('path/to/X_val_sparse.pkl'),
    joblib.load('path/to/y_val.pkl'),
)
X_test_sparse, y_test = (
    joblib.load('path/to/X_test_sparse.pkl'),
    joblib.load('path/to/y_test.pkl'),
)

# Mean-impute missing values: statistics are learned from the training split
# only and then applied to all three splits.
imputer = SimpleImputer(strategy='mean').fit(X_train_sparse)
X_train_sparse = imputer.transform(X_train_sparse)
X_val_sparse = imputer.transform(X_val_sparse)
X_test_sparse = imputer.transform(X_test_sparse)

# Fit CatBoost: 500 iterations, depth-6 trees, learning rate 0.1
cat_model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    verbose=100,
    random_state=42,
)
cat_model.fit(X_train_sparse, y_train)

# Validation predictions, per-class report and confusion matrix
y_val_pred_cat = cat_model.predict(X_val_sparse)
print(
    "CatBoost - Validation Performance",
    classification_report(y_val, y_val_pred_cat),
    "Confusion Matrix:",
    confusion_matrix(y_val, y_val_pred_cat),
    sep="\n",
)


In [None]:
# Score the fitted CatBoost model on the test split
y_test_pred_cat = cat_model.predict(X_test_sparse)

print(
    "CatBoost - Test Performance",
    classification_report(y_test, y_test_pred_cat),
    "Confusion Matrix:",
    confusion_matrix(y_test, y_test_pred_cat),
    sep="\n",
)


In [None]:
import joblib

# Persist the fitted CatBoost classifier
joblib.dump(value=cat_model, filename='path/to/catboost_best_model.pkl')

print("CatBoost model saved successfully: catboost_best_model.pkl")
