<a href="https://colab.research.google.com/github/21zaimotman-tech/Drug-Sensitivity-Prediction-to-Treat-Breast-Cancer/blob/main/Drug_Sensitivity_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install category_encoders xgboost lightgbm lazypredict


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from scipy import stats
from scipy.stats import skew, boxcox
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import TargetEncoder
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")


Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting lazypredict
  Downloading lazypredict-0.2.13-py2.py3-none-any.whl.metadata (12 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict, category_encoders
Successfully installed category_encoders-2.8.1 lazypredict-0.2.13


In [None]:
# Load the raw table that the rest of the notebook filters and models.
# NOTE(review): the file name does not look like a GDSC export, while later cells
# expect GDSC-style columns (TCGA_DESC, AUC, Z_SCORE, ...) - confirm the path.
DATA_PATH = 'aqidaily2024.csv'
gdsc_data = pd.read_csv(DATA_PATH)


In [None]:
# Display the column index of the raw table to see which fields are available.
gdsc_data.columns

In [None]:
# Keep every row labelled 'BRCA' in either of the two TCGA label columns.
# Missing labels count as "no match" (na=False).
is_brca_tcga = gdsc_data['TCGA_DESC'].str.contains('BRCA', na=False)
is_brca_cancer_type = gdsc_data['Cancer Type (matching TCGA label)'].str.contains('BRCA', na=False)
breast_cancer_data = gdsc_data[is_brca_tcga | is_brca_cancer_type]

In [None]:
# Remove rows that are exact duplicates across all columns.
breast_cancer_data = breast_cancer_data.drop_duplicates()

In [None]:
# Print the column index of the filtered frame.
print(breast_cancer_data.columns)


In [None]:
# Histogram (30 bins) with a KDE overlay for the Z_SCORE column.
z_scores = breast_cancer_data['Z_SCORE']
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(z_scores, kde=True, color="green", bins=30, ax=ax)
ax.set_title("Distribution of Z_SCORE")
ax.set_xlabel("Z_SCORE")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Display summary statistics of the AUC column.
breast_cancer_data['AUC'].describe()

In [None]:
# Per-column count of missing values, followed by a row-by-column map of where they occur.
null_mask = breast_cancer_data.isnull()
missing_values = null_mask.sum()
print(missing_values)
ax = sns.heatmap(null_mask, cbar=False, cmap='viridis')
plt.show()

In [None]:
# Fill gaps in the float64/int64 columns with a 5-nearest-neighbour imputer.
# `numeric_columns` is reused by the scaling cell further down.
# NOTE(review): columns are selected purely by dtype, so any numeric identifier
# columns and the AUC target would be imputed too, and the imputer is fitted on
# the whole table before the train/test split - confirm this is intended.
numeric_columns = breast_cancer_data.select_dtypes(include=['float64', 'int64']).columns
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(breast_cancer_data[numeric_columns])
breast_cancer_data[numeric_columns] = imputed_values

In [None]:
# Replace missing categorical labels with an explicit 'Unknown' category.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on a
# column selection: that chained in-place form emits a FutureWarning in pandas 2.x
# and stops modifying the frame once copy-on-write is enabled.
breast_cancer_data = breast_cancer_data.fillna({
    'Microsatellite instability Status (MSI)': 'Unknown',
    'TARGET': 'Unknown',
})


In [None]:
# Repeat the missing-value report to check the result of the imputation steps.
null_mask = breast_cancer_data.isnull()
missing_values = null_mask.sum()
print(missing_values)
ax = sns.heatmap(null_mask, cbar=False, cmap='viridis')
plt.show()

In [None]:

# Columns that are converted to integer codes with a LabelEncoder.
categorical_columns = [
    'CELL_LINE_NAME', 'TCGA_DESC', 'DRUG_NAME', 'GDSC Tissue descriptor 1',
    'GDSC Tissue descriptor 2', 'Cancer Type (matching TCGA label)',
    'Microsatellite instability Status (MSI)', 'Screen Medium',
    'Growth Properties', 'CNA', 'Gene Expression', 'Methylation',
    'TARGET', 'TARGET_PATHWAY',
]

# Keep each fitted encoder so the integer codes can be mapped back to the original
# labels later, e.g. label_encoders['TARGET'].inverse_transform(codes).
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    breast_cancer_data[col] = encoder.fit_transform(breast_cancer_data[col])
    label_encoders[col] = encoder

In [None]:
# Pairwise correlations between the columns of the encoded table.
# numeric_only=True skips any column that is still non-numeric; without it,
# pandas >= 2.0 raises on such a column instead of silently dropping it.
corr_matrix = breast_cancer_data.corr(numeric_only=True)

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Report any column whose dtype is neither float64 nor int64.
NUMERIC_DTYPES = ['float64', 'int64']
non_numeric_cols = breast_cancer_data.select_dtypes(exclude=NUMERIC_DTYPES).columns
print("Non-numeric columns:", non_numeric_cols)

In [None]:
# Rescale the columns captured in `numeric_columns` (by the imputation cell) with a MinMaxScaler.
# NOTE(review): the scaler is fitted on the full table before the train/test split,
# and `numeric_columns` was chosen by dtype, so it presumably includes the AUC
# target - confirm this is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(breast_cancer_data[numeric_columns])
breast_cancer_data[numeric_columns] = scaled_values

In [None]:
# Box plot of AUC grouped by the (label-encoded) MSI status.
ax = sns.boxplot(data=breast_cancer_data, x="Microsatellite instability Status (MSI)", y="AUC")
ax.set_title("Impact of MSI Status on AUC")
plt.show()


In [None]:
# Remove the LN_IC50 column before modelling.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
breast_cancer_data = breast_cancer_data.drop(columns=['LN_IC50'], errors='ignore')

print("Updated dataset shape:", breast_cancer_data.shape)


In [None]:
# AUC is the regression target; every other remaining column is used as a feature.
y = breast_cancer_data['AUC']
X = breast_cancer_data.drop(columns=['AUC'])

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

for split in (X_train, X_test):
    print(split.shape)


In [None]:
# Fit LazyRegressor's collection of regressors on the split and print the resulting table.
# train_test_split, mean_squared_error and r2_score are already imported in the
# notebook's first cell, so only the lazypredict import is needed here.
from lazypredict.Supervised import LazyRegressor

lazy_regressor = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy_regressor.fit(X_train, X_test, y_train, y_test)
print(models)


In [None]:

# Baseline XGBoost regressor with fixed hyper-parameters.
xgb = XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=6)
xgb.fit(X_train, y_train)

y_pred_train = xgb.predict(X_train)
y_pred_test = xgb.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated and later removed from scikit-learn,
# so this form works on both old and new versions.
print("XGBoost - Training Performance:")
print("R^2 Score:", r2_score(y_train, y_pred_train))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))

print("\nXGBoost - Testing Performance:")
print("R^2 Score:", r2_score(y_test, y_pred_test))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))


In [None]:
# Sanity check: the model's importance vector should have one entry per training column.
_, n_train_features = X_train.shape
n_importances = len(xgb.feature_importances_)
print("Number of features in X_train:", n_train_features)
print("Number of features in feature importance:", n_importances)


In [None]:
# Importance scores of the fitted baseline model, ordered ascending so the
# largest bar is drawn at the top of the horizontal chart.
feature_importances = xgb.feature_importances_
sorted_idx = np.argsort(feature_importances)

# Take as many column names as there are importance scores.
columns = X_train.columns[:len(feature_importances)]

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(columns[sorted_idx], feature_importances[sorted_idx], color="skyblue")
ax.set_xlabel("Feature Importance")
ax.set_title("XGBoost Feature Importance")
fig.tight_layout()
plt.show()


In [None]:
# Drop features judged to contribute little to the model.
low_importance_features = ['GDSC Tissue descriptor 1', 'CNA', 'TCGA_DESC']  # Example
# errors='ignore' makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of a KeyError.
X_train = X_train.drop(columns=low_importance_features, errors='ignore')
X_test = X_test.drop(columns=low_importance_features, errors='ignore')


In [None]:
# X_train / X_test no longer contain the dropped low-importance columns, while
# `xgb` was fitted on the full feature set; predicting directly would fail on a
# feature mismatch. Refit on the reduced feature set before evaluating.
xgb.fit(X_train, y_train)

# RMSE is computed as sqrt(MSE) because the `squared=False` argument of
# mean_squared_error has been removed from recent scikit-learn releases.
y_pred_train = xgb.predict(X_train)
print("Training Performance:")
print("R^2 Score:", r2_score(y_train, y_pred_train))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))


y_pred_test = xgb.predict(X_test)
print("\nTesting Performance:")
print("R^2 Score:", r2_score(y_test, y_pred_test))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))


In [None]:
# Show the feature names recorded by the fitted booster.
booster = xgb.get_booster()
trained_features = booster.feature_names
print("Features used in the model:", trained_features)

In [None]:

# Exhaustive search over 3 x 3 x 3 hyper-parameter combinations,
# scored by R^2 with 3-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

base_model = XGBRegressor(random_state=42)
grid_search = GridSearchCV(base_model, param_grid, scoring='r2', cv=3)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Estimator from the search with the best cross-validated score.
best_xgb = grid_search.best_estimator_


In [None]:

# Build the tuned model from the hyper-parameters selected by the grid search
# rather than hard-coding them, so the two cannot drift apart when the data or
# the search grid changes.
optimized_xgb = XGBRegressor(random_state=42, **grid_search.best_params_)

optimized_xgb.fit(X_train, y_train)

y_pred_train = optimized_xgb.predict(X_train)
y_pred_test = optimized_xgb.predict(X_test)


In [None]:

# Report fit quality of the tuned model on both splits.
# RMSE is computed as sqrt(MSE) because the `squared=False` argument of
# mean_squared_error has been removed from recent scikit-learn releases.
print("Optimized XGBoost - Training Performance:")
print("R^2 Score:", r2_score(y_train, y_pred_train))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))


print("\nOptimized XGBoost - Testing Performance:")
print("R^2 Score:", r2_score(y_test, y_pred_test))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))


In [None]:

# Importance scores of the tuned model, ordered from most to least important.
xgb_importance = optimized_xgb.feature_importances_
sorted_idx = np.argsort(xgb_importance)[::-1]

bar_positions = range(len(xgb_importance))
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, xgb_importance[sorted_idx])
ax.set_xticks(bar_positions)
ax.set_xticklabels(X_train.columns[sorted_idx], rotation=90)
ax.set_title("Optimized XGBoost Feature Importance")
plt.show()


In [None]:
# Histogram (30 bins) with KDE overlay of the training target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(y_train, kde=True, bins=30, ax=ax)
ax.set_title('Distribution of AUC')
ax.set_xlabel('AUC')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Side-by-side box plots of two training features on a shared axis.
selected_features = X_train[['Z_SCORE', 'TARGET']]
ax = sns.boxplot(data=selected_features)
ax.set_title('Feature Distributions')
plt.show()


In [None]:
# Row counts per TARGET_PATHWAY value, most frequent first.
# At this point the column holds the label-encoded values.
pathway_values = breast_cancer_data['TARGET_PATHWAY']
pathway_order = pathway_values.value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y=pathway_values, order=pathway_order, ax=ax)
ax.set_title("Distribution of Encoded TARGET_PATHWAY")
ax.set_xlabel("Count")
ax.set_ylabel("Categories")
plt.show()


In [None]:
# One box plot per feature, drawn in a 1 x 2 grid.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panels = [
    ('Z_SCORE', 'blue', 'Z_SCORE Distribution'),
    ('TARGET', 'orange', 'TARGET Distribution'),
]
for ax, (feature, colour, panel_title) in zip(axes, panels):
    sns.boxplot(data=X_train, y=feature, color=colour, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()


In [None]:
# Annotated heatmap of pairwise correlations between the training features.
corr_matrix = X_train.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()


In [None]:
# Scatter of tuned-model predictions against the held-out targets.
predicted_auc = optimized_xgb.predict(X_test)
actual_range = [y_test.min(), y_test.max()]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, predicted_auc, alpha=0.6)
ax.plot(actual_range, actual_range, 'r--')  # Perfect prediction line
ax.set_title('Predicted vs Actual AUC')
ax.set_xlabel('Actual AUC')
ax.set_ylabel('Predicted AUC')
plt.show()


In [None]:
# Residual = actual minus predicted on the test split; shown as a histogram with KDE.
test_predictions = optimized_xgb.predict(X_test)
residuals = y_test - test_predictions

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals, kde=True, bins=30, ax=ax)
ax.set_title('Residual Distribution')
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
results = grid_search.cv_results_

# cv_results_ has one row per hyper-parameter combination, so each n_estimators
# value appears several times (once per learning_rate / max_depth pair). Plotting
# the raw rows as one line produces a meaningless zig-zag; instead average the
# cross-validated R² over the other parameters for each n_estimators value.
mean_r2_by_n_estimators = (
    pd.DataFrame({
        'n_estimators': list(results['param_n_estimators']),
        'mean_test_score': results['mean_test_score'],
    })
    .groupby('n_estimators')['mean_test_score']
    .mean()
)

plt.figure(figsize=(8, 6))
plt.plot(mean_r2_by_n_estimators.index, mean_r2_by_n_estimators.values, marker='o')
plt.title('Effect of n_estimators on R²')
plt.xlabel('n_estimators')
plt.ylabel('R²')
plt.show()


In [None]:
# Reference table: the key dataset columns and the role each one plays in this analysis.
data = {
    "Column": [
        "COSMIC_ID", "CELL_LINE_NAME", "TCGA_DESC", "DRUG_ID", "DRUG_NAME",
        "AUC", "Z_SCORE", "GDSC Tissue descriptor 1/2", "MSI", "CNA",
        "Gene Expression", "Methylation", "TARGET", "TARGET_PATHWAY"
    ],
    "Description": [
        "Links cell lines across datasets. Used for data organization but not directly included in modeling.",
        "Descriptive identifier for each cell line (e.g., 'HCC1954').",
        "Specifies tissue type, confirming breast cancer data in all rows.",
        "Unique identifier for drugs, essential for grouping drug response data.",
        "Identifies specific drugs, useful for reporting and grouping.",
        "Primary target variable measuring drug sensitivity.",
        "Secondary target variable for drug response standardization.",
        "Contextual information on tissue types (redundant after filtering for breast cancer).",
        "Microsatellite instability status, which could affect drug sensitivity.",
        "Binary genomic feature indicating copy number alterations.",
        "Binary feature representing gene expression data availability.",
        "Binary feature indicating methylation data presence.",
        "Molecular drug target, critical for linking drug mechanisms to cellular pathways.",
        "Pathway associated with drug targets, providing biological insights."
    ]
}

# Create the DataFrame
df = pd.DataFrame(data)

# `ace_tools` is only available inside the ChatGPT sandbox and cannot be imported
# in Colab or a local Jupyter kernel, so render the table with the notebook's own
# display machinery. max_colwidth=None keeps the long descriptions from being truncated.
print("Key Dataset Columns and Descriptions")
with pd.option_context("display.max_colwidth", None):
    display(df)
