<a href="https://colab.research.google.com/github/BharathiPriyadarshini/MLproject/blob/main/TNBC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load dataset
file_path = "/content/METABRIC_RNA_Mutation.csv"  # Update if needed
df = pd.read_csv(file_path, low_memory=False)

# Display basic info
print("✅ Dataset Loaded Successfully!")
print("\n🔹 Dataset Shape:", df.shape)  # Number of rows and columns
print("\n🔹 First 5 Rows:\n", df.head())  # Preview data
print("\n🔹 Column Names:\n", df.columns)  # List column names
print("\n🔹 Data Types:\n", df.dtypes)  # Check data types
print("\n🔹 Missing Values Per Column:\n", df.isnull().sum().sort_values(ascending=False))  # Check missing values
print("\n🔹 Summary Statistics:\n", df.describe())

✅ Dataset Loaded Successfully!

🔹 Dataset Shape: (1904, 693)

🔹 First 5 Rows:
    patient_id  age_at_diagnosis type_of_breast_surgery    cancer_type  \
0           0             75.65             MASTECTOMY  Breast Cancer   
1           2             43.19      BREAST CONSERVING  Breast Cancer   
2           5             48.87             MASTECTOMY  Breast Cancer   
3           6             47.68             MASTECTOMY  Breast Cancer   
4           8             76.97             MASTECTOMY  Breast Cancer   

                        cancer_type_detailed cellularity  chemotherapy  \
0           Breast Invasive Ductal Carcinoma         NaN             0   
1           Breast Invasive Ductal Carcinoma        High             0   
2           Breast Invasive Ductal Carcinoma        High             1   
3  Breast Mixed Ductal and Lobular Carcinoma    Moderate             1   
4  Breast Mixed Ductal and Lobular Carcinoma        High             1   

  pam50_+_claudin-low_subtype  cohort

In [None]:
# Convert mutation columns to numeric (force errors to NaN)
mutation_cols = ['hras_mut', 'prps2_mut', 'smarcb1_mut', 'stmn2_mut', 'siah1_mut']
df[mutation_cols] = df[mutation_cols].apply(pd.to_numeric, errors='coerce')

# Fill missing values
df.fillna({"cellularity": "Unknown", "tumor_stage": "Unknown", "primary_tumor_laterality": "Unknown"}, inplace=True)

# Drop columns with excessive missing values (e.g., more than 30%)
missing_threshold = 0.3 * len(df)  # 30% of total rows
df.dropna(thresh=missing_threshold, axis=1, inplace=True)

# Display updated info
print("\n🔹 Updated Data Types:\n", df.dtypes)
print("\n🔹 Missing Values After Processing:\n", df.isnull().sum().sort_values(ascending=False))


🔹 Updated Data Types:
 patient_id                  int64
age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
                           ...   
hras_mut                  float64
prps2_mut                 float64
smarcb1_mut               float64
stmn2_mut                 float64
siah1_mut                 float64
Length: 693, dtype: object

🔹 Missing Values After Processing:
 3-gene_classifier_subtype    204
neoplasm_histologic_grade     72
mutation_count                45
er_status_measured_by_ihc     30
type_of_breast_surgery        22
                            ... 
mmp16                          0
mmp17                          0
mmp19                          0
mmp2                           0
mmp1                           0
Length: 693, dtype: int64


In [None]:
# Convert mutation columns from float64 to int64
mutation_cols = ['hras_mut', 'prps2_mut', 'smarcb1_mut', 'stmn2_mut', 'siah1_mut']
df[mutation_cols] = df[mutation_cols].astype('Int64')  # Keeps NaN support

# Handle missing values
df['neoplasm_histologic_grade'].fillna(df['neoplasm_histologic_grade'].mode()[0], inplace=True)
df['mutation_count'].fillna(df['mutation_count'].median(), inplace=True)
df['er_status_measured_by_ihc'].fillna(df['er_status_measured_by_ihc'].mode()[0], inplace=True)
df['type_of_breast_surgery'].fillna(df['type_of_breast_surgery'].mode()[0], inplace=True)

# Drop '3-gene_classifier_subtype' if it's not needed
df.drop(columns=['3-gene_classifier_subtype'], inplace=True)

# Display final missing values summary
print("\n🔹 Final Missing Values:\n", df.isnull().sum().sort_values(ascending=False))


🔹 Final Missing Values:
 tumor_size                        20
oncotree_code                     15
cancer_type_detailed              15
tumor_other_histologic_subtype    15
smarcb1_mut                        2
                                  ..
mmp11                              0
mmp12                              0
mmp13                              0
mmp14                              0
mapk7                              0
Length: 692, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['neoplasm_histologic_grade'].fillna(df['neoplasm_histologic_grade'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['mutation_count'].fillna(df['mutation_count'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method wil

In [None]:
# Convert mutation columns from float64 to int64
mutation_cols = ['hras_mut', 'prps2_mut', 'smarcb1_mut', 'stmn2_mut', 'siah1_mut']
df[mutation_cols] = df[mutation_cols].astype('Int64')  # Keeps NaN support

# Fix missing values properly
df['neoplasm_histologic_grade'] = df['neoplasm_histologic_grade'].fillna(df['neoplasm_histologic_grade'].mode()[0])
df['mutation_count'] = df['mutation_count'].fillna(df['mutation_count'].median())
df['er_status_measured_by_ihc'] = df['er_status_measured_by_ihc'].fillna(df['er_status_measured_by_ihc'].mode()[0])
df['type_of_breast_surgery'] = df['type_of_breast_surgery'].fillna(df['type_of_breast_surgery'].mode()[0])

# Handle remaining missing values
for col in ['tumor_size', 'tumor_other_histologic_subtype', 'cancer_type_detailed', 'oncotree_code']:
    df[col] = df[col].fillna(df[col].mode()[0])  # Fill with most common value

# Display final missing values summary
print("\n✅ Final Missing Values:\n", df.isnull().sum().sort_values(ascending=False))



✅ Final Missing Values:
 stmn2_mut            2
smarcb1_mut          2
prps2_mut            2
hras_mut             2
death_from_cancer    1
                    ..
mmp1                 0
mmp10                0
mmp11                0
mmp12                0
mapk4                0
Length: 692, dtype: int64


In [None]:
# Fill remaining mutation columns with 0 (assuming missing means no mutation)
mutation_cols = ['stmn2_mut', 'smarcb1_mut', 'prps2_mut', 'hras_mut', 'siah1_mut']
df[mutation_cols] = df[mutation_cols].fillna(0).astype('Int64')  # Convert to integer

# Final check
print("\n✅ Final Missing Values:\n", df.isnull().sum().sort_values(ascending=False))


✅ Final Missing Values:
 death_from_cancer    1
usp9x                0
utrn                 0
zfp36l1              0
ackr3                0
                    ..
mmp1                 0
mmp10                0
mmp11                0
mmp12                0
mapk4                0
Length: 692, dtype: int64


In [None]:
print("\n🔹 Unique values in target column:\n", df["pam50_+_claudin-low_subtype"].value_counts())



🔹 Unique values in target column:
 pam50_+_claudin-low_subtype
LumA           679
LumB           461
Her2           220
claudin-low    199
Basal          199
Normal         140
NC               6
Name: count, dtype: int64


In [None]:
# Create a binary target variable (1 for TNBC, 0 for Non-TNBC)
df["tnbc_label"] = (df["pam50_+_claudin-low_subtype"] == "Basal").astype(int)

# Now, use this new column as 'y'
X = df.drop(columns=["pam50_+_claudin-low_subtype", "tnbc_label"])  # Drop non-numeric columns
y = df["tnbc_label"]  # Use the new binary target

print("\n✅ Class Distribution (TNBC vs. Non-TNBC):\n", y.value_counts())


✅ Class Distribution (TNBC vs. Non-TNBC):
 tnbc_label
0    1705
1     199
Name: count, dtype: int64


In [None]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Drop columns with NaN values
X = X.dropna(axis=1)

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Convert mixed-type columns to string
X[categorical_cols] = X[categorical_cols].astype(str)

# Apply Label Encoding to categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Store encoder for future use

# Now remove constant features (zero variance)
variance_filter = VarianceThreshold(threshold=0)
X = pd.DataFrame(variance_filter.fit_transform(X), columns=X.columns[variance_filter.get_support()])

print("✅ Constant features removed. Remaining features:", X.shape[1])

# Apply SelectKBest to choose the top 10 features
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]
print("✅ Selected Features:\n", selected_features)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[categorical_cols] = X[categorical_cols].astype(str)


✅ Constant features removed. Remaining features: 685
✅ Selected Features:
 Index(['er_status_measured_by_ihc', 'er_status', 'ccne1', 'cdc25a', 'cdkn2a',
       'e2f3', 'chek1', 'gata3', 'map2', 'ttyh1'],
      dtype='object')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train, y_train)

# Predict on test set
y_pred = log_reg.predict(X_test)

# Evaluate model
print("✅ Logistic Regression Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

✅ Logistic Regression Model Performance:
Accuracy: 0.9238845144356955
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       344
           1       0.64      0.49      0.55        37

    accuracy                           0.92       381
   macro avg       0.79      0.73      0.76       381
weighted avg       0.92      0.92      0.92       381



In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# 1️⃣ Split the dataset before applying SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 2️⃣ Apply SMOTE only on training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# 🔹 NEW: Standardize features (Scaling)
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)  # Fit & transform train data
X_test = scaler.transform(X_test)  # Transform test data (DO NOT fit again)

# Check new class distribution
print("✅ Class Distribution After SMOTE:\n", pd.Series(y_train_balanced).value_counts())

# 3️⃣ Train Logistic Regression on balanced & scaled data
logreg = LogisticRegression(max_iter=5000, class_weight="balanced", random_state=42)
logreg.fit(X_train_balanced, y_train_balanced)

# 4️⃣ Test on the original test set (scaled)
y_pred_logreg = logreg.predict(X_test)

# 5️⃣ Evaluate performance
print("✅ Logistic Regression Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

✅ Class Distribution After SMOTE:
 tnbc_label
0    1364
1    1364
Name: count, dtype: int64
✅ Logistic Regression Model Performance:
Accuracy: 0.9291338582677166
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       341
           1       0.67      0.65      0.66        40

    accuracy                           0.93       381
   macro avg       0.81      0.81      0.81       381
weighted avg       0.93      0.93      0.93       381



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1️⃣ Train Random Forest on balanced data
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf.fit(X_train_balanced, y_train_balanced)

# 2️⃣ Test on original test set
y_pred_rf = rf.predict(X_test)

# 3️⃣ Evaluate performance
print("✅ Random Forest Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

✅ Random Forest Model Performance:
Accuracy: 0.9606299212598425
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       341
           1       0.84      0.78      0.81        40

    accuracy                           0.96       381
   macro avg       0.91      0.88      0.89       381
weighted avg       0.96      0.96      0.96       381



In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# 1️⃣ Create an SVM pipeline with Standardization
svm_model = Pipeline([
    ('scaler', StandardScaler()),  # Feature Scaling
    ('svm', SVC(kernel='linear', class_weight='balanced', random_state=42))
])

# 2️⃣ Train on the SMOTE-balanced dataset
svm_model.fit(X_train_balanced, y_train_balanced)

# 3️⃣ Predict on the original test set
y_pred_svm = svm_model.predict(X_test)

# 4️⃣ Evaluate performance
print("✅ SVM (Linear) Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

✅ SVM (Linear) Model Performance:
Accuracy: 0.9291338582677166
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       341
           1       0.69      0.60      0.64        40

    accuracy                           0.93       381
   macro avg       0.82      0.78      0.80       381
weighted avg       0.93      0.93      0.93       381



In [None]:
from sklearn.linear_model import LogisticRegression

lasso = LogisticRegression(penalty="l1", solver="liblinear", max_iter=5000)
lasso.fit(X_train_balanced, y_train_balanced)

y_pred_lasso = lasso.predict(X_test)

print("✅ Lasso Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_lasso))
print(classification_report(y_test, y_pred_lasso))

✅ Lasso Logistic Regression Performance:
Accuracy: 0.9343832020997376
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       341
           1       0.67      0.75      0.71        40

    accuracy                           0.93       381
   macro avg       0.82      0.85      0.83       381
weighted avg       0.94      0.93      0.94       381



In [None]:
from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier(alpha=1.0)  # Alpha controls the regularization strength
ridge.fit(X_train_balanced, y_train_balanced)

y_pred_ridge = ridge.predict(X_test)

print("✅ Ridge Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_ridge))
print(classification_report(y_test, y_pred_ridge))


✅ Ridge Classifier Performance:
Accuracy: 0.9028871391076115
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       341
           1       0.53      0.62      0.57        40

    accuracy                           0.90       381
   macro avg       0.74      0.78      0.76       381
weighted avg       0.91      0.90      0.91       381

