In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import learning_curve, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight


In [5]:
def data_preprocess(df):
    """Turn the raw table into a numeric feature frame and an encoded target.

    The caller's frame is not modified: the first ``drop`` returns a new
    DataFrame and every later assignment is made on that copy.

    Steps, in order:
      * label-encode ``cancer_type`` as the target and remove it from the features;
      * drop ``patient_id``;
      * map ``cellularity``, ``her2_status_measured_by_snp6`` and
        ``primary_tumor_laterality`` through fixed dictionaries;
      * label-encode the remaining status columns;
      * one-hot encode ``3-gene_classifier_subtype`` and ``death_from_cancer``;
      * impute ``tumor_size``, ``mutation_count``, ``neoplasm_histologic_grade``
        and ``tumor_stage``;
      * label-encode every column that is still of object dtype;
      * move the last seven columns to positions 2..8.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing at least the columns referenced by name below.

    Returns
    -------
    tuple
        ``(features, y)`` where ``features`` is the transformed DataFrame and
        ``y`` is a ``pandas.Series`` of integer class codes for ``cancer_type``.

    Raises
    ------
    KeyError
        If one of the columns referenced by name is missing from ``df``.
    """
    # Target: integer codes assigned by LabelEncoder to the cancer_type values.
    y = pd.Series(LabelEncoder().fit_transform(df["cancer_type"]))
    df = df.drop('cancer_type', axis=1)

    # Ordinal encoding for cellularity; missing/unmapped values become 0.
    cellularity_mapping = {
        'Low': 1,
        'Moderate': 2,
        'High': 3,
    }
    df['cellularity'] = (
        df['cellularity'].str.strip().map(cellularity_mapping).fillna(0)
    )

    # patient_id is an identifier, not a feature.
    df = df.drop('patient_id', axis=1)

    # Plain label encoding. A fresh encoder per column is equivalent to
    # re-fitting a shared one, because fit_transform refits from scratch.
    label_encoded_columns = [
        'pam50_+_claudin-low_subtype',
        'er_status',
        'er_status_measured_by_ihc',
        'her2_status',
        'inferred_menopausal_state',
        'pr_status',
    ]
    for column in label_encoded_columns:
        df[column] = LabelEncoder().fit_transform(df[column])

    # NOTE(review): 'GAIN' maps to 3 (2 is skipped) and 'UNDEF' shares the
    # 'NEUTRAL' code — confirm both are intentional. Values outside this
    # mapping (including missing ones) stay NaN because there is no fillna here.
    her2_mapping = {
        'LOSS': 0,
        'NEUTRAL': 1,
        'GAIN': 3,
        'UNDEF': 1,
    }
    df['her2_status_measured_by_snp6'] = (
        df['her2_status_measured_by_snp6'].str.strip().map(her2_mapping)
    )

    # Signed encoding for laterality; missing/unmapped values become 0.
    map_laterality = {
        'Right': 1,
        'Left': -1,
    }
    df['primary_tumor_laterality'] = (
        df['primary_tumor_laterality'].str.strip().map(map_laterality).fillna(0)
    )

    # One-hot encoding; get_dummies appends the indicator columns at the end
    # of the frame, which is what the reordering step below relies on.
    df = pd.get_dummies(df, columns=['3-gene_classifier_subtype'])
    df = pd.get_dummies(df, columns=['death_from_cancer'])

    # Mean imputation for the continuous columns.
    df['tumor_size'] = df['tumor_size'].fillna(df['tumor_size'].mean())
    df['mutation_count'] = df['mutation_count'].fillna(df['mutation_count'].mean())

    # NOTE(review): 3 is a hard-coded fill value — presumably the most common
    # grade; confirm against the data.
    df['neoplasm_histologic_grade'] = df['neoplasm_histologic_grade'].fillna(3)

    # Mode imputation for tumor_stage. The result is assigned back instead of
    # calling fillna(..., inplace=True) on the column selection: pandas warns
    # that the in-place form on an intermediate object stops updating the
    # frame in pandas 3.0, which would leave the NaNs in place.
    majority_value = df['tumor_stage'].mode()[0]
    df['tumor_stage'] = df['tumor_stage'].fillna(majority_value)
    df['tumor_stage'] = LabelEncoder().fit_transform(df['tumor_stage'])

    # Any column that is still object-typed is label-encoded on its string form.
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = LabelEncoder().fit_transform(df[column].astype(str))

    # Move the trailing columns so they sit right after the first two.
    # The frame is rebuilt as [first 2] + [last 7] + [everything from column 2
    # on], and the now-duplicated trailing copy of those 7 columns is cut off.
    # NOTE(review): 7 presumably equals the number of one-hot columns created
    # above (they are the last ones appended) — confirm against the data.
    n_moved = 7
    trailing = df.iloc[:, -n_moved:]
    leading = df.iloc[:, :2]
    remainder = df.iloc[:, 2:]
    df = pd.concat([leading, trailing, remainder], axis=1)
    df = df.iloc[:, :-n_moved]

    return df, y









In [6]:
# Read the raw table from data.csv and split it into the transformed feature
# frame X and the label-encoded cancer_type target y (see data_preprocess).
df = pd.read_csv('data.csv')
X, y = data_preprocess(df)

  df = pd.read_csv('data.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['tumor_stage'].fillna(majority_value, inplace=True)


In [7]:
# Hold out 10% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): the split is not stratified, so with imbalanced classes a rare
# class may end up only in one of the two parts — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [8]:
# 'balanced' class weights computed from the training labels only.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
# Key each weight by its actual class label. compute_class_weight returns the
# weights in the order of `classes`, so zipping with the same np.unique result
# keeps label and weight aligned even when the labels present in y_train are
# not exactly 0..k-1 (e.g. a rare class missing from the training split),
# which dict(enumerate(...)) would silently mis-key.
class_weight_dict = dict(zip(np.unique(y_train).tolist(), class_weights))

In [9]:
def warmup_classification(X_train, y_train, weights, multiclass=False):
    """Grid-search four baseline classifiers and return the best estimator of each.

    Every search uses 5-fold cross-validation scored by accuracy and is refit
    on the full training data (GridSearchCV's default), so the returned
    estimators are ready to predict.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    weights : dict or 'balanced' or None
        Passed as ``class_weight`` to the logistic regression, decision tree
        and random forest, and converted to per-sample weights for gradient
        boosting (which has no ``class_weight`` parameter).
    multiclass : bool, default False
        When truthy, the logistic regression is fit with
        ``multi_class='multinomial'``.

    Returns
    -------
    tuple
        ``(best_logistic, best_decision_tree, best_gradient_boosting,
        best_random_forest)``.
    """
    # Logistic regression
    param_grid_logistic = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        # liblinear has no multinomial backend: combining it with
        # multi_class='multinomial' makes every such fit fail and score NaN,
        # so it is only offered for the non-multinomial case.
        'solver': ['saga'] if multiclass else ['liblinear', 'saga'],
    }
    logistic_kwargs = dict(random_state=42, max_iter=1000, class_weight=weights)
    if multiclass:
        # NOTE(review): recent scikit-learn releases deprecate `multi_class`;
        # kept here to preserve the original model configuration.
        logistic_kwargs['multi_class'] = 'multinomial'
    grid_search_logistic = GridSearchCV(
        LogisticRegression(**logistic_kwargs),
        param_grid=param_grid_logistic,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search_logistic.fit(X_train, y_train)
    best_model_logistic = grid_search_logistic.best_estimator_

    # Decision tree
    param_grid_dt = {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(1, 20),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21)
    }
    grid_search_dt = GridSearchCV(
        DecisionTreeClassifier(random_state=42, class_weight=weights),
        param_grid=param_grid_dt,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search_dt.fit(X_train, y_train)
    best_model_dt = grid_search_dt.best_estimator_

    # Gradient Boosting: class imbalance is handled through per-sample
    # weights handed to fit(), since the estimator takes no class_weight.
    sample_weights = compute_sample_weight(class_weight=weights, y=y_train)
    param_grid_gb = {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 4],
        'min_samples_leaf': [1, 2]
    }
    grid_search_gb = GridSearchCV(
        GradientBoostingClassifier(random_state=42),
        param_grid=param_grid_gb,
        cv=5,
        n_jobs=-1,
        scoring='accuracy'
    )
    grid_search_gb.fit(X_train, y_train, sample_weight=sample_weights)
    best_model_gb = grid_search_gb.best_estimator_

    # Random forest
    param_grid_rf = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }
    grid_search_rf = GridSearchCV(
        RandomForestClassifier(random_state=42, class_weight=weights),
        param_grid=param_grid_rf,
        cv=5,
        # Same metric the classifier's default scorer uses; stated explicitly
        # so all four searches visibly optimise the same thing.
        scoring='accuracy',
        verbose=2,
        n_jobs=-1
    )
    grid_search_rf.fit(X_train, y_train)
    best_model_rf = grid_search_rf.best_estimator_

    return best_model_logistic, best_model_dt, best_model_gb, best_model_rf




In [10]:
# Tune the four baseline classifiers on the training split; the final True
# is the `multiclass` flag (multinomial logistic regression).
best_model_logistic, best_model_dt, best_model_gb, best_model_rf = warmup_classification(X_train, y_train,class_weight_dict, True)



In [12]:
def visualization_class(logistic = best_model_logistic, decision_tree=best_model_dt, gradient_boost=best_model_gb, random_forest=best_model_rf, X_train=X_train, y_train=y_train, X_val=X_test, y_val=y_test):
    """Print train and validation accuracy (in percent) for the four models.

    The defaults are the notebook-level models and splits as they existed
    when this cell was executed; Python binds default values at definition
    time, so re-run this cell after retraining or pass the objects explicitly.

    Parameters
    ----------
    logistic, decision_tree, gradient_boost, random_forest
        Fitted estimators exposing ``predict``.
    X_train, y_train
        Data the "train" figure is computed on.
    X_val, y_val
        Data the "val" figure is computed on.

    Returns
    -------
    None
        The figures are only printed, one bold-titled section per model.
    """
    # Report order and display titles, paired with the estimator to evaluate.
    labelled_models = (
        ("Logistic Regression", logistic),
        ("Decision Tree", decision_tree),
        ("Gradient Boost", gradient_boost),
        ("Random Forest", random_forest),
    )

    for title, estimator in labelled_models:
        fitted_on_train = estimator.predict(X_train)
        fitted_on_val = estimator.predict(X_val)
        train_accuracy = accuracy_score(fitted_on_train, y_train)
        val_accuracy = accuracy_score(fitted_on_val, y_val)
        # ANSI escape codes switch bold on and off around the model title.
        print("\033[1m" + title + "\033[0m" + ":")
        print('train: ', train_accuracy * 100, '%')
        print('val: ', val_accuracy * 100, '%')
    


    

In [13]:
# Report accuracy of the four tuned models using the default (notebook-level)
# models and train/test split bound into visualization_class.
visualization_class()

[1mLogistic Regression[0m:
train:  42.4962852897474 %
val:  36.0 %
[1mDecision Tree[0m:
train:  99.10846953937593 %
val:  62.0 %
[1mGradient Boost[0m:
train:  100.0 %
val:  74.0 %
[1mRandom Forest[0m:
train:  100.0 %
val:  78.66666666666666 %


In [9]:

# Randomized hyper-parameter search for a class-weighted SVM.
# NOTE(review): this cell's execution count is lower than the cells above it,
# so it was run out of order; it needs class_weight_dict, X_train and y_train
# from the earlier cells — confirm it works under Restart & Run All.
# Candidate values sampled by the search (4 * 5 * 4 = 80 combinations).
param_dist_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear']
}

# Base estimator; `svm` is rebound to the tuned model at the end of the cell.
svm = SVC(random_state=42,decision_function_shape='ovr', class_weight=class_weight_dict)
random_search_svm = RandomizedSearchCV(
    svm, 
    param_distributions=param_dist_svm, 
    n_iter=10,  # You can adjust the number of iterations
    refit=True, 
    verbose=3, 
    cv=5, 
    n_jobs=-1,
    random_state=42  # It's good to set a random_state for reproducibility
)
random_search_svm.fit(X_train, y_train)
# Keep only the best configuration, refit on the full training split.
svm = random_search_svm.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [10]:

# Accuracy of the tuned SVM on the training split and on the held-out split,
# printed in the same format as visualization_class.
predictions_train = svm.predict(X_train)
predictions_val = svm.predict(X_test)
svm_train = accuracy_score(predictions_train,y_train)
svm_val = accuracy_score(predictions_val, y_test)
# ANSI escape codes render the model name in bold.
print("\033[1m" + "SVM" + "\033[0m"+":")
print('train: ',svm_train*100,'%')
print('val: ',svm_val*100,'%')
    

[1mSVM[0m:
train:  100.0 %
val:  78.66666666666666 %
