##### Introduction 

- This is one of the many datasets from Curated Microarray Database (CuMiDa),solely for machine learning.
- The dataset below is a breat cancer gene expression dataset containing the following:

    - 6 classes
    - 54676 genes
    - 151 samples


In [1]:
import pandas as pd
import numpy as np

In [2]:
# loading the dataset
cancer_df= pd.read_csv('/media/danlof/dan files/data_science_codes/PROJECTS/Breast_GSE45827.csv')
cancer_df.head()

Unnamed: 0,samples,type,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,84,basal,9.85004,8.097927,6.424728,7.353027,3.029122,6.880079,4.96374,4.408328,...,12.229711,11.852955,13.658701,13.477698,6.265781,5.016196,4.901594,2.966657,3.508495,3.301999
1,85,basal,9.861357,8.212222,7.062593,7.685578,3.149468,7.542283,5.129607,4.584418,...,12.178531,11.809408,13.750086,13.470146,6.771853,5.291005,5.405839,2.934763,3.687666,3.064299
2,87,basal,10.103478,8.936137,5.73597,7.687822,3.125931,6.562369,4.813449,4.425195,...,12.125108,11.725766,13.621732,13.29508,6.346952,5.171403,5.184286,2.847684,3.550597,3.158535
3,90,basal,9.756875,7.357148,6.479183,6.986624,3.181638,7.802344,5.490982,4.567956,...,12.111235,11.719215,13.743108,13.508861,6.610284,5.193356,5.086569,3.031602,3.524981,3.272665
4,91,basal,9.40833,7.746404,6.69398,7.333426,3.169923,7.610457,5.372469,4.424426,...,12.173642,11.861296,13.797774,13.542206,6.414354,5.040202,5.235318,2.956232,3.445501,3.193947


In [3]:
# used to show the dataframe structure,no. of non-null values,
#data types of columns and memory usage
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Columns: 54677 entries, samples to AFFX-TrpnX-M_at
dtypes: float64(54675), int64(1), object(1)
memory usage: 63.0+ MB


In [4]:
# Understanding more on the type of the cancer 
# the breast cancers are group to either one of the following groups
# except the normal, with is not a type.

cancer_df.type.value_counts()

basal        41
HER          30
luminal_B    30
luminal_A    29
cell_line    14
normal        7
Name: type, dtype: int64

In [5]:
# checking to see if the type column has any missing values 
cancer_df.type.isnull().sum()

0

In [14]:
len(cancer_df)

151

In [11]:
from sklearn.preprocessing import OneHotEncoder
# drop the type column
df=cancer_df.drop('type',axis=1)
# since we got categorical values we could one encode them(norminal values)

dummy=pd.get_dummies(cancer_df[['type']])

# concatenate them 
cancer= pd.concat([dummy,df],axis=1)
cancer.head()


Unnamed: 0,type_HER,type_basal,type_cell_line,type_luminal_A,type_luminal_B,type_normal,samples,1007_s_at,1053_at,117_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,0,1,0,0,0,0,84,9.85004,8.097927,6.424728,...,12.229711,11.852955,13.658701,13.477698,6.265781,5.016196,4.901594,2.966657,3.508495,3.301999
1,0,1,0,0,0,0,85,9.861357,8.212222,7.062593,...,12.178531,11.809408,13.750086,13.470146,6.771853,5.291005,5.405839,2.934763,3.687666,3.064299
2,0,1,0,0,0,0,87,10.103478,8.936137,5.73597,...,12.125108,11.725766,13.621732,13.29508,6.346952,5.171403,5.184286,2.847684,3.550597,3.158535
3,0,1,0,0,0,0,90,9.756875,7.357148,6.479183,...,12.111235,11.719215,13.743108,13.508861,6.610284,5.193356,5.086569,3.031602,3.524981,3.272665
4,0,1,0,0,0,0,91,9.40833,7.746404,6.69398,...,12.173642,11.861296,13.797774,13.542206,6.414354,5.040202,5.235318,2.956232,3.445501,3.193947


In [60]:
# divide the data into test and train sets 
from sklearn.model_selection import train_test_split

# this separates the features X from the target variables y
X,y = cancer.iloc[:,6:].values,cancer.iloc[:,0:6].values

# split the dataset in the following subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                 test_size=0.3,stratify=y,
                                                 random_state=0)

#####  1.Implementing LDA via scikit_learn

In [61]:
from sklearn.preprocessing import StandardScaler

# simplify the scaler for easy usage
sc= StandardScaler()

#standardizing the train set
X_train_std=sc.fit_transform(X_train)

# standardize the test dataset
X_test_std=sc.transform(X_test)

In [62]:

# y_train to 1D array
y_t=np.argmax(y_train, axis=1)

In [63]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# set n_components to 5 because we want to classify this data set(c-1)
lda= LDA(n_components=2)

X_train_lda= lda.fit_transform(X_train_std,y_t)
X_test_lda = lda.transform(X_test)

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# train a random forest classifier

rfc= RandomForestClassifier(n_estimators=200,random_state=0)
# create a multioutput classifier

multi= MultiOutputClassifier(rfc)
# training
multi.fit(X_train_lda,y_train)

# make predictions 

y_pred= multi.predict(X_test_lda)

#accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


# Generate a classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.39
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.52      0.92      0.67        13
           2       1.00      1.00      1.00         4
           3       0.00      0.00      0.00         9
           4       0.22      0.22      0.22         9
           5       0.00      0.00      0.00         2

   micro avg       0.50      0.39      0.44        46
   macro avg       0.29      0.36      0.31        46
weighted avg       0.28      0.39      0.32        46
 samples avg       0.39      0.39      0.39        46



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
auc_y1 = roc_auc_score(y_test,y_pred)
auc_y1

0.6352313852313852

In [66]:
from sklearn.pipeline import Pipeline
# checking for the case of XGBoost classifier

classifier = MultiOutputClassifier(XGBClassifier())
clf = Pipeline([('classify', classifier)
               ])

#fitting

clf.fit(X_train_lda,y_train)

#make predictions

y_hat=clf.predict(X_test_lda)

#accuracy score
accuracy = accuracy_score(y_test, y_hat)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.24


In [67]:
from sklearn.metrics import roc_auc_score
auc_y1 = roc_auc_score(y_test,y_hat)
auc_y1

0.5858043358043358

In [68]:
# Generate a classification report
report = classification_report(y_test, y_hat)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.40      0.46      0.43        13
           2       1.00      1.00      1.00         4
           3       0.00      0.00      0.00         9
           4       0.10      0.11      0.11         9
           5       0.00      0.00      0.00         2

   micro avg       0.37      0.24      0.29        46
   macro avg       0.25      0.26      0.26        46
weighted avg       0.22      0.24      0.23        46
 samples avg       0.24      0.24      0.24        46



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
