In [39]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier 



In [40]:
!pip install lazypredict



# Read The Data

In [41]:
df = pd.read_csv('Data\BreasetConser.csv')

**shows the first 5 columns**  

In [42]:
df.head(5)

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,42,FEMALE,0.95,2.15,0.01,-0.05,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,2018-05-20,2018-08-26,Alive
1,54,FEMALE,0.0,1.38,-0.5,-0.51,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,2018-04-26,2019-01-25,Dead
2,63,FEMALE,-0.52,1.76,-0.37,0.01,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Lumpectomy,2018-08-24,2020-04-08,Alive
3,78,FEMALE,-0.88,0.13,-0.37,0.13,I,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,2018-11-16,2020-07-28,Alive
4,42,FEMALE,0.23,1.75,-0.54,-0.39,II,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,2018-12-12,2019-01-05,Alive


**Preparing X and Y** 

**Categories columns**

In [43]:
print("Categories in 'gender' variable:  ",end=" ")
print(df['Gender'].unique())
print("Categories in 'Tumor stage' variable:  ",end=" ")
print(df['Tumour_Stage'].unique())
print("Categories in 'Histology' variable:  ",end=" ")
print(df['Histology'].unique())
print("Categories in 'ER status' variable:  ",end=" ")
print(df['ER status'].unique())
print("Categories in 'PR status' variable:  ",end=" ")
print(df['PR status'].unique())
print("Categories in 'HER2 status' variable:  ",end=" ")
print(df['HER2 status'].unique())
print("Categories in 'Surgery Type' variable:  ",end=" ")
print(df['Surgery_type'].unique())

Categories in 'gender' variable:   ['FEMALE' 'MALE']
Categories in 'Tumor stage' variable:   ['II' 'I' 'III']
Categories in 'Histology' variable:   ['Infiltrating Ductal Carcinoma' 'Infiltrating Lobular Carcinoma'
 'Mucinous Carcinoma']
Categories in 'ER status' variable:   ['Positive']
Categories in 'PR status' variable:   ['Positive']
Categories in 'HER2 status' variable:   ['Negative' 'Positive']
Categories in 'Surgery Type' variable:   ['Other' 'Lumpectomy' 'Modified Radical Mastectomy' 'Simple Mastectomy']


In [44]:
df['Date_of_Last_Visit']=pd.to_datetime(df['Date_of_Last_Visit'])
df['Date_of_Surgery']= pd.to_datetime(df['Date_of_Surgery'])

In [45]:
df.head()

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,42,FEMALE,0.95,2.15,0.01,-0.05,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,2018-05-20,2018-08-26,Alive
1,54,FEMALE,0.0,1.38,-0.5,-0.51,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,2018-04-26,2019-01-25,Dead
2,63,FEMALE,-0.52,1.76,-0.37,0.01,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Lumpectomy,2018-08-24,2020-04-08,Alive
3,78,FEMALE,-0.88,0.13,-0.37,0.13,I,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,2018-11-16,2020-07-28,Alive
4,42,FEMALE,0.23,1.75,-0.54,-0.39,II,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,2018-12-12,2019-01-05,Alive


In [46]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [47]:
df_processing = df.drop(['ER status','PR status',],axis=1)

In [48]:
df_processing.sample(1)

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
113,74,FEMALE,-0.27,1.27,0.38,-0.01,II,Infiltrating Lobular Carcinoma,Negative,Modified Radical Mastectomy,2019-06-14,2019-06-20,Alive


In [49]:
#Encoder Tumor stage,histology and surgery type with ordinalencoder 

ordinal_encoder = OrdinalEncoder(categories=[['I','II','III'],
                                             ['Infiltrating Ductal Carcinoma','Infiltrating Lobular Carcinoma','Mucinous Carcinoma'],
                                             ['Other','Lumpectomy','Modified Radical Mastectomy','Simple Mastectomy']])

df_processing[['Tumour_Stage','Histology','Surgery_type']] = ordinal_encoder.fit_transform(df_processing[['Tumour_Stage','Histology','Surgery_type']])

In [50]:
df_processing.sample()

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
165,85,FEMALE,0.41,0.33,-0.26,-0.65,1.0,0.0,Negative,0.0,2018-11-06,2021-01-17,Alive


In [51]:
#Encoder Gender and HER2 stutus with getdummies (binary encoding)
df_processing = pd.get_dummies(data=df_processing,columns=['Gender','HER2 status'],drop_first=True)

In [52]:
df_processing.head()

Unnamed: 0,Age,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status,Gender_MALE,HER2 status_Positive
0,42,0.95,2.15,0.01,-0.05,1.0,0.0,0.0,2018-05-20,2018-08-26,Alive,False,False
1,54,0.0,1.38,-0.5,-0.51,1.0,0.0,0.0,2018-04-26,2019-01-25,Dead,False,False
2,63,-0.52,1.76,-0.37,0.01,1.0,0.0,1.0,2018-08-24,2020-04-08,Alive,False,False
3,78,-0.88,0.13,-0.37,0.13,0.0,0.0,0.0,2018-11-16,2020-07-28,Alive,False,False
4,42,0.23,1.75,-0.54,-0.39,1.0,0.0,1.0,2018-12-12,2019-01-05,Alive,False,True


In [53]:
#extract day,monthm,year from Date_of_Last_Visit
df_processing['year_Visit'] = df_processing['Date_of_Last_Visit'].dt.year
df_processing['month_Visit'] = df_processing['Date_of_Last_Visit'].dt.month
df_processing['day_Visit'] = df_processing['Date_of_Last_Visit'].dt.day
df_processing.drop(columns='Date_of_Last_Visit',axis=1,inplace=True)

#extract day,month,year from Date_of_Last_Visit
df_processing['year_Surgery'] = df_processing['Date_of_Surgery'].dt.year
df_processing['month_Surgery'] = df_processing['Date_of_Surgery'].dt.month
df_processing['day_Surgery'] = df_processing['Date_of_Surgery'].dt.day
df_processing.drop(columns='Date_of_Surgery',axis=1,inplace=True)


In [54]:
df_processing.head()

Unnamed: 0,Age,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,Surgery_type,Patient_Status,Gender_MALE,HER2 status_Positive,year_Visit,month_Visit,day_Visit,year_Surgery,month_Surgery,day_Surgery
0,42,0.95,2.15,0.01,-0.05,1.0,0.0,0.0,Alive,False,False,2018,8,26,2018,5,20
1,54,0.0,1.38,-0.5,-0.51,1.0,0.0,0.0,Dead,False,False,2019,1,25,2018,4,26
2,63,-0.52,1.76,-0.37,0.01,1.0,0.0,1.0,Alive,False,False,2020,4,8,2018,8,24
3,78,-0.88,0.13,-0.37,0.13,0.0,0.0,0.0,Alive,False,False,2020,7,28,2018,11,16
4,42,0.23,1.75,-0.54,-0.39,1.0,0.0,1.0,Alive,False,True,2019,1,5,2018,12,12


In [55]:
df_processing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   317 non-null    int64  
 1   Protein1              317 non-null    float64
 2   Protein2              317 non-null    float64
 3   Protein3              317 non-null    float64
 4   Protein4              317 non-null    float64
 5   Tumour_Stage          317 non-null    float64
 6   Histology             317 non-null    float64
 7   Surgery_type          317 non-null    float64
 8   Patient_Status        317 non-null    object 
 9   Gender_MALE           317 non-null    bool   
 10  HER2 status_Positive  317 non-null    bool   
 11  year_Visit            317 non-null    int32  
 12  month_Visit           317 non-null    int32  
 13  day_Visit             317 non-null    int32  
 14  year_Surgery          317 non-null    int32  
 15  month_Surgery         3

In [56]:
df_processing = df_processing.astype({'Gender_MALE':int,'HER2 status_Positive':int})

In [57]:
numerical_features = [col for col in df_processing.columns if df_processing[col].dtype in['int32','float64','int64']]
numerical_features

['Age',
 'Protein1',
 'Protein2',
 'Protein3',
 'Protein4',
 'Tumour_Stage',
 'Histology',
 'Surgery_type',
 'Gender_MALE',
 'HER2 status_Positive',
 'year_Visit',
 'month_Visit',
 'day_Visit',
 'year_Surgery',
 'month_Surgery',
 'day_Surgery']

In [58]:
numric_processeur = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy='mean')),
    ("scale",StandardScaler())
])
catg_processeur = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy='most_frequient')),
    ("onehot",OneHotEncoder(handle_unknown='ignore',sparse=False))
])

TypeError: __init__() got an unexpected keyword argument 'sparse'

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('nmeric',numric_processeur,numerical_features)

    ]
)

In [None]:
X = df_processing.drop(columns='Patient_Status',axis=1)
y= df_processing['Patient_Status']

In [None]:
from sklearn.model_selection import train_test_split 

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=32)

In [None]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.fit_transform(X_test)

In [None]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_test)

In [None]:
X_train.shape,X_test.shape

((253, 16), (64, 16))

In [None]:
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_score_val = r2_score(true,predicted) 

    return mae,rmse,r2_score_val


In [None]:
models = {
    "LogisticRegression":LogisticRegression(),
    "RandomForestClassifier":RandomForestClassifier(),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    'SVC':SVC(),
    'KNeighborsClassifier':KNeighborsClassifier(),
    'XGBClassifier':XGBClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "CatBoostClassifier":CatBoostClassifier(verbose=False),
}

In [None]:
model_list = []
r2_list = []
for i in range(len(list(models))): 
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    #make prediction 

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Evaluate Train and Test dataset: 
    
    model_train_mae , model_train_rmse ,model_train_r2 = evaluate_model(y_train,y_train_pred)
    model_test_mae , model_test_rmse ,model_test_r2 = evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i]) 
    model_list.append(list(models.keys())[i])

    #print('Model performance for training set')
    #print("- RMS Error : {:.2f}".format(model_train_rmse))
    #print("- MAE Error : {:.2f}".format(model_train_mae))
    #print("- R2 Score : {:.2f}".format(model_train_r2))
   

    print("------------------------------------------------")

    print('Model performance for test set')
    print("- RMS Error : {:.2f}".format(model_test_rmse))
    print("- MAE Error : {:.2f}".format(model_test_mae))
    print("- R2 Score : {:.2f}".format(model_test_r2))
    print("------------------------------------------------")



    



LogisticRegression
------------------------------------------------
Model performance for test set
- RMS Error : 0.47
- MAE Error : 0.22
- R2 Score : -0.44
------------------------------------------------
RandomForestClassifier
------------------------------------------------
Model performance for test set
- RMS Error : 0.45
- MAE Error : 0.20
- R2 Score : -0.33
------------------------------------------------
DecisionTreeClassifier
------------------------------------------------
Model performance for test set
- RMS Error : 0.60
- MAE Error : 0.36
- R2 Score : -1.36
------------------------------------------------
SVC
------------------------------------------------
Model performance for test set
- RMS Error : 0.43
- MAE Error : 0.19
- R2 Score : -0.23
------------------------------------------------
KNeighborsClassifier
------------------------------------------------
Model performance for test set
- RMS Error : 0.50
- MAE Error : 0.25
- R2 Score : -0.64
-----------------------------

In [None]:
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbos = 0,ignore_warning = True,custom_metric =None)
model,prediction = clf.fit(X_train,X_test,y_train,y_test)
print(prediction)

TypeError: __init__() got an unexpected keyword argument 'sparse'