In [None]:
import sys
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


In [None]:
def pre_processing_data(path):
  """Load the CSV at `path` and return model-ready attributes and labels.

  Every column except the last is treated as an attribute; the last column is
  the class label. '?' entries in the attributes are treated as missing and
  filled with the column mean, the attributes are then rescaled with
  MinMaxScaler (default settings) and rounded to 4 decimals. The labels
  'class1' and 'class2' are mapped to 0 and 1.

  Args:
    path: location of the CSV file, passed straight to pandas.read_csv.

  Returns:
    A tuple (attributes, labels): a numpy array of processed attributes and a
    pandas Series of labels.
  """
  dataset = pd.read_csv(path)

  # Split attributes from the label and mark '?' as missing. replace() returns
  # new objects here, so no slice of `dataset` is mutated in place.
  # np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
  attributes = dataset.iloc[:, :-1].replace('?', np.nan)

  # Encode the class names as integers. infer_objects() keeps the Series
  # integer-typed on pandas versions where replace() leaves an object column.
  labels = dataset.iloc[:, -1].replace({'class1': 0, 'class2': 1}).infer_objects()

  # Mean-impute the missing attribute values. The former `verbose` argument is
  # not passed: recent scikit-learn releases no longer accept it.
  imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
  attributes = imputer.fit_transform(attributes)

  # Normalise the attributes.
  scaler = MinMaxScaler()
  attributes = scaler.fit_transform(attributes)

  # Round the values to 4 decimals.
  attributes = np.round(attributes, decimals=4)
  return attributes, labels

In [None]:
def kNNClassifier(X, y, K):
  """Cross-validate a K-nearest-neighbours classifier.

  Args:
    X: attribute matrix handed to cross_val_score.
    y: class labels.
    K: value used for n_neighbors.

  Returns:
    The mean score over 10 shuffled stratified folds (random_state=0),
    rounded to 4 decimals.
  """
  folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
  fold_scores = cross_val_score(KNeighborsClassifier(n_neighbors=K), X, y, cv=folds)
  return np.round(np.mean(fold_scores), decimals=4)

In [None]:
def logregClassifier(X, y):
  """Cross-validate a LogisticRegression model built with default settings.

  Args:
    X: attribute matrix handed to cross_val_score.
    y: class labels.

  Returns:
    The mean score over 10 shuffled stratified folds (random_state=0),
    rounded to 4 decimals.
  """
  folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
  fold_scores = cross_val_score(LogisticRegression(), X, y, cv=folds)
  return np.round(np.mean(fold_scores), decimals=4)

In [None]:
def nbClassifier(X, y):
  """Cross-validate a GaussianNB model built with default settings.

  Args:
    X: attribute matrix handed to cross_val_score.
    y: class labels.

  Returns:
    The mean score over 10 shuffled stratified folds (random_state=0),
    rounded to 4 decimals.
  """
  folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
  fold_scores = cross_val_score(GaussianNB(), X, y, cv=folds)
  return np.round(np.mean(fold_scores), decimals=4)

In [None]:
def dtClassifier(X, y):
  """Cross-validate an entropy-criterion decision tree (random_state=0).

  Args:
    X: attribute matrix handed to cross_val_score.
    y: class labels.

  Returns:
    The mean score over 10 shuffled stratified folds (random_state=0),
    rounded to 4 decimals.
  """
  folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
  tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
  fold_scores = cross_val_score(tree, X, y, cv=folds)
  return np.round(np.mean(fold_scores), decimals=4)

In [None]:
def bagDTClassifier(X, y, n_estimators=100, max_samples=100, max_depth=2):
  """Cross-validate a bagging ensemble of entropy-criterion decision trees.

  Args:
    X: attribute matrix handed to cross_val_score.
    y: class labels.
    n_estimators: forwarded to BaggingClassifier.
    max_samples: forwarded to BaggingClassifier.
    max_depth: depth limit of each base DecisionTreeClassifier.

  Returns:
    The mean score over 10 shuffled stratified folds (random_state=0),
    rounded to 4 decimals.
  """
  folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
  base_tree = DecisionTreeClassifier(random_state=0, max_depth=max_depth, criterion='entropy')
  ensemble = BaggingClassifier(
      base_tree,
      n_estimators=n_estimators,
      max_samples=max_samples,
      random_state=0,
  )
  fold_scores = cross_val_score(ensemble, X, y, cv=folds)
  return np.round(np.mean(fold_scores), decimals=4)
  

In [None]:
def adaDTClassifier(X, y, n_estimators=100, learning_rate=0.2, max_depth=3):
  """Cross-validate an AdaBoost ensemble of entropy-criterion decision trees.

  Args:
    X: attribute matrix handed to cross_val_score.
    y: class labels.
    n_estimators: forwarded to AdaBoostClassifier.
    learning_rate: forwarded to AdaBoostClassifier.
    max_depth: depth limit of each base DecisionTreeClassifier.

  Returns:
    The mean score over 10 shuffled stratified folds (random_state=0),
    rounded to 4 decimals.
  """
  folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
  base_tree = DecisionTreeClassifier(random_state=0, max_depth=max_depth, criterion='entropy')
  booster = AdaBoostClassifier(
      base_tree,
      n_estimators=n_estimators,
      learning_rate=learning_rate,
      random_state=0,
  )
  fold_scores = cross_val_score(booster, X, y, cv=folds)
  return np.round(np.mean(fold_scores), decimals=4)

In [None]:
def gbClassifier(X, y, n_estimators=100, learning_rate=0.2):
  """Cross-validate a GradientBoostingClassifier (random_state=0).

  Args:
    X: attribute matrix handed to cross_val_score.
    y: class labels.
    n_estimators: forwarded to GradientBoostingClassifier.
    learning_rate: forwarded to GradientBoostingClassifier.

  Returns:
    The mean score over 10 shuffled stratified folds (random_state=0),
    rounded to 4 decimals.
  """
  folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
  booster = GradientBoostingClassifier(
      n_estimators=n_estimators,
      learning_rate=learning_rate,
      random_state=0,
  )
  fold_scores = cross_val_score(booster, X, y, cv=folds)
  return np.round(np.mean(fold_scores), decimals=4)

In [None]:
def bestLinClassifier(X,y):
  """Grid-search a linear-kernel SVC and report the best setting.

  The data is split with a stratified train_test_split (random_state=0); the
  grid search runs on the training part with 10 shuffled stratified folds.

  Args:
    X: attribute matrix.
    y: class labels.

  Returns:
    A 4-tuple: best C, best gamma, best cross-validation score and the score
    on the held-out test split (both scores rounded to 4 decimals).
  """
  X_train, X_test, Y_train, Y_test = train_test_split(X, y, stratify=y, random_state=0)

  candidate_values = [0.001, 0.01, 0.1, 1, 10, 100]
  # NOTE(review): gamma is searched together with kernel='linear' -- confirm
  # that tuning gamma is meaningful for this kernel.
  search_space = {
      'C': list(candidate_values),
      'gamma': list(candidate_values),
  }

  folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
  search = GridSearchCV(
      SVC(kernel='linear', random_state=0),
      search_space,
      cv=folds,
      return_train_score=True,
  )
  search.fit(X_train, Y_train)

  winner = search.best_params_
  cv_score = np.round(search.best_score_, decimals=4)
  test_score = np.round(search.score(X_test, Y_test), decimals=4)
  return winner.get('C'), winner.get('gamma'), cv_score, test_score
              

In [None]:
def bestRFClassifier(X,y):
  """Grid-search an entropy-criterion random forest and report the best setting.

  The data is split with a stratified train_test_split (random_state=0); the
  grid search runs on the training part with 10 shuffled stratified folds.

  Args:
    X: attribute matrix.
    y: class labels.

  Returns:
    A 5-tuple: best n_estimators, best max_features, best max_leaf_nodes, best
    cross-validation score and the score on the held-out test split (both
    scores rounded to 4 decimals).
  """
  X_train, X_test, Y_train, Y_test = train_test_split(X, y, stratify=y, random_state=0)

  # NOTE(review): the 'auto' option for max_features may not be accepted by
  # newer scikit-learn releases -- confirm against the installed version.
  search_space = {
      "n_estimators": [10, 20, 50, 100],
      "max_features": ['auto', 'sqrt', 'log2'],
      "max_leaf_nodes": [10, 20, 30],
  }

  folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
  forest = RandomForestClassifier(criterion='entropy', random_state=0)
  search = GridSearchCV(forest, search_space, cv=folds, return_train_score=True)
  search.fit(X_train, Y_train)

  winner = search.best_params_
  cv_score = np.round(search.best_score_, decimals=4)
  test_score = np.round(search.score(X_test, Y_test), decimals=4)
  return (
      winner.get('n_estimators'),
      winner.get('max_features'),
      winner.get('max_leaf_nodes'),
      cv_score,
      test_score,
  )

                

In [None]:
def _read_hyperparameters(csv_path, count):
  """Return the first `count` values from the first data row of a CSV file."""
  params = pd.read_csv(csv_path)
  return [params.iloc[0, column] for column in range(count)]


def main(argv):
  """Command-line driver: preprocess a data set and run one operation on it.

  Args:
    argv: argument vector. argv[1] is the path of the data CSV, argv[2] the
      operation code ('P', 'NN', 'LR', 'NB', 'DT', 'BAG', 'ADA', 'GB', 'SVM'
      or 'RF'). For 'NN', 'BAG', 'ADA' and 'GB', argv[3] is the path of a CSV
      whose first data row holds the hyperparameters.

  Returns:
    None. Results are printed to standard output. Any exception, including a
    missing argument, is caught and reported as
    "An exception occurred : <error>". An empty data path or an unrecognised
    operation code prints nothing.
  """
  try:
    dataPath, operation = argv[1], argv[2]

    # Compare by value: `is not ''` tests object identity, which is not a
    # reliable way to detect an empty string.
    if dataPath == '':
      return
    preDataX, preDataY = pre_processing_data(dataPath)

    if operation == 'NN':
      (K,) = _read_hyperparameters(argv[3], 1)
      print(kNNClassifier(preDataX, preDataY, K))
    elif operation == 'P':
      # Emit the processed attributes followed by the label as header-less CSV.
      processed = pd.concat([pd.DataFrame(preDataX), pd.DataFrame(preDataY)], axis=1, sort=False)
      print(processed.to_csv(header=False, index=False, float_format='%.4f'), end='')
    elif operation == 'LR':
      print(logregClassifier(preDataX, preDataY))
    elif operation == 'NB':
      print(nbClassifier(preDataX, preDataY))
    elif operation == 'DT':
      print(dtClassifier(preDataX, preDataY))
    elif operation == 'BAG':
      n_estimators, max_samples, max_depth = _read_hyperparameters(argv[3], 3)
      print(bagDTClassifier(preDataX, preDataY, n_estimators, max_samples, max_depth))
    elif operation == 'ADA':
      n_estimators, learning_rate, max_depth = _read_hyperparameters(argv[3], 3)
      print(adaDTClassifier(preDataX, preDataY, n_estimators, learning_rate, max_depth))
    elif operation == 'GB':
      n_estimators, learning_rate = _read_hyperparameters(argv[3], 2)
      print(gbClassifier(preDataX, preDataY, n_estimators, learning_rate))
    elif operation == 'SVM':
      # One value per line, no trailing newline.
      print(*bestLinClassifier(preDataX, preDataY), sep='\n', end='')
    elif operation == 'RF':
      print(*bestRFClassifier(preDataX, preDataY), sep='\n', end='')

  except Exception as e:
    # Best-effort CLI: report the problem instead of showing a traceback.
    print("An exception occurred :", e)



# Script entry point: hand the raw argument vector to main() when this code is
# executed as the top-level program.
if __name__ == "__main__":
    main(sys.argv)

FileNotFoundError: ignored