In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [2]:
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from xgboost import XGBClassifier
import pickle

# Read Data

In [3]:
def read_data(path):

  df =pd.read_csv(path,index_col='ID')
  print(df.shape) 
  # Target Manual Encoding case train file
  df.replace({'y':{'BARBUNYA':1,'BOMBAY':2,'CALI':3,'DERMASON':4,'HOROZ':5,'SEKER':6,'SIRA':7}},inplace=True)
  return df 

# Preprocessing

>## Upsampling

In [4]:
def apply_Upsmote(df):
  # Upsample each class to 3000 sample
  data = df.values
  x, y = data[:, :-1], data[:, -1]

  # set each label to 3000
  strategy = {1:3000, 2:3000, 3:3000, 4:3000, 5:3000, 6:3000,7:3000}
  oversample = SMOTE(sampling_strategy=strategy)
  features, target = oversample.fit_resample(x, y)

  # convert to Dataframe
  oversampled_data =pd.concat([ pd.DataFrame(features),pd.DataFrame(target)], axis=1)
  oversampled_data[0]=oversampled_data[0].astype(int) # set ID col as int
  oversampled_data.columns= list(df.columns)
 
  return oversampled_data

In [5]:
def get_numofsamples(df):
  labels=df.y.unique()
  for label in labels:  
    print('Number of samples in class {} = {}'.format(label,len(df[df.y == label])))


>## Cross Validation

In [6]:
def apply_stratifiedKFold(oversampled_data):
  data = oversampled_data.values
  x, y = data[:, :-1], data[:, -1]
  skf = StratifiedKFold(n_splits=5)
  skf.get_n_splits(x, y)
  col_names = oversampled_data.columns.to_list()
  StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
  train_dfs,valid_dfs =list(),list()
 
  for train_index, test_index in skf.split(x, y):
      #print("TRAIN:", train_index, "TEST:", test_index)
      X_train, X_test = x[train_index], x[test_index]
      y_train, y_test = y[train_index], y[test_index]
      trainX = pd.DataFrame(X_train)
      trainy= pd.DataFrame(y_train)

      testX= pd.DataFrame(X_test)
      testy=pd.DataFrame(y_test)

      train=pd.concat([trainX,trainy],axis=1)
      test=pd.concat([testX,testy],axis=1)
      
      train.columns=col_names

      test.columns=col_names
    
      train_dfs.append(train)
      valid_dfs.append(test)

  return train_dfs ,valid_dfs



>## Scaling

In [7]:
def apply_MinMaxScaler(X):
  scaler = MinMaxScaler()
  scaler.fit(X)
  return scaler


>## Features Extraction

In [8]:
def apply_pca(scaled_X,n_components):
  pca = PCA(n_components)
  pca.fit(scaled_X)
  return pca


# Model

In [9]:
def apply_model(modelname,xtrain,ytrain,xvalid,yvalid):
  if modelname=='XGBoost':
    score_train,f1score_val,model=apply_XGBoost(xtrain,ytrain,xvalid,yvalid)

  if modelname=='SVM':
    score_train,f1score_val,model=apply_svm(xtrain,ytrain,xvalid,yvalid)

  return score_train,f1score_val,model

In [10]:
def apply_XGBoost(xtrain,ytrain,xvalid,yvalid):
  xgb_model = XGBClassifier(learning_rate=0.3 , random_state =42, objective='multi:softmax', max_depth=6, reg_alpha = 0.08, gamma=0.1, verbosity=0) 
  xgb_model.fit(xtrain,ytrain)
  predictions = xgb_model.predict(xvalid)

  score_train = xgb_model.score(xtrain, ytrain)
  f1score_val = f1_score(yvalid, predictions, average="micro")
  print(f'Train score: {score_train} \t  valiation F1 score : {f1score_val}')
  return score_train,f1score_val,xgb_model



In [None]:
def apply_svm(xtrain,ytrain,xvalid,yvalid):
  pass

# Run

In [11]:
def run_experiment(path,modelname):
  # 1. raed data
  df= read_data(path=path)
  # 2. Upsampling
  oversampled_data=apply_Upsmote(df)
  # 3. Apply Kfold
  train_dfs ,valid_dfs= apply_stratifiedKFold(oversampled_data)

  # 4. do for each fold
  Accuracies_train,Accuracies_valid=list(),list()
  for train_df , valid_df in zip(train_dfs,valid_dfs):
    train_df= shuffle(train_df)  #1. shuffle
    scaler= apply_MinMaxScaler(train_df.iloc[:,:-1])  #2. scaling
    scaled_train= scaler.transform(train_df.iloc[:,:-1])
    pca= apply_pca(scaled_train ,n_components=7) #3. extract features
    pca_train = pca.transform(scaled_train)

    # Apply on validation
    scaled_valid=scaler.transform(valid_df.iloc[:,:-1])
    pca_valid=pca.transform(scaled_valid)

    xtrain,ytrain =pca_train,train_df.iloc[:,-1]
    xvalid,yvalid =pca_valid,valid_df.iloc[:,-1]
    
    score_train,f1score_val,model=apply_model(modelname, xtrain,ytrain,xvalid,yvalid) #4. Apply model
    Accuracies_train.append(score_train)
    Accuracies_valid.append(f1score_val)

  return np.mean(Accuracies_train), np.mean(Accuracies_valid),scaler,pca,model

In [None]:
path ='/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Original/train.csv'
avg_acc_train,avg_acc_valid,scaler,pca,model= run_experiment(path,modelname='XGBoost')
print(f'=========\nTotal Avg Acc Of Train = {avg_acc_train}  Total Avg Acc Of Valid = {avg_acc_valid}')

(10834, 17)
Train score: 1.0 	  valiation F1 score : 0.9395238095238095
Train score: 0.9998809523809524 	  valiation F1 score : 0.9464285714285714
Train score: 1.0 	  valiation F1 score : 0.9523809523809523
Train score: 0.9998809523809524 	  valiation F1 score : 0.96
Train score: 0.9998809523809524 	  valiation F1 score : 0.964047619047619
Total Avg Acc Of Train = 0.9999285714285715  Total Avg Acc Of Valid = 0.9524761904761905


In [13]:
def apply_XGBoost(xtrain,ytrain,xvalid,yvalid):
  xgb_model = XGBClassifier(learning_rate=0.2 , random_state =42, objective='multi:softmax', max_depth=4, reg_alpha = 0.09, gamma=0.2, verbosity=0) 
  xgb_model.fit(xtrain,ytrain)
  predictions = xgb_model.predict(xvalid)

  score_train = xgb_model.score(xtrain, ytrain)
  f1score_val = f1_score(yvalid, predictions, average="micro")
  print(f'Train score: {score_train} \t  valiation F1 score : {f1score_val}')
  return score_train,f1score_val,xgb_model


In [14]:
path ='/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Original/train.csv'
avg_acc_train,avg_acc_valid,scaler,pca,model= run_experiment(path,modelname='XGBoost')
print(f'=========\nTotal Avg Acc Of Train = {avg_acc_train}  Total Avg Acc Of Valid = {avg_acc_valid}')

(10834, 17)
Train score: 0.9809523809523809 	  valiation F1 score : 0.9395238095238095
Train score: 0.9808333333333333 	  valiation F1 score : 0.9435714285714286
Train score: 0.9807738095238095 	  valiation F1 score : 0.9483333333333334
Train score: 0.9776190476190476 	  valiation F1 score : 0.9595238095238096
Train score: 0.9785714285714285 	  valiation F1 score : 0.9592857142857143
Total Avg Acc Of Train = 0.9797499999999999  Total Avg Acc Of Valid = 0.950047619047619


In [19]:
def apply_XGBoost(xtrain,ytrain,xvalid,yvalid):
  xgb_model = XGBClassifier(learning_rate=0.05 , random_state =42, objective='multi:softmax', max_depth=5, reg_alpha = 0.09, gamma=0.2, verbosity=0) 
  xgb_model.fit(xtrain,ytrain)
  predictions = xgb_model.predict(xvalid)

  score_train = xgb_model.score(xtrain, ytrain)
  f1score_val = f1_score(yvalid, predictions, average="micro")
  print(f'Train score: {score_train} \t  valiation F1 score : {f1score_val}')
  return score_train,f1score_val,xgb_model


In [20]:
path ='/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Original/train.csv'
avg_acc_train,avg_acc_valid,scaler,pca,model= run_experiment(path,modelname='XGBoost')
print(f'=========\nTotal Avg Acc Of Train = {avg_acc_train}  Total Avg Acc Of Valid = {avg_acc_valid}')

(10834, 17)
Train score: 0.9664880952380952 	  valiation F1 score : 0.9342857142857143
Train score: 0.9643452380952381 	  valiation F1 score : 0.9419047619047618
Train score: 0.9636309523809524 	  valiation F1 score : 0.9507142857142857
Train score: 0.9620833333333333 	  valiation F1 score : 0.9580952380952381
Train score: 0.9619642857142857 	  valiation F1 score : 0.9554761904761905
Total Avg Acc Of Train = 0.9637023809523809  Total Avg Acc Of Valid = 0.9480952380952381


In [132]:
def apply_XGBoost(xtrain,ytrain,xvalid,yvalid):
  xgb_model = XGBClassifier(learning_rate=0.09 , random_state =42, objective='multi:softmax', max_depth=4, reg_alpha = 3, gamma=0, verbosity=0) 
  xgb_model.fit(xtrain,ytrain)
  predictions = xgb_model.predict(xvalid)

  score_train = xgb_model.score(xtrain, ytrain)
  f1score_val = f1_score(yvalid, predictions, average="micro")
  print(f'Train score: {score_train} \t  valiation F1 score : {f1score_val}')
  return score_train,f1score_val,xgb_model


In [133]:
path ='/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Original/train.csv'
avg_acc_train,avg_acc_valid,scaler,pca,model= run_experiment(path,modelname='XGBoost')
print(f'=========\nTotal Avg Acc Of Train = {avg_acc_train}  Total Avg Acc Of Valid = {avg_acc_valid}')

(10834, 17)
Train score: 0.9631547619047619 	  valiation F1 score : 0.934047619047619
Train score: 0.9627380952380953 	  valiation F1 score : 0.944047619047619
Train score: 0.9601785714285714 	  valiation F1 score : 0.950952380952381
Train score: 0.9595238095238096 	  valiation F1 score : 0.9564285714285714
Train score: 0.9588095238095238 	  valiation F1 score : 0.9576190476190476
Total Avg Acc Of Train = 0.9608809523809525  Total Avg Acc Of Valid = 0.9486190476190476


In [None]:
# save model 
with open('model.pkl', 'wb') as file:
        pickle.dump(model, file)

# Test Kaggle

In [None]:
path ='/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Original/test.csv'
df= pd.read_csv(path)
scaled_df= scaler.transform(df.iloc[:,1:])
ft_selected=pca.transform(scaled_df)

# Load pretrained model
pkl_filename='model.pkl'
with open(pkl_filename, 'rb') as file:
    pretrained_model = pickle.load(file)

prediction= pretrained_model.predict(ft_selected)
pred_df= pd.DataFrame(prediction)
submit_df =pd.concat([df.iloc[:,0],pred_df],axis=1)
submit_df.columns=['ID','y']
submit_df.replace({'y':{1:'BARBUNYA',2:'BOMBAY',3:'CALI',4:'DERMASON',5:'HOROZ',6:'SEKER',7:'SIRA'}},inplace=True)
submit_df.to_csv('submission.csv')
submit_df

Unnamed: 0,ID,y
0,10834,HOROZ
1,10835,DERMASON
2,10836,BARBUNYA
3,10837,DERMASON
4,10838,BOMBAY
...,...,...
2704,13538,CALI
2705,13539,SEKER
2706,13540,HOROZ
2707,13541,DERMASON
