In [None]:
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import itertools
%matplotlib inline
import plotly
import plotly.express as px
import plotly.graph_objs as go
import math
# "seaborn-whitegrid" was renamed to "seaborn-v0_8-whitegrid" in Matplotlib 3.6 and the old
# alias was removed in 3.8; use whichever name this Matplotlib installation provides.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from mlxtend.classifier import StackingCVClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import r2_score, classification_report, confusion_matrix, roc_curve, auc, plot_confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_selection import mutual_info_classif
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV
from scipy import stats
from scipy.stats import skew
import warnings
warnings.filterwarnings("ignore")

### Data Fields:
`ID` - an ID for this instance.\
Area - `(A)`, The area of a bean zone and the number of pixels within its boundaries.\
Perimeter - `(P)`, Bean circumference is defined as the length of its border.\
MajorAxisLength - `(L)`, The distance between the ends of the longest line that can be drawn from a bean.\
MinorAxisLength - `(I)`, The longest line that can be drawn from the bean while standing perpendicular to the main axis.\
AspectRatio - `(K)`, Defines the relationship between L and I: `(L/I)`.\
Eccentricity - `(Ec)`, Eccentricity of the ellipse having the same moments as the region.\
ConvexArea - `(C)`, Number of pixels in the smallest convex polygon that can contain the area of a bean seed.\
EquivDiameter - `(Ed)`, The diameter of a circle having the same area as a bean seed area `sqrt(4*A/pi)`.\
Extent - `(Ex)`, The ratio of the pixels in the bounding box to the bean area.\
Solidity - `(S)`, Also known as convexity. The ratio of the pixels in the convex shell to those found in beans `(A/C)`.\
Roundness - `(R)`, Calculated with the following formula: `(4*pi*A)/(P^2)`.\
Compactness - `(CO)`, Measures the roundness of an object: `(Ed/L)`.\
ShapeFactor1 - `(SF1=L/A)`.\
ShapeFactor2 - `(SF2=I/A)`.\
ShapeFactor3 - `(SF3=A/(pi*L/2*L/2))`.\
ShapeFactor4 - `(SF4=A/(pi*L/2*I/2))`.\
`y` - the class of the bean. It can be any of BARBUNYA, SIRA, HOROZ, DERMASON, CALI, BOMBAY, and SEKER.

# Loading and exploring the dataset

In [None]:
# Load the labelled training set; 'train.csv' is resolved relative to the working directory.
raw_data = pd.read_csv('train.csv')
raw_data

In [None]:
# Column dtypes and non-null counts of the raw training frame.
raw_data.info()

In [None]:
# Cast the target column `y` to pandas' 'category' dtype (in place on raw_data), then
# re-print the frame summary to confirm the new dtype.
# Note: from here on `y` is no longer matched by select_dtypes("object").
raw_data.y = raw_data.y.astype('category')
raw_data.info()

In [None]:
# Column labels of the training frame.
raw_data.columns

In [None]:
# Number of missing values per column.
raw_data.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
raw_data.duplicated().sum()

In [None]:
# Summary statistics, transposed so that each feature is one row.
raw_data.describe().T

In [None]:
# Number of samples per bean class (class balance check).
raw_data['y'].value_counts()

## Data visualization

In [None]:
# Histogram (10 bins) with a KDE overlay for every column of raw_data, drawn on a 7x4 grid.
# The loop covers all columns, so `ID` and the target `y` get a panel as well.
plt.figure(figsize=(25, 25))
for i, col in enumerate(list(raw_data.columns)):
    plt.subplot(7, 4, i+1)
    sns.histplot(raw_data[col], kde=True, bins=10)

In [None]:
def plot_subplots(subplots, plot, n=3):
    """Draw one subplot per item of `subplots` on a grid with `n` columns.

    Parameters
    ----------
    subplots : sequence
        Items to plot (column names in this notebook); must support len() and iteration.
    plot : callable
        Called as ``plot(item)`` while that item's subplot is the current axes.
    n : int, default 3
        Number of grid columns.

    Returns
    -------
    None
        The figure is created through pyplot's current-figure state.
    """
    m = len(subplots)
    if m == 0:
        # Nothing to draw; avoids creating a zero-height figure.
        return
    rows = -(-m // n)  # ceil(m / n), computed once for both the figure height and the grid
    plt.figure(figsize=(14, rows * 4))
    for i, c in enumerate(subplots):
        plt.subplot(rows, n, i + 1)
        plot(c)
        plt.xticks(rotation=45)
    # Lay the figure out once, after every panel exists and its tick labels are rotated.
    plt.tight_layout(pad=2.0)
        
# The 16 measured feature columns, spelled as they are used to index raw_data
# (note 'AspectRation' and lower-case 'roundness').
num_cols = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
           'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent',
           'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2',
           'ShapeFactor3', 'ShapeFactor4']

# One bar plot per feature: bean class on the x axis, the feature on the y axis.
plot_subplots(num_cols, lambda c: sns.barplot(data=raw_data, x='y', alpha=0.1, y=c))

In [None]:
# Pair plot of a hand-picked feature subset, coloured by bean class.
# (The former mid-cell `.head()` call was removed: only the last expression of a cell is
# displayed, so its result was silently discarded.)
Strongly_corr_features = raw_data[["Area","Perimeter","AspectRation","Eccentricity","roundness","Compactness","y"]]
sns.set_theme(style="whitegrid")
sns.pairplot(Strongly_corr_features, hue="y")

In [None]:
# One scatter panel per feature: feature value on x, bean class on y (6x3 grid).
visualization_df = raw_data.drop(['ID'], axis=1)
# `tight_layout` accepts a bool or a dict of padding options; the former value `5` was only
# interpreted as "truthy", so True states what actually happens.
plt.figure(figsize=[15, 15], tight_layout=True)
for i, column in enumerate(visualization_df.drop(['y'], axis=1).columns, start=1):
    plt.subplot(6, 3, i)
    plt.scatter(data=visualization_df, x=column, y='y', c='c', edgecolors='black')
    plt.xlabel(column)
    plt.ylabel('Beans Classes')
    plt.title(column + ' VS ' + 'Beans Classes')
plt.show()

In [None]:
# Most frequent bean class in the training data.
raw_data.y.mode()

In [None]:
# from pandas.plotting import scatter_matrix

# scatter_matrix(raw_data.drop(['ID'], axis=1), figsize=(25, 25), grid=True)
# plt.show()

In [None]:
# Pairwise correlation of the numeric columns of the raw data.
# The categorical target `y` is excluded explicitly: since pandas 2.0 DataFrame.corr() no
# longer drops non-numeric columns silently and raises on them instead.
corr = raw_data.select_dtypes(include='number').corr()
f, axes = plt.subplots(1, 1, figsize=(20, 15))
sns.heatmap(corr, square=True, annot=True, linewidth=.5, center=2, ax=axes, cmap='Blues')

## Feature Engineering:

In [None]:
# Work on a copy so the feature-engineering steps below leave raw_data untouched.
data = raw_data.copy()
data.head(10)

In [None]:
def make_mi_scores(X, y, discrete_features):
    """Return the mutual information of every column of X with the target y.

    The scores come from `mutual_info_classif` (forwarding `discrete_features` unchanged)
    and are returned as a Series named "MI Scores", indexed by X's column names and
    sorted from the most to the least informative feature.
    """
    raw_scores = mutual_info_classif(X, y, discrete_features=discrete_features)
    return pd.Series(raw_scores, name="MI Scores", index=X.columns).sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of MI scores on the current axes.

    Scores are sorted ascending first, so the bar with the largest score ends up at the
    highest y position; y ticks are labelled with the Series index (feature names).
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

# Label encoding for categoricals
# NOTE(review): `y` was cast to 'category' on raw_data before `data` was copied from it, so
# it is not matched by select_dtypes("object"); this loop only touches columns that are
# still object-typed.
for colname in data.select_dtypes("object"):
    data[colname], _ = data[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
# NOTE(review): this mask is computed over every column of `data` and is never used below —
# the call passes discrete_features=False, i.e. all features are treated as continuous.
discrete_features = data.dtypes == int

# Mutual information of each feature (ID and target dropped) with the bean class.
mi_scores = make_mi_scores(data.drop(columns=['ID', 'y']), data.y, discrete_features=False)
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

In [None]:
# Compute the skewness of each continuous feature

def calc_skew(df):
    """Print the sample skewness of every column of `df` whose dtype is not `object`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; object-typed columns are skipped.

    Returns
    -------
    None
        Results are written to stdout, one line per column.
    """
    print("\nIF THE DATA IS HIGHLY SKEWED IF SKWENESS  > 1 OR < -1 \n")
    # `np.object` was only an alias of the builtin `object` and was removed in NumPy 1.24,
    # so the builtin is used directly (identical dtype comparison).
    for col in df.loc[:, df.dtypes != object]:
        print("the skewness of ",col,"is :",df[col].skew())

# Skewness report for the features only (ID and the target are dropped first).
calc_skew(data.drop(['ID','y'], axis=1))

## Data Preparation

In [None]:
def normalize_data(df,column):
    """Standardise a single column with a freshly fitted StandardScaler.

    The column is reshaped to (n_rows, 1) before fitting; the transformed array is
    returned with that same 2-D shape. `df` itself is not modified.
    """
    column_values = np.array(df[column])
    as_single_feature = column_values.reshape(-1, 1)
    scaler = StandardScaler()
    return scaler.fit_transform(as_single_feature)

def removeOutliers(df,outliersColomns):
    """Return the rows of `df` whose |z-score| is below 3 in every listed column.

    Z-scores are computed column-wise over the whole frame with scipy.stats.zscore;
    a row is dropped as soon as one of `outliersColomns` is 3 or more standard
    deviations away from that column's mean.
    """
    within_limit = np.abs(stats.zscore(df[outliersColomns])) < 3
    keep_row = within_limit.all(axis=1)
    return df[keep_row]

y_labels = ['DERMASON', 'SIRA', 'SEKER', 'HOROZ', 'CALI', 'BARBUNYA', 'BOMBAY']
def removeOutliers_cat(df_raw,outliersColomns, z_threshold=4):
    """Remove outliers separately within each bean class.

    For every class in `y_labels`, z-scores of `outliersColomns` are computed using only
    that class's rows, and a row is kept when all of its |z-scores| are below
    `z_threshold`.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Input frame with a class column named 'y'. It is not modified.
    outliersColomns : list of str
        Columns on which the z-scores are evaluated.
    z_threshold : float, default 4
        Exclusive upper bound on |z-score| for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        Rows whose class is not in `y_labels` first (unfiltered, original order), followed
        by the surviving rows of each class in `y_labels` order. Original index labels
        are preserved.
    """
    df = df_raw.copy()
    # Rows of unlisted classes are never filtered and keep their position at the front.
    remainder = df[~df['y'].isin(y_labels)]
    kept_parts = [remainder] if not remainder.empty else []
    for cat_ in y_labels:
        df_ = df[df['y'] == cat_]
        if df_.empty:
            # Class absent from this frame: nothing to score.
            continue
        abs_z_scores = np.abs(np.asarray(stats.zscore(df_[outliersColomns])))
        filtered_entries = (abs_z_scores < z_threshold).all(axis=1)
        kept_parts.append(df_[filtered_entries])
    if not kept_parts:
        return df.iloc[0:0]
    # DataFrame.append was removed in pandas 2.0; a single concat replaces the repeated
    # append and also avoids re-copying the growing frame on every class.
    return pd.concat(kept_parts)

# Column-wise feature transformation
def transformation(df,columns,func):
    """Replace each of `columns` in `df` by `func` applied to that column.

    `df` is modified in place and the same object is returned.
    """
    for name in columns:
        df[name] = df[name].pipe(func)
    return df

def encodingTarget(dataset, cols):
    """Replace bean class names by their integer codes in the given columns.

    `dataset` is modified in place and returned. Values that are not one of the seven
    class names are left unchanged.
    """
    name_to_code = {'DERMASON' :0, 'SIRA':1, 'SEKER':2, 'HOROZ':3, 'CALI':4, 'BARBUNYA':5, 'BOMBAY':6}
    for target_col in cols:
        encoded = dataset[target_col].replace(name_to_code)
        dataset[target_col] = encoded
    return dataset

def decodingTarget(dataset, cols):
    """Replace integer class codes (0-6) by bean class names in the given columns.

    `dataset` is modified in place and returned. Values outside 0-6 are left unchanged.
    """
    code_to_name = {0:'DERMASON' ,1: 'SIRA', 2:'SEKER', 3:'HOROZ', 4:'CALI',5: 'BARBUNYA', 6:'BOMBAY'}
    for target_col in cols:
        decoded = dataset[target_col].replace(code_to_name)
        dataset[target_col] = decoded
    return dataset

In [None]:
data = raw_data.copy()

# Per-column preprocessing state (NaN fill value, fitted StandardScaler) learned by the most
# recent training-mode call of prepare_data and reused by isTest=True calls.
_PREPARE_DATA_STATE = {}

def prepare_data(df, isTest=False):
    """Add engineered features, impute NaNs and standardise every feature column.

    For each column other than 'ID', 'y' and 'Index':
      * NaNs are replaced by the column mean,
      * the unscaled values are kept in a new '<column>_r' column,
      * the column itself is replaced by its standardised values.

    The imputation mean and the scaler are fitted when ``isTest`` is False and remembered.
    With ``isTest=True`` the remembered training statistics are applied, so train and test
    features live on the same scale. (Previously the test set was standardised with its
    own mean and standard deviation.) If no training statistics exist for a column, the
    function falls back to fitting on the frame it was given.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the measured bean features. It is modified in place.
    isTest : bool, default False
        False for training data (fit and remember statistics), True for unlabeled data.

    Returns
    -------
    pandas.DataFrame
        The same object as `df`.
    """
    # Only referenced by the disabled outlier-removal call below.
    outliersColomns = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
                       'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent',
                       'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2',
                       'ShapeFactor3', 'ShapeFactor4']
    if isTest == False:
#         df=df[df['y']!='BOMBAY']
        before_ = len(df)
        # Outlier removal is currently disabled, so the count printed below is always 0.
        # df = removeOutliers_cat(df,outliersColomns)
        print('Removed ',before_ - len(df),' outliers')
#         df = encodingTarget(df,['y'])

    # Disabled experiment: log1p for right-skewed and square for left-skewed features.
#     right_skew_features = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'EquivDiameter', 'ConvexArea']
#     left_skew_features = ['Eccentricity', 'Solidity', 'roundness', 'ShapeFactor4', 'Extent']
#     transformation(data, right_skew_features, np.log1p)
#     transformation(data, left_skew_features, np.square)

    # Engineered ratio / product features.
    df['ShapeFactor5'] = df['MajorAxisLength'] / df['Perimeter']
    df['ShapeFactor6'] = df['MinorAxisLength'] / df['Perimeter']
    df['ShapeFactor7'] = df['Eccentricity'] * df['Area']
    df['ShapeFactor8'] = df['Eccentricity'] * df['Perimeter']
    df['ShapeFactor9'] = df['Extent'] * df['Area']
    df['ShapeFactor10'] = df['Extent'] * df['Perimeter']

    NonScaledFeatures=['ID', 'y','Index']
    if not isTest:
        # A new training run replaces whatever was learned before.
        _PREPARE_DATA_STATE.clear()
    # Snapshot the column list: the loop adds one '<name>_r' column per feature and those
    # copies must not be processed themselves.
    for col_name in list(df.columns):
        if col_name in NonScaledFeatures:
            continue
        fitted = _PREPARE_DATA_STATE.get(col_name) if isTest else None
        if fitted is None:
            fill_value = df[col_name].mean()
            scaler = StandardScaler().fit(
                np.array(df[col_name].fillna(fill_value)).reshape(-1, 1))
            if not isTest:
                _PREPARE_DATA_STATE[col_name] = (fill_value, scaler)
        else:
            fill_value, scaler = fitted
        df[col_name] = df[col_name].fillna(fill_value)
        df[col_name + '_r'] = df[col_name]  # raw (unscaled) copy used by the post-processing rules
        df[col_name] = scaler.transform(np.array(df[col_name]).reshape(-1, 1)).ravel()

    return df
  
# Preprocess the training copy; prepare_data modifies `data` in place and returns it.
data = prepare_data(data)
# Encode the class names as integers. The fitted encoder is kept so that predictions can be
# mapped back to class names with inverse_transform later on.
labelencoder = LabelEncoder()
data['y'] = labelencoder.fit_transform(data['y'])
        

In [None]:
# Summary statistics after preprocessing (scaled features and their raw '_r' copies).
data.describe()

In [None]:
# Display the preprocessed training frame.
data

In [None]:
# Column labels after feature engineering.
data.columns

In [None]:
# print("Data after preprocessing: ")
# plt.figure(figsize=(24, 25))
# for i, col in enumerate(list(data.columns)):
#     plt.subplot(10, 4, i+1)
#     sns.histplot(data[col], kde=True, bins=10, color='grey')

In [None]:
# Drop the ID and the unscaled '<feature>_r' copies so only the standardised features remain,
# then draw one scatter panel per feature against the (label-encoded) bean class.
visualization_df=data.drop(['ID','Area_r', 'Perimeter_r', 'MajorAxisLength_r', 'MinorAxisLength_r',
       'AspectRation_r', 'Eccentricity_r', 'ConvexArea_r', 'EquivDiameter_r',
       'Extent_r', 'Solidity_r', 'roundness_r', 'Compactness_r',
       'ShapeFactor1_r', 'ShapeFactor2_r', 'ShapeFactor3_r', 'ShapeFactor4_r',
       'ShapeFactor5_r', 'ShapeFactor6_r', 'ShapeFactor7_r', 'ShapeFactor8_r',
       'ShapeFactor9_r', 'ShapeFactor10_r'], axis=1)
# `tight_layout` accepts a bool or a dict of padding options; the former value `5` was only
# interpreted as "truthy", so True states what actually happens.
plt.figure(figsize=[20, 20], tight_layout=True)
for i, column in enumerate(visualization_df.drop(['y'], axis=1).columns, start=1):
    plt.subplot(5, 5, i)
    plt.scatter(data=visualization_df, x=column, y='y', c='c', edgecolors='red')
    plt.xlabel(column)
    plt.ylabel('Beans Classes')
    plt.title(column + ' VS ' + 'Beans Classes')
plt.show()

In [None]:
# # Define the lambda function: categorize_label
# categorize_label = lambda x: x.astype('category')

# # Convert df[LABELS] to a categorical type
# df[LABELS] = df[LABELS].apply(categorize_label, axis=0)

# # Print the converted dtypes
# print(df[LABELS].dtypes)

# # Calculate number of unique values for each label: num_unique_labels
# num_unique_labels = data['y'].apply(pd.Series.nunique)

# # Plot number of unique values for each label
# num_unique_labels.plot(kind='bar')

# # Label the axes
# plt.xlabel('Labels')
# plt.ylabel('Number of unique values')

# # Display the plot
# plt.show()

## Features-labels split and train-validation split

In [None]:
# 80/20 split, shuffled with a fixed seed and stratified on the encoded class so both parts
# keep the class proportions of `data`.
train_df, val_df = train_test_split(data, test_size=0.20, random_state=42, shuffle=True, stratify=data.y) 

# Features = every column except the row ID and the target.
x_train = train_df.drop(columns=['ID','y'])
y_train = train_df['y']

x_val = val_df.drop(columns=['ID','y'])
y_val = val_df['y']

In [None]:
# Row counts of the training and validation parts.
x_train.shape[0], x_val.shape[0]

In [None]:
# Train with ALL DATA to increase accuracy after it has been validated.
# NOTE(review): this overwrites the 80% training split with the full dataset, which contains
# the rows of x_val / y_val. Any score computed on x_val after this cell has run is measured
# on data the model was fitted on and is not a held-out estimate. Skip this cell when the
# validation numbers below are meant to be trusted.

x_train = data.drop(columns=['ID','y'])
y_train = data['y']

In [None]:
# Correlation matrix of all training features (scaled columns and their raw '_r' copies),
# drawn as an annotated heatmap.
corr1= x_train.corr()
f,axes = plt.subplots(1,1,figsize = (30,20))
sns.heatmap(corr1, square=True, annot = True, linewidth = .5, center = 2, ax = axes, cmap='Blues')

In [None]:
# First rows of the feature correlation matrix.
corr1.head()

In [None]:
# Feature subset used by the models below (they all index x_train / x_val with it).
# Besides the highly correlated 'ConvexArea' and 'Compactness', this list also leaves out
# Area, Perimeter, Eccentricity, EquivDiameter, ShapeFactor1-3, the engineered
# ShapeFactor5-10 and every raw '_r' copy.

model_columns = [
        'MajorAxisLength', 'MinorAxisLength',
       'AspectRation', 'Extent',
       'Solidity', 'roundness','ShapeFactor4']

In [None]:

y_labels_Post = ['DERMASON', 'SIRA', 'SEKER', 'HOROZ', 'CALI', 'BARBUNYA', 'BOMBAY']
# y_labels = labelencoder.transform(y_labels_Post)
print(y_labels_Post)

# Ordered relabelling rules, applied top to bottom to every row that is not caught by the two
# hard BOMBAY / DERMASON rules. Each entry is
#   (raw feature column, fire when value > threshold (True) or < threshold (False),
#    threshold, index in y_labels_Post the current label must have, index of the new label).
# Because a rule can change the label a later rule looks at, the order is significant.
# The original hand-written cascade repeated four rules further down; those repeats could
# never fire (no rule in between produces the label they test for) and are not listed.
_POST_PROCESSING_RULES = (
    ('Solidity_r',        True,  0.9928774928774928,  0, 2),
    ('Extent_r',          False, 0.7044686791405081,  2, 1),
    ('Extent_r',          False, 0.6469956019402373,  1, 3),
    ('roundness_r',       False, 0.7959889783638006,  1, 5),
    ('Area_r',            False, 55938,               4, 3),
    ('Extent_r',          False, 0.6475477522670268,  5, 3),
    ('AspectRation_r',    False, 1.4973590208757783,  4, 5),
    ('ShapeFactor4_r',    True,  0.9989583842744816,  4, 5),
    ('ShapeFactor5_r',    True,  0.387031233685577,   5, 3),
    ('ShapeFactor4_r',    False, 0.9840169199256958,  1, 3),
    ('ShapeFactor5_r',    False, 0.32719460392194527, 0, 2),
    ('Area_r',            True,  57790,               1, 5),
    ('ShapeFactor9_r',    True,  43651.16685252113,   1, 2),
    ('MinorAxisLength_r', True,  231.34609602337528,  3, 4),
    ('Perimeter_r',       True,  881.1379999999998,   2, 5),
)

def ModelPredictionsPostProcessing(predictions_t,train_x,param=(0.00397101, 0.008575, 635.011, 30917.0, 0.539964, 0.838197, 0.966603, 0.9637, 2.01, 126500.0, 0.49473, 0.70337, 66105.9, 1155.934, 237.238412, 454.231985, 305.811572, 75191.0, 0.005591039999999999, 0.815341, 0.515836, 0.817632, 0.387031)):
    """Correct model predictions with hand-tuned threshold rules on the raw features.

    Parameters
    ----------
    predictions_t : array-like of str
        Predicted class names, one per row of `train_x`. Not modified; a copy is returned.
    train_x : pandas.DataFrame
        Feature frame holding the unscaled '<feature>_r' columns the rules read. Rows are
        matched to predictions by position.
    param : sequence of float
        Thresholds of the two hard rules; only indices 0, 1, 2, 3 and 9 are read.
        (The default is a tuple instead of a list so it cannot be mutated between calls.)

    Returns
    -------
    Same type as `predictions_t`
        Copy of the predictions where, per row:
        * ShapeFactor1_r < param[0] and Area_r > param[9]  -> 'BOMBAY'
        * else ShapeFactor1_r > param[1] or Perimeter_r < param[2] or Area_r < param[3]
          -> 'DERMASON'
        * else the rules of `_POST_PROCESSING_RULES` are applied in order.
    """
    predictions_ = predictions_t.copy()
    n_rows = len(predictions_)
    if n_rows == 0:
        return predictions_

    # Pull each needed column out once instead of doing a Series lookup per rule per row.
    shape_factor1 = train_x['ShapeFactor1_r'].to_numpy()
    area = train_x['Area_r'].to_numpy()
    perimeter = train_x['Perimeter_r'].to_numpy()
    rule_values = {name: train_x[name].to_numpy()
                   for name in {rule[0] for rule in _POST_PROCESSING_RULES}}

    for i in range(n_rows):
        if shape_factor1[i] < param[0] and area[i] > param[9]:
            predictions_[i] = y_labels_Post[6]  # 'BOMBAY'
        elif shape_factor1[i] > param[1] or perimeter[i] < param[2] or area[i] < param[3]:
            predictions_[i] = y_labels_Post[0]  # 'DERMASON'
        else:
            label = predictions_[i]
            changed = False
            for column, fire_when_greater, threshold, required, assigned in _POST_PROCESSING_RULES:
                if label != y_labels_Post[required]:
                    continue
                value = rule_values[column][i]
                if (value > threshold) if fire_when_greater else (value < threshold):
                    label = y_labels_Post[assigned]
                    changed = True
            if changed:
                predictions_[i] = label

    return predictions_

## Training different classification models on the dataset


In [None]:
# pip install lazypredict

In [None]:
# #  Lazy Predicror for best model:

# from lazypredict.Supervised import LazyClassifier
# from sklearn.utils import shuffle
# # from sklearn import datasets

# features = data.drop(columns=['ID','y'])
# target = data['y']

# X, Y = shuffle(features, target, random_state=42)
# X = X.astype(np.float32)

# offset = int(X.shape[0] * 0.9)

# X_train, Y_train = X[:offset], Y[:offset]
# X_test, y_test = X[offset:], Y[offset:]

# cls_lazy = LazyClassifier(predictions=False)
# models, predictions = cls_lazy.fit(X_train, X_test, Y_train, y_test)

# print(models)

### 1- GradientBoostingClassifier:

In [None]:
# train with Gradient Boosting algorithm
# compute the accuracy scores on train and validation sets when training with different learning rates

# learning_rates = [0.01, 0.03, 0.05, 0.07, 0.1]
# for learning_rate in learning_rates:
#     gb = GradientBoostingClassifier(n_estimators=600, learning_rate = learning_rate,  max_depth = 5, random_state = 42)
#     gb.fit(x_train, y_train)
#     print("Learning rate: ", learning_rate)
#     print("Accuracy score (training): {0:.3f}".format(gb.score(x_train, y_train)))
#     print("Accuracy score (validation): {0:.3f}".format(gb.score(x_val, y_val)))
#     print()
# 0.05 is best , 1.0 on training and 0.938 on validation

In [None]:
# # Gradient Boosting algorithm 

# gb_model = GradientBoostingClassifier( n_estimators=600, learning_rate=0.07, max_depth=5, random_state=42)
# # param_grid = dict(n_estimators=[600], learning_rate=[0.05], max_depth=[5], random_state=[42])
# # kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# # grid_search = GridSearchCV(gb, param_grid, scoring="accuracy", cv=kfold, n_jobs=-1, verbose=False)#, return_train_score=True)

# gb_model = gb_model.fit(x_train[model_columns], y_train)
# predictions_GB = gb_model.predict(x_val[model_columns])

# print("Mean_F1_score: ", f1_score(y_val, predictions_GB, average='micro'))

# print("Classification Report: \n", classification_report(y_val, predictions_GB))

### 2- XGB Classifier:

In [None]:
# XGB Classifier: fit on the selected feature subset, cross-validate, then score on x_val.

xgb_model = XGBClassifier(learning_rate=0.07, random_state =42, objective='multi:softmax', max_depth=5, reg_alpha = 0.002, gamma=0.01, verbosity=0)

xgb_model.fit(x_train[model_columns], y_train)
# Cross-validate on the same feature subset the model is fitted on (it previously ran on
# every column of x_train, including the raw '_r' copies), with seeded folds so the result
# is reproducible, and actually report the scores.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
kf_cv_scores = cross_val_score(xgb_model, x_train[model_columns], y_train, cv=kfold)
print("10-fold CV score: %0.4f (+/- %0.4f)" % (kf_cv_scores.mean(), kf_cv_scores.std()))
predictions_XGB = xgb_model.predict(x_val[model_columns])
print('Mean_F1_score', f1_score(y_val, predictions_XGB, average='micro'))
print("Classification Report: \n", classification_report(y_val, predictions_XGB))

### 3- LightGB Classifier:

In [None]:
# Build the LightGBM model: fit on the selected feature subset, then report micro-averaged
# F1 and the per-class report on the validation split.
# NOTE(review): if x_train was replaced by the full dataset in the earlier
# "train with all data" cell, x_val is part of the training data and this score is optimistic.

lgb_model = LGBMClassifier(objective='multiclass', random_state=42, learning_rate=0.03, reg_alpha=0.0001)
lgb_model.fit(x_train[model_columns], y_train)
predictions_LGB = lgb_model.predict(x_val[model_columns])
print('Mean_F1_score', f1_score(y_val, predictions_LGB, average='micro'))
print("Classification Report: \n", classification_report(y_val, predictions_LGB))

### 4- BaggingClassifier:

In [None]:
# bag_model = BaggingClassifier(random_state=42, n_estimators=30, oob_score=True) 
# bag_model = bag_model.fit(x_train[model_columns],y_train)

# predictions_BAG = bag_model.predict(x_val[model_columns])
# print('Mean_F1_score', f1_score(y_val, predictions_BAG, average='micro'))
# print("Classification Report: \n", classification_report(y_val, predictions_BAG))

### 5- MultiLayerPerceptronClassifier:

In [None]:
# Create and fit the MLP classifier on the selected feature subset, then score it on x_val.
# early_stopping=True holds out validation_fraction=0.2 of the training rows internally.
# NOTE(review): solver is 'adam' here; confirm against the scikit-learn docs whether
# learning_rate='adaptive' has any effect outside solver='sgd'.

mlp_model = MLPClassifier(solver='adam', activation='logistic', alpha=1e-4, random_state=42, max_iter=1000, early_stopping=True, validation_fraction=0.2, warm_start=True, verbose=False, learning_rate ='adaptive', learning_rate_init=0.01)
mlp_model = mlp_model.fit(x_train[model_columns], y_train)

predictions_MLP = mlp_model.predict(x_val[model_columns])
print('Mean_F1_score', f1_score(y_val, predictions_MLP, average='micro'))
# print("The accuracy of the classifier on the validation set is ", (mlp_model.score(x_val[model_columns], y_val)))
print("Classification Report")
print(classification_report(y_val, predictions_MLP))

### 6- KNeighborsClassifier:

In [None]:
# k-nearest-neighbours with 15 distance-weighted neighbours, fitted on the selected feature
# subset and scored on the validation split.
kn_model = KNeighborsClassifier(algorithm='auto', weights ='distance', n_neighbors=15)#=10
kn_model.fit(x_train[model_columns], y_train)
          
predictions_KN = kn_model.predict(x_val[model_columns])
print('Mean_F1_score', f1_score(y_val, predictions_KN, average='micro'))
print("Classification Report: \n", classification_report(y_val, predictions_KN))

### 7- SupportVectorClassifier:

In [None]:
# RBF-kernel support vector classifier, fitted on the selected feature subset and scored on
# the validation split. The commented-out line is an alternative polynomial-kernel setup.
svc_model = SVC(C=1.0, kernel='rbf', max_iter=-1, random_state=42, decision_function_shape='ovo', gamma=0.20)
# svc_model = SVC(C=1.0, kernel='poly', degree=3, max_iter=-1, random_state=42, decision_function_shape='ovo', gamma=0.20)
svc_model.fit(x_train[model_columns], y_train)

predictions_SVC = svc_model.predict(x_val[model_columns])
print('Mean_F1_score', f1_score(y_val, predictions_SVC, average='micro'))
print("Classification Report: \n", classification_report(y_val, predictions_SVC))

### 8- DecisionTreeClassifier:

In [None]:
# dt_model = DecisionTreeClassifier(max_depth=5, min_samples_split=16, ccp_alpha=0.00001, random_state=42, criterion='gini')
# dt_model.fit(x_train[model_columns], y_train)

# predictions_DT = dt_model.predict(x_val[model_columns])
# print('Mean_F1_score', f1_score(y_val, predictions_DT, average='micro'))

# print("Classification Report: \n", classification_report(y_val, predictions_DT))

### 9- RandomForestClassifier:

In [None]:
# rf_model = RandomForestClassifier(n_estimators=1000, ccp_alpha=0.0001, criterion='entropy', max_depth=7, n_jobs=-1, random_state=42)
# rf_model.fit(x_train[model_columns], y_train)

# predictions_rf = rf_model.predict(x_val[model_columns])
# print('Mean_F1_score', f1_score(y_val, predictions_rf, average='micro'))
# print("Classification Report: \n", classification_report(y_val, predictions_rf))

### 10- StochasticGradientDescentClassifier:

In [None]:

# sgdc = SGDClassifier(random_state=42, learning_rate='adaptive', eta0=0.1, max_iter=1000, shuffle=True, verbose=0, loss='modified_huber')
# sgdc.fit(x_train[model_columns], y_train)

# predictions_sgdc = sgdc.predict(x_val[model_columns])
# print('Mean_F1_score', f1_score(y_val, predictions_sgdc, average='micro'))
# print("Classification Report: \n", classification_report(y_val, predictions_sgdc))

### 11- Naive Bayes Model:

In [None]:
# from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, CategoricalNB, ComplementNB

# nb_model = GaussianNB(priors=None, var_smoothing=1e-09)
# nb_model.partial_fit(x_train[model_columns], y_train, np.unique(y_train))

# predictions_NB = nb_model.predict(x_val[model_columns])
# print('Mean_F1_score', f1_score(y_val, predictions_NB, average='micro'))
# print("Classification Report: \n", classification_report(y_val, predictions_NB))

#### Ensemble method:

In [None]:
#voting

# (name, estimator) pairs that make up the ensemble. The commented-out entries refer to
# models whose training cells are currently disabled.
classifiers = [
                ('SVC:', svc_model),
#                ('MLP:', mlp_model),
                ('LGB:', lgb_model),
#                 ('GB:', gb_model),
                 ('XGB:', xgb_model),
#                 ('BAG:', bag_model),
                ('KN:', kn_model),
#                 ('DT:', dt_model),
#                 ('NB:', nb_model),
                ]
    
# Score each already-fitted member on the validation split for comparison.
for clf_name, clf in classifiers:
#     clf.fit(x_train[model_columns], y_train)
    y_pred = clf.predict(x_val[model_columns])
    print(clf_name, f1_score(y_val, y_pred, average='micro'))

# Combine the members with VotingClassifier's default settings and fit the ensemble on the
# same feature subset.
vc = VotingClassifier(estimators=classifiers)
# kfold = StratifiedKFold(n_splits=10, random_state=42)
# cv_results = cross_val_score(vc, x_train[model_columns], y_train, cv=kfold, scoring='f1_micro')
vc.fit(x_train[model_columns], y_train)
y_pred_voting = vc.predict(x_val[model_columns])
print()
print('Voting Classifier: ',f1_score(y_val, y_pred_voting, average='micro'))

In [None]:
# predictions_output = ModelPredictionsPostProcessing(y_pred_voting, x_val)
# print('Voting Classifier PostProcessing: ',accuracy_score(y_val, predictions_output))

In [None]:
#stacking

# sclf = StackingCVClassifier(classifiers=[gb_model, kn_model, svc_model, mlp_model, lgb_model, xgb_model, bag_model], meta_classifier=gb_model)

# print('10-fold cross validation Stacking Classification:\n')
# for clf, label in zip([gb_model, kn_model, svc_model, mlp_model, lgb_model, xgb_model, bag_model], 
#                       ['GB:','KN:', 'SVC:', 'MLP:', 'LGB:', 'XGB:', 'BAG:']):
#     sclf_scores = cross_val_score(clf, x_train[model_columns], y_train, cv=10, scoring='accuracy')
#     print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (sclf_scores.mean(), sclf_scores.std(), label))

# sclf.fit(x_train[model_columns], y_train)
# y_pred_stacking = sclf.predict(x_val[model_columns])
# # Evaluate the test-set accuracy of 'vc'
# print('Stacking Classifier: ',accuracy_score(y_val, y_pred_stacking))

In [None]:
# print("Confusion Matrix:")

# y_labels = ['DERMASON', 'SIRA', 'SEKER', 'HOROZ', 'CALI', 'BARBUNYA', 'BOMBAY']
# class_names= ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']
# lgb = LGBMClassifier()
# lgb.fit(x_train[model_columns], y_train)
# np.set_printoptions(precision=5)
# plt.rcParams.update({'font.size': 15})
# fig, ax = plt.subplots(figsize=(10, 10),)
# plot_confusion_matrix(lgb, x_val[model_columns], y_val, display_labels=class_names, xticks_rotation='vertical', cmap=plt.cm.Blues, ax=ax)
# plt.show()

## Reading the test file

In [None]:
# Load the unlabeled test set and show 10 random rows.
raw_test = pd.read_csv('test.csv')
raw_test.sample(10)

In [None]:
# Number of missing values per test column.
raw_test.isnull().sum()

In [None]:
# Preprocess a copy of the test set with the same pipeline as the training data
# (isTest=True skips the training-only branch of prepare_data).
X_test = raw_test.copy()
# X_test['Index']= 0
# for i_ in range(len(X_test)):
#     X_test['Index'].array[i_] =  i_
# X_test_BOMBAY = X_test[ (X_test['ShapeFactor1']< 0.00397101) & (X_test['Area']> 0.008575) ] 
# X_test = X_test[ ((X_test['ShapeFactor1']>= 0.00397101) | (X_test['Area']<= 0.008575)) ] 
X_test = prepare_data(X_test, isTest=True)
# X_test_BOMBAY = prepare_data(X_test_BOMBAY, isTest=True)
# X_test = X_test.append(X_test_BOMBAY)
# X_test = X_test.sort_values(by="Index",ascending=True)

X_test = X_test.drop(columns=['ID'])

# Predict with the voting ensemble, then map the integer codes back to class names.
y_test_predicted_vc = vc.predict(X_test[model_columns])
y_test_predicted_vc = labelencoder.inverse_transform(y_test_predicted_vc)

# Apply the threshold rules, which read the raw '_r' columns of X_test, and attach the
# final labels to the original test frame.
predictions_output = ModelPredictionsPostProcessing(y_test_predicted_vc, X_test)
# raw_test['y'] = y_test_predicted_vc
raw_test['y'] = predictions_output
# X_test['y'] = predictions_output
# raw_test = decodingTarget(raw_test,['y'])
raw_test

In [None]:
# Write the submission file (ID, predicted class).
# NOTE(review): the absolute '/kaggle/working' path only exists inside a Kaggle kernel.
raw_test[['ID', 'y']].to_csv('/kaggle/working/submission.csv', index=False)
