# Dataset Preparation


## Data Collection


In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
import warnings

# Suppress every warning for the rest of the session so cell outputs stay
# uncluttered.
warnings.filterwarnings('ignore')

# Relative location of the ARFF dataset file.
path = './exercises/sklearn/rice-classification/dataset/Rice_Cammeo_Osmancik.arff'

# loadarff returns a (records, metadata) pair; only the records are turned
# into a DataFrame.
raw_records, _arff_meta = arff.loadarff(path)
data = pd.DataFrame(raw_records)

data.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,15231.0,525.578979,229.749878,85.093788,0.928882,15617.0,0.572896,b'Cammeo'
1,14656.0,494.311005,206.020065,91.730972,0.895405,15072.0,0.615436,b'Cammeo'
2,14634.0,501.122009,214.106781,87.768288,0.912118,14954.0,0.693259,b'Cammeo'
3,13176.0,458.342987,193.337387,87.448395,0.891861,13368.0,0.640669,b'Cammeo'
4,14688.0,507.166992,211.743378,89.312454,0.906691,15262.0,0.646024,b'Cammeo'


## Handling missing values


In [27]:
# Decode the 'Class' column from bytes (e.g. b'Cammeo') to UTF-8 strings.
# Only bytes values are decoded, so the cell is safe to re-run: applying
# `.str.decode` to already-decoded strings would destroy the column (NaN or
# an error, depending on the pandas version).
data['Class'] = data['Class'].map(
  lambda label: label.decode('utf-8') if isinstance(label, bytes) else label
)
data.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,15231.0,525.578979,229.749878,85.093788,0.928882,15617.0,0.572896,Cammeo
1,14656.0,494.311005,206.020065,91.730972,0.895405,15072.0,0.615436,Cammeo
2,14634.0,501.122009,214.106781,87.768288,0.912118,14954.0,0.693259,Cammeo
3,13176.0,458.342987,193.337387,87.448395,0.891861,13368.0,0.640669,Cammeo
4,14688.0,507.166992,211.743378,89.312454,0.906691,15262.0,0.646024,Cammeo


## Encoding categorical features with One-Hot Encoding


## Encoding binary class label


In [28]:
# Encode the binary target: 'Cammeo' -> 1, anything else -> -1.
# An already-encoded 1 is also accepted as positive so that re-running this
# cell keeps the labels intact; comparing only against 'Cammeo' would turn
# every row into -1 on a second run.
data['Class'] = data['Class'].map(lambda label: 1 if label in ('Cammeo', 1) else -1)

# Model Building


In [29]:
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.tree import *

## Splitting the dataset into training and testing sets


In [30]:
# Target vector and feature matrix (every column except 'Class').
y = data['Class']
x = data.drop(columns='Class')

# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.3,
  random_state=42,
)

## Evaluation function


## Model Building


In [33]:
# Candidate classifiers, all with default hyperparameters. The estimators
# whose fitting is randomized by default get a fixed seed (the same value
# used for the train/test split) so the comparison table is identical on
# every run.
models = {
  'LogisticRegression': LogisticRegression(),
  'RandomForest': RandomForestClassifier(random_state=42),
  'KNeighbors': KNeighborsClassifier(),
  'Perceptron': Perceptron(),
  'SVC': SVC(),
  'DecisionTree': DecisionTreeClassifier(random_state=42)
}

# Metric name -> scoring function called as metric(y_true, y_pred).
evaluators = {
  'accuracy': accuracy_score,
  'precision': precision_score,
  'recall': recall_score,
  'f1': f1_score
}

# Fit each model on the training split and score its predictions on the
# test split with every metric. Scores are gathered in a plain dict first so
# that `evaluations` only ever refers to the final DataFrame.
scores = {}

for model_name, model in models.items():
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)
  scores[model_name] = {
    evaluator_name: evaluator(y_test, y_pred)
    for evaluator_name, evaluator in evaluators.items()
  }

# One row per model, one column per metric, best accuracy first.
evaluations = pd.DataFrame(scores).transpose().sort_values('accuracy', ascending=False)

# A bare expression as the last statement renders the table as the cell output.
evaluations

Unnamed: 0,accuracy,precision,recall,f1
LogisticRegression,0.931759,0.941767,0.905405,0.923228
RandomForest,0.924759,0.939024,0.891892,0.914851
DecisionTree,0.88364,0.896907,0.839768,0.867398
SVC,0.88189,0.917211,0.812741,0.861822
KNeighbors,0.877515,0.903846,0.816602,0.858012
Perceptron,0.76203,0.855491,0.571429,0.685185


## Model Evaluation


## Model Building with Hyperparameter Tuning



## Model Evaluation


# Comparing the Performance of Different Models