In [18]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import requests

# Haralick
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('abalone_dataset.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3132 entries, 0 to 3131
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             3132 non-null   object 
 1   length          3132 non-null   float64
 2   diameter        3132 non-null   float64
 3   height          3132 non-null   float64
 4   whole_weight    3132 non-null   float64
 5   shucked_weight  3132 non-null   float64
 6   viscera_weight  3132 non-null   float64
 7   shell_weight    3132 non-null   float64
 8   type            3132 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 220.3+ KB


In [5]:
df.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,type
count,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0
mean,0.521392,0.405865,0.138263,0.818738,0.355398,0.178349,0.235616,1.991379
std,0.120756,0.0996,0.039206,0.48956,0.221473,0.109554,0.139215,0.824561
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.345,0.11,0.436375,0.1815,0.090875,0.1275,1.0
50%,0.54,0.42,0.14,0.787,0.3305,0.168,0.225,2.0
75%,0.61,0.48,0.165,1.141625,0.4975,0.250125,0.323625,3.0
max,0.815,0.65,0.515,2.8255,1.488,0.76,1.005,3.0


In [16]:
feature_cols = ['length', 'diameter', 'height', 'whole_weight',
                'shucked_weight', 'viscera_weight', 'shell_weight']

X = df[feature_cols]
y = df['type']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## K-Nearest Neighbors

In [21]:
test_scores = []
train_scores = []

for i in range(1,15):

    knn = KNeighborsClassifier(i)
    knn.fit(X_train,y_train)

    train_scores.append(knn.score(X_train,y_train))
    test_scores.append(knn.score(X_test,y_test))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(12,5))
p = sns.lineplot(range(1,15),train_scores,marker='*',label='Train Score')
p = sns.lineplot(range(1,15),test_scores,marker='o',label='Test Score')


In [23]:
knn = KNeighborsClassifier(n_neighbors=12)

knn.fit(X_train,y_train)

preds = knn.predict(X_test)

## Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier

best_score = 0
for criterion in ['gini', 'entropy']:
    for max_depth in range(1, 16):
        for max_features in ['auto', 'sqrt', 'log2']:
            rf = RandomForestClassifier(criterion=criterion,
                                        max_depth=max_depth,
                                        max_features=max_features)
            rf.fit(X_train, y_train)
            score = rf.score(X_test, y_test)

            if score > best_score:
                best_score = score
                best_parameters = {
                    'criterion': criterion,
                    'max_depth': max_depth,
                    'max_features': max_features
                }

print(best_score)
print(best_parameters)

0.6832694763729247
{'criterion': 'entropy', 'max_depth': 7, 'max_features': 'auto'}


In [56]:
# rf = RandomForestClassifier()
rf = RandomForestClassifier(
    criterion=best_parameters['criterion'],
    max_depth=best_parameters['max_depth'],
    max_features=best_parameters['max_features']
)
rf.fit(X_train, y_train)

preds = rf.predict(X_test)

## SVM

In [61]:
from sklearn.svm import SVC

best_score = 0
for C in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    for gamma in ['auto', 'scale']:
        for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
            svc = SVC(kernel=kernel, C=C, gamma=gamma)
            svc.fit(X_train, y_train)
            score = svc.score(X_test, y_test)

            if score > best_score:
                    best_score = score
                    best_parameters = {
                        'C': C,
                        'gamma': gamma,
                        'kernel': kernel
                    }
print(best_score)
print(best_parameters)

0.6883780332056194
{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}


In [67]:
# svc = SVC()
svc = SVC(kernel=best_parameters['kernel'],
          C=best_parameters['C'],
          gamma=best_parameters['gamma'])
svc.fit(X_train, y_train)

preds = svc.predict(X_test)

## Accuracy Scores

In [68]:
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average="micro")
recall = recall_score(y_test, preds, average="micro")
f1 = f1_score(y_test, preds, average='micro')

print(f'Accuracy:  {accuracy}')
print(f'Precision: {precision}')
print(f'Recall:    {recall}')
print(f'F1:        {f1}')

Accuracy:  0.665389527458493
Precision: 0.665389527458493
Recall:    0.665389527458493
F1:        0.665389527458493


## Predict and Upload

In [64]:
data = pd.read_csv('abalone_app.csv')[feature_cols]
y_pred = rf.predict(data)

In [65]:
URL = "https://aydanomachado.com/mlclass/03_Validation.php"

DEV_KEY = "Computatchan"

data = {'dev_key': DEV_KEY,
        'predictions': pd.Series(y_pred).to_json(orient='values')}

In [66]:
# Enviando requisição e salvando o objeto resposta
r = requests.post(url=URL, data=data)

# Extraindo e imprimindo o texto da resposta
pastebin_url = r.text
print(" - Resposta do servidor:\n", r.text, "\n")


# Resultados oficiais:
# 1:
#   Acurácia: 0.6258373205741626
#   Método: KNN com 12 vizinhos e sem pré-processamento

# Resultados não-oficiais:
# 1:
#   Acurácia: 0.6258373205741626
#   Método: KNN com 12 vizinhos e sem pré-processamento
# 2:
#   Acurácia: 0.6577266922094508
#   Método: Random Forest com parâmetros default e sem pré-processamento
# 3:
#   Acurácia: 0.665389527458493
#   Método: Random Forest com criterion default, max_depth=7 e max_features='sqrt'; sem pré-processamento
# 4:
#   Acurácia: 0.6743295019157088
#   Método: Random Forest com criterion='entropy', max_depth=7 e max_features='auto'; sem pré-processamento
# 5:
#   Acurácia: 0.6883780332056194
#   Método: SVM com C=100, gamma='scale' e kernel='rbf'; sem pré-processamento
# 6:
#   Acurácia: 0.665389527458493
#   Método: SVM com parâmetros default e sem pré-processamento

 - Resposta do servidor:
 {"error":{"code":102,"message":"Espere ao menos 12 horas entre dois envios, tempo restante 00 dias 08 horas 58 minutos 45 segundos"}} 

