# 8. SGD

In [19]:
def batch_iterate(x, y, batch_size):
    """Yield consecutive, aligned mini-batches of ``x`` and ``y``.

    Both inputs are sliced with the same ``[start:start + batch_size]``
    windows, in order, so every yielded pair covers the same positions.
    The last batch is shorter when the data size is not a multiple of
    ``batch_size``. Empty inputs yield nothing.

    Args:
        x: Sliceable sequence of samples (anything supporting ``len`` and
            slice indexing).
        y: Sliceable sequence of targets, the same length as ``x``.
        batch_size: Positive integer, maximum number of items per batch.

    Yields:
        Tuples ``(x_batch, y_batch)`` of matching slices.

    Raises:
        AssertionError: If ``x`` and ``y`` have different lengths.
        ValueError: If ``batch_size`` is not positive (a non-positive step
            would otherwise never advance and loop forever).
    """
    assert len(x) == len(y)
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(x), batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [102]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

SEED = 42  # fixed seed so the train/test split and the SGD updates are reproducible

scaler = MinMaxScaler()
model = SGDClassifier(random_state=SEED)

df = pd.read_csv("mushrooms.tsv", sep="\t", header=None).dropna()
# Drop rows whose column 12 holds the "?" placeholder; copy so the label
# assignment below writes to an independent frame, not a filtered view.
df = df[df[12] != "?"].copy()
df[0] = df[0].apply(lambda x: 0 if x == "e" else 1)  # encode label: "e" -> 0, anything else -> 1

# One-hot encode every column except the label (column 0)
df_dummies = pd.get_dummies(df, columns=df.columns[1:])
# Features must exclude the label column: `df_dummies[1:]` slices ROWS, so the
# label used to stay inside X and leaked the answer to the model.
# The first row is still skipped, as before — presumably a header row; TODO confirm.
X = df_dummies.drop(columns=[0]).iloc[1:]
y = df_dummies[0].iloc[1:]

# Train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)
y_train = y_train.to_numpy().ravel()
y_exp = y_test.to_numpy().ravel()
# Fit the scaler on the training data only, then apply the same transform to the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Mini-batch SGD
# partial_fit needs the full set of classes across all calls; a single batch
# may contain only one class, so derive them from the whole training target.
all_classes = np.unique(y_train)
batch_iterator = batch_iterate(X_train_scaled, y_train, batch_size=100)
for x_batch, y_batch in batch_iterator:
    model.partial_fit(x_batch, y_batch, classes=all_classes)

# Predict on test data preprocessed the same way as the training data
y_pred = model.predict(X_test_scaled)

# Calculate root mean square error and F-score
# (np.sqrt of the MSE instead of the deprecated/removed `squared=False` argument)
error = np.sqrt(mean_squared_error(y_exp, y_pred))
score = f1_score(y_exp, y_pred)
print(f"RSME: {error}")
print(f"F-score: {score}")

RSME: 0.0
F-score: 1.0
