# ДЗ 2.
Используя датасет [seattle-weather.csv](https://drive.google.com/file/d/1vUyPXp2HVa3P4SMTZN9kgwuTu-_dlPWd/view?usp=sharing)
Предскажите погодные условия различными методами.

Входные данные:
* precipitation - величина осадков
* tempmax - максимальная дневная температура
* tempmin - минимальная дневная температура
* wind - скорость ветра

Погодные условия:
* drizzle - моросящий дождь, изморось
* rain - дождь
* sun - солнечно
* snow - снег
* fog - туман

Каждый уникальный метод оценивается 1 баллом. 

Максимальное количество баллов за ДЗ: 3.

Студент, получивший максимальную точность прогноза в группе автоматоматически получает высший балл на экзамене. Для теста использовать последние 400 строк датасета.

Отправка ноутбуков с результатами обязательна: https://forms.gle/xruukTRpumJswSpQ8

## Подготовка данных

In [95]:
import random

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [96]:
random.seed(0)
np.random.seed(0)

In [97]:
data = pd.read_csv('seattle-weather.csv')
data

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...,...
1456,2015-12-27,8.6,4.4,1.7,2.9,rain
1457,2015-12-28,1.5,5.0,1.7,1.3,rain
1458,2015-12-29,0.0,7.2,0.6,2.6,fog
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun


In [98]:
X = data.drop(columns=['date', 'weather'])
X[:] = StandardScaler().fit(X).transform(X)
y = data['weather']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400, shuffle=False)
X_train

Unnamed: 0,precipitation,temp_max,temp_min,wind
0,-0.453650,-0.495299,-0.644212,1.014980
1,1.178598,-0.794731,-1.082347,0.875833
2,-0.333852,-0.645015,-0.206077,-0.654780
3,2.586224,-0.576962,-0.524720,1.014980
4,-0.258978,-1.026111,-1.082347,1.989006
...,...,...,...,...
1056,-0.378776,-0.958058,-0.305653,1.014980
1057,1.328345,-0.495299,-0.524720,1.293273
1058,-0.258978,-0.645015,-0.763703,0.388820
1059,2.286729,-0.345583,0.232058,0.875833


In [99]:
y_train

0       drizzle
1          rain
2          rain
3          rain
4          rain
         ...   
1056       rain
1057       rain
1058       rain
1059       rain
1060       rain
Name: weather, Length: 1061, dtype: object

In [100]:
y.unique()

array(['drizzle', 'rain', 'sun', 'snow', 'fog'], dtype=object)

## Метод ближайших соседей

In [101]:
class KNNClassifier():
    def __find_nearest(self, x: np.array):
        distances = np.sum(np.power((self.X - x), 2), axis=1)
        weather_type = self.y[distances.argmin()]
        for i in self.weather:
            if self.weather[i] == weather_type:
                weather_type = i
                break
        return weather_type

    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> None:
        weather_types = y.unique()
        self.weather = {weather_types[i]: i for i in range(len(weather_types))}
        self.y = np.array([self.weather[i] for i in y])
        self.X = np.asarray(X)

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        X = np.asarray(X)
        y = []
        for i in X:
            y.append(self.__find_nearest(i))
        return np.array(y)

In [102]:
KNNmodel = KNNClassifier()
KNNmodel.fit(X_train, y_train)
print('KNNClassifiers accuracy:', np.mean(KNNmodel.predict(X_test) == y_test.values))

KNNClassifiers accuracy: 0.67


## Градиентный спуск

In [103]:
class GradientClassifier():
    def __init__(self, lr=0.001, steps=20000) -> None:
        self.lr = lr
        self.steps = steps
        self.w = {}

    def __calc_mse_grad(self, X: np.ndarray, y: np.ndarray, w: np.ndarray) -> np.ndarray:
        return 2 / X.shape[0] * np.dot(np.transpose(X), np.subtract(np.dot(X, w), y))

    def __gradient_descent(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        w_init = np.random.uniform(-10, 10, X.shape[1])
        for i in range(self.steps):
            w_init = w_init - self.lr * self.__calc_mse_grad(X, y, w_init)
        return w_init

    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> None:
        X = np.asarray(X)
        X = np.hstack([X, np.ones([X.shape[0], 1])])
        y = pd.get_dummies(y)
        for key in y:
            y_emp = np.asarray(y[key])
            self.w[key] = self.__gradient_descent(X, y_emp)
    
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        X = np.asarray(X)
        X = np.hstack([X, np.ones([X.shape[0], 1])])
        zero_data = np.zeros(shape=(len(X), len(self.w.keys())))
        y = pd.DataFrame(zero_data, columns=self.w.keys())
        for key in y:
            y[key] = np.dot(X, self.w[key])
        return pd.get_dummies(y).idxmax(axis=1)

In [104]:
Gradientmodel = GradientClassifier(0.01, 5_000)
Gradientmodel.fit(X_train, y_train)
print('GradientClassifiers accuracy:', np.mean(Gradientmodel.predict(X_test).values == y_test.values))

GradientClassifiers accuracy: 0.705


## Решающее дерево
Не написанное, а подгружаемое из библиотеки и имеющее лучший результат. 

In [105]:
from sklearn.tree import DecisionTreeClassifier

In [106]:
tree_clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
tree_clf.fit(X_train, y_train)
print('DecisionTreeClassifiers accuracy:', np.mean(tree_clf.predict(X_test) == y_test.values))

DecisionTreeClassifiers accuracy: 0.8375


# Нейронная сеть

In [107]:
import torch
from sklearn import preprocessing

In [108]:
def set_seed(seed = 123):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [109]:
class simple_model(torch.nn.Module):
    def __init__(self, in_features = 4, out_features = 5):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(in_features, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 16),
            torch.nn.ReLU(),
            torch.nn.ReLU(),
            torch.nn.Linear(16, out_features)
        )


    def forward(self, input):
        output = self.net(input)
        return output

In [110]:
set_seed(1)
model = simple_model()
loss = torch.nn.CrossEntropyLoss()
label_encoder = preprocessing.LabelEncoder().fit(y_train)
optimizer = torch.optim.Adamax(model.parameters(), lr=0.01)
for epoch in range(1000):
    X_train_inp = torch.tensor(X_train.values, dtype=torch.float32)
    y_train_inp = torch.tensor(pd.get_dummies(label_encoder.transform(y_train)).values, dtype=torch.float32)
    y_pred = model(X_train_inp)
    optimizer.zero_grad()
    loss_value = loss(y_pred, y_train_inp)
    loss_value.backward()
    optimizer.step()
with torch.no_grad():
    X_test_inp = torch.tensor(X_test.values, dtype=torch.float32)
    lab, y_preds = torch.max(model(X_test_inp), 1)
    print((label_encoder.inverse_transform(y_preds) == y_test).mean())

0.8375
