<a href="https://colab.research.google.com/github/Arthur-Barreto/Exoplanets/blob/main/RedeNeural/pyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating my own model using pytorch

## This notebook is based on the link: https://machinelearningmastery.com/building-a-binary-classification-model-in-pytorch/

### The goal is to build a classifier that outperforms the random forest model.

## Loading dataset
> The dataframe

In [1]:
import pandas as pd

# Colab-only step: mount Google Drive at /content/gdrive so that later cells
# can read files stored there. This import fails outside Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
# Load the dataframe pickle stored on the mounted Drive.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df_filtered = pd.read_pickle("/content/gdrive/MyDrive/logArthur/df_filtered.pkl")

In [9]:
# Preview the first rows to check which columns were loaded.
df_filtered.head()

Unnamed: 0,sky_position,max_power,period_at_max_power,transit_time_at_max_power,duration_at_max_power,has_flux,lc_flux,lc_flux_2
8,Kepler-9,27691.850489,19.276027602760276 d,143.898013,0.15 d,1,"[0.9975966720091157, 0.9973999922352278, 0.997...","[0.9975966720091157, 0.9973999922352278, 0.997..."
10,Kepler-11,3323.241291,45.373237323732376 d,148.472045,0.2 d,1,"[1.0001171265268693, 1.0000895883816208, 0.999...","[1.0001171265268693, 1.0000895883816208, 0.999..."
15,Kepler-16,150214.70284,359.41764176417644 d,352.263886,0.33 d,1,"[1.000646710395813, 1.000847578048706, 1.00074...","[1.000646710395813, 1.000847578048706, 1.00074..."
18,Kepler-19,2198.032117,65.00600060006 d,145.319322,0.15 d,1,"[1.0004145645415485, 1.0001728345558498, 1.000...","[1.0004145645415485, 1.0001728345558498, 1.000..."
19,Kepler-20,22398.1282,10.856285628562857 d,138.472644,0.33 d,1,"[0.9999672136385541, 0.9999018197503611, 0.999...","[0.9999672136385541, 0.9999018197503611, 0.999..."


## Now check the number of 0 and 1 labels, then balance the classes before applying our models

In [10]:
# Partition the rows by their `has_flux` label and report how many fall in each class.
zeros = df_filtered.loc[df_filtered['has_flux'].eq(0)]
ones = df_filtered.loc[df_filtered['has_flux'].eq(1)]

num_zeros, num_ones = zeros.shape[0], ones.shape[0]

print(f'num_zeros= {num_zeros} | num_ones= {num_ones}\n')

num_zeros= 300 | num_ones= 248



In [11]:
# Balance the two classes by undersampling the larger one down to the size of
# the smaller one (instead of a hard-coded row count), and seed the sampling
# so that re-running the notebook selects the same rows.
n_per_class = min(len(zeros), len(ones))
selected_zeros = zeros.sample(n=n_per_class, random_state=42)
selected_ones = ones.sample(n=n_per_class, random_state=42)

selected_data = pd.concat([selected_zeros, selected_ones])
# reset_index() keeps the original row labels in an "index" column, so each
# sampled row can still be traced back to `df_filtered`.
selected_data = selected_data.reset_index()

selected_data.describe()

Unnamed: 0,index,max_power,transit_time_at_max_power,has_flux
count,496.0,496.0,496.0,496.0
mean,510.441532,48504.46,183.56593,0.5
std,283.738485,280771.1,77.921522,0.500505
min,8.0,30.53407,120.754127,0.0
25%,253.75,317.9336,138.124859,0.0
50%,526.5,1002.148,148.41688,0.5
75%,755.25,4222.253,188.113336,1.0
max,983.0,4499717.0,509.292285,1.0


## Importing PyTorch

In [12]:
import torch
import numpy as np

# `lc_flux_2` stores one flux sequence per row. Build an explicit 2-D float32
# matrix first: this fails loudly if the rows do not all have the same length,
# and avoids handing an object-dtype Series straight to torch.tensor.
flux_matrix = np.asarray(selected_data["lc_flux_2"].tolist(), dtype=np.float32)
labels = selected_data["has_flux"].to_numpy()

X = torch.tensor(flux_matrix, dtype=torch.float32)
# Column vector so the targets line up with the model's (N, 1) output.
y = torch.tensor(labels, dtype=torch.float32).reshape(-1, 1)

assert X.ndim == 2 and X.shape[0] == y.shape[0], "expected one flux row per label"

## Creating a Model

In [None]:
# import torch.nn as nn
 
# class Wide(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.hidden = nn.Linear(64000, 10)
#         self.relu = nn.ReLU()
#         self.output = nn.Linear(10, 1)  # one output unit to match the (N, 1) binary target
#         self.sigmoid = nn.Sigmoid()
 
#     def forward(self, x):
#         x = self.relu(self.hidden(x))
#         x = self.sigmoid(self.output(x))
#         return x

In [13]:
import torch.nn as nn

class Deep(nn.Module):
    """Fully connected binary classifier with three ReLU hidden layers.

    The network maps a batch of flat feature vectors to one sigmoid-activated
    value per sample, i.e. an output of shape ``(batch, 1)`` in ``[0, 1]``.

    Args:
        n_features: Length of each input vector. Defaults to 64000, the value
            this notebook uses for its flux sequences.
        hidden: Width of each of the three hidden layers. Defaults to 250.
    """

    def __init__(self, n_features=64000, hidden=250):
        super().__init__()
        # Attribute names are kept stable so existing state_dicts still load.
        self.layer1 = nn.Linear(n_features, hidden)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden, hidden)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(hidden, hidden)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(hidden, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` of shape ``(batch, n_features)``."""
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x

In [14]:
# Count the parameters of the model (the Wide variant is currently disabled).
# model1 = Wide()
model2 = Deep()
# print(sum(p.numel() for p in model1.parameters()))
# With the 64000 -> 250 -> 250 -> 250 -> 1 layout this prints 16126001.
print(sum(p.numel() for p in model2.parameters()))

16126001


## Comparing Models with Cross-Validation

In [15]:
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm

def model_train(model, X_train, y_train, X_val, y_val,
                n_epochs=250, batch_size=10, lr=0.0001,
                shuffle=True, verbose=False):
    """Train ``model`` with Adam + binary cross-entropy and keep the best epoch.

    After every epoch the model is evaluated on ``(X_val, y_val)``; the weights
    of the epoch with the highest validation accuracy are restored into
    ``model`` before returning.

    Args:
        model: ``nn.Module`` whose output is a probability in ``[0, 1]`` with
            the same shape as the targets (required by ``nn.BCELoss``).
        X_train, y_train: Training inputs and float targets.
        X_val, y_val: Validation inputs and float targets.
        n_epochs: Number of passes over the training data.
        batch_size: Number of samples per optimisation step.
        lr: Adam learning rate.
        shuffle: If True, visit the training samples in a new random order
            every epoch. This matters when the training set is ordered by
            class, because contiguous batches would then contain one class only.
        verbose: If True, show a tqdm progress bar with per-batch loss/accuracy.

    Returns:
        The best validation accuracy as a Python float, or ``-inf`` if no
        epoch was run. Note that this is the maximum over epochs measured on
        the validation data itself, so it is an optimistic estimate when the
        same data is also used as the final test set.
    """
    loss_fn = nn.BCELoss()  # binary cross entropy
    optimizer = optim.Adam(model.parameters(), lr=lr)

    n_train = len(X_train)

    # Track the best validation accuracy and the weights that produced it.
    best_acc = float("-inf")
    best_weights = None

    for epoch in range(n_epochs):
        model.train()
        order = torch.randperm(n_train) if shuffle else torch.arange(n_train)

        starts = range(0, n_train, batch_size)
        bar = None
        if verbose:
            import tqdm  # local import: only needed when a progress bar is requested
            bar = tqdm.tqdm(starts, unit="batch", mininterval=0, desc=f"Epoch {epoch}")
            starts = bar

        for start in starts:
            # take a batch
            idx = order[start:start + batch_size]
            X_batch = X_train[idx]
            y_batch = y_train[idx]
            # forward pass
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            # backward pass and weight update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # per-batch metrics are only needed for the progress bar
            if bar is not None:
                batch_acc = (y_pred.round() == y_batch).float().mean()
                bar.set_postfix(loss=float(loss), acc=float(batch_acc))
        if bar is not None:
            bar.close()

        # Evaluate at the end of each epoch; no_grad avoids building an
        # autograd graph for the whole validation set.
        model.eval()
        with torch.no_grad():
            y_pred = model(X_val)
        acc = float((y_pred.round() == y_val).float().mean())
        if acc > best_acc:
            best_acc = acc
            best_weights = copy.deepcopy(model.state_dict())

    # Restore the best weights (nothing to restore if no epoch ran).
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return best_acc

In [None]:
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation; the splitter is seeded so the folds are
# the same on every run.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# scikit-learn expects 1-D array labels; only the labels are needed to build
# the folds, so a zero placeholder stands in for the (large) feature matrix.
labels_1d = y.numpy().ravel()
cv_scores = []
for train, test in kfold.split(np.zeros(len(labels_1d)), labels_1d):
    # create a fresh model for this fold, train it, and record its accuracy
    model = Deep()
    acc = model_train(model, X[train], y[train], X[test], y[test])
    print("Accuracy (Deep): %.2f" % acc)
    cv_scores.append(acc)

# summarise the per-fold accuracies
acc = np.mean(cv_scores)
std = np.std(cv_scores)
print("Model accuracy: %.2f%% (+/- %.2f%%)" % (acc*100, std*100))