In [1]:
import numpy as np
import csv
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from collections.abc import Iterable

# Utils Function

In [2]:
def normalize_feature(x,axis=0):
    mu = np.mean(x, axis, keepdims=True)
    sigma = np.std(x, axis, keepdims=True)
    return (x-mu)/sigma

# Build Dataset

In [3]:
df = pd.read_csv("train_advance.csv",encoding='utf-8',sep=',')
df.shape

(22924, 31)

In [4]:
class InsDataset(Dataset):
    """Students Performance dataset."""

    def __init__(self, df):
        # get all useful numeric column
        dataset = df[['Unnamed: 0.1','Unnamed: 0','number_comments','following','followers','n_posts','numbr_likes']]

        # remove rows with NAN
        dataset = dataset.dropna(axis=0,how='all')

        dataset = dataset[dataset['following'].str.isdigit()]
        dataset['following'] = dataset['following'].apply(pd.to_numeric)

        # convert to numpy array
        dataset = dataset.values
        
        x = dataset[:,:6]
        y = dataset[:,6]
        
        # Save target and predictors        
        x = normalize_feature(x)
        self.x = torch.from_numpy(x).float()
        self.y = torch.from_numpy(y).float()

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # Convert idx from tensor to list due to pandas bug (that arises when using pytorch's random_split)
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()

        return [self.x[idx], self.y[idx]]

In [5]:
dataset = InsDataset(df)

# Split into training and test
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
trainset, testset = random_split(dataset, [train_size, test_size])

# Dataloaders
trainloader = DataLoader(trainset, batch_size=128, shuffle=True)
testloader = DataLoader(testset, batch_size=128, shuffle=False)

# Use gpu if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


# MLP

In [60]:
class MLPregressior(nn.Module):
    def __init__(self):
        super(MLPregressior, self).__init__()
        self.hidden1 = nn.Linear(in_features=6, out_features=128, bias=True)
        self.hidden2 = nn.Linear(128, 256)
        self.hidden3 = nn.Linear(256, 64)
        self.predict = nn.Linear(64, 1)
        self.bn1 = nn.BatchNorm1d(128)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(64)
        
    def forward(self, x):
        x = F.gelu(self.bn1(self.hidden1(x))) 
#         x = nn.Dropout()(x)
        x = F.gelu(self.bn2(self.hidden2(x)))
#         x = nn.Dropout()(x)
        x = F.gelu(self.bn3(self.hidden3(x)))
#         x = nn.Dropout()(x)
        output = self.predict(x)
        return output[:, 0]


In [61]:
model = MLPregressior().to(device)

In [62]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

In [63]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = torch.sqrt(loss_fn(pred, y))

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss, current = loss.item(), batch * len(X)
    print(f"Train loss: {loss:>7f}")

In [64]:
def validate(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += torch.sqrt(loss_fn(pred, y))
    test_loss /= num_batches

    print(f"Valid loss: {test_loss:>8f} \n")

In [65]:
epochs = 100
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(trainloader, model, loss_fn, optimizer)
    validate(testloader, model, loss_fn)
    scheduler.step()
print("Done!")

Epoch 1
-------------------------------
Train loss: 10887.375000
Valid loss: 18356.144531 

Epoch 2
-------------------------------
Train loss: 5684.979980
Valid loss: 17914.343750 

Epoch 3
-------------------------------
Train loss: 22272.724609
Valid loss: 16856.742188 

Epoch 4
-------------------------------
Train loss: 6119.857422
Valid loss: 17258.867188 

Epoch 5
-------------------------------
Train loss: 7786.192871
Valid loss: 15556.566406 

Epoch 6
-------------------------------
Train loss: 4520.923340
Valid loss: 13863.684570 

Epoch 7
-------------------------------
Train loss: 5374.885254
Valid loss: 13002.815430 

Epoch 8
-------------------------------
Train loss: 4471.912109
Valid loss: 15402.026367 

Epoch 9
-------------------------------
Train loss: 6153.890137
Valid loss: 12657.201172 

Epoch 10
-------------------------------
Train loss: 18258.878906
Valid loss: 16035.279297 

Epoch 11
-------------------------------
Train loss: 7790.268066
Valid loss: 13824.699

Train loss: 3792.409424
Valid loss: 11964.571289 

Epoch 91
-------------------------------
Train loss: 4146.991211
Valid loss: 13758.085938 

Epoch 92
-------------------------------
Train loss: 17343.656250
Valid loss: 12014.347656 

Epoch 93
-------------------------------
Train loss: 3863.460938
Valid loss: 12303.448242 

Epoch 94
-------------------------------
Train loss: 4805.824707
Valid loss: 12574.397461 

Epoch 95
-------------------------------
Train loss: 4289.432129
Valid loss: 12416.461914 

Epoch 96
-------------------------------
Train loss: 4989.708008
Valid loss: 12020.251953 

Epoch 97
-------------------------------
Train loss: 4067.555176
Valid loss: 12256.989258 

Epoch 98
-------------------------------
Train loss: 9870.146484
Valid loss: 12463.409180 

Epoch 99
-------------------------------
Train loss: 5084.926758
Valid loss: 12147.635742 

Epoch 100
-------------------------------
Train loss: 10470.183594
Valid loss: 11932.828125 

Done!


In [441]:
df_test = pd.read_csv("test_advance.csv",encoding='utf-8',sep=',')
df_test.shape

(5731, 32)

In [446]:
dataset = df_test[['Unnamed: 0.1','Unnamed: 0','number_comments','following','followers','n_posts']]

# remove rows with NAN
dataset = dataset.dropna(axis=0,how='all')

#     dataset = dataset[dataset['following'].str.isdigit()]
dataset['following'] = dataset['following'].apply(pd.to_numeric)

# convert to numpy array
dataset = dataset.values

#     print(dataset)

dataset = normalize_feature(dataset)
dataset = torch.from_numpy(dataset).float()

# Save target and predictors
x = dataset.to(device)

model.eval()
y = model(x)
y

tensor([ 7800.9351,  4196.2764,  5653.3633,  ...,  1309.9447,   657.8683,
        18489.3867], device='cuda:0', grad_fn=<SelectBackward0>)