# Neural Networks vs KNN vs XGBoost

In [2]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-12.0.1-cp39-cp39-win_amd64.whl (21.5 MB)
     ---------------------------------------- 0.0/21.5 MB ? eta -:--:--
      --------------------------------------- 0.5/21.5 MB 10.2 MB/s eta 0:00:03
     - -------------------------------------- 0.8/21.5 MB 8.5 MB/s eta 0:00:03
     -- ------------------------------------- 1.3/21.5 MB 8.9 MB/s eta 0:00:03
     --- ------------------------------------ 1.7/21.5 MB 8.8 MB/s eta 0:00:03
     --- ------------------------------------ 2.1/21.5 MB 8.9 MB/s eta 0:00:03
     ---- ----------------------------------- 2.5/21.5 MB 9.0 MB/s eta 0:00:03
     ----- ---------------------------------- 2.9/21.5 MB 8.9 MB/s eta 0:00:03
     ------ --------------------------------- 3.3/21.5 MB 8.9 MB/s eta 0:00:03
     ------- -------------------------------- 3.8/21.5 MB 8.9 MB/s eta 0:00:02
     ------- -------------------------------- 4.2/21.5 MB 8.9 MB/s eta 0:00:02
     -------- ------------------------------- 4.6/21.5 


[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.feather as feather
import torch
from tqdm.notebook import tqdm
tqdm.pandas()
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

First, we organize all our data. All data has been preprocessed...

In [4]:
# First pull in all our data

df_lyrics = pd.read_feather('data_lyrics1_training').reset_index(drop=True).iloc[:550000]

In [5]:
df_lyrics.shape

(550000, 512)

In [6]:
df_titles = pd.read_feather('data_titles_training').reset_index(drop=True).iloc[:550000]

In [7]:
df_else = pd.read_feather('data_else_training').reset_index(drop=True).iloc[:550000]
df_artists = df_else['artist']
df_tags = df_else[['tag0', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5']]
df_age = df_else['age']
df_views = df_else['encoded_views']

In [8]:
df_views

0          9
1          6
2          3
3          3
4          5
          ..
549995    10
549996     4
549997     5
549998    11
549999     6
Name: encoded_views, Length: 550000, dtype: int64

In [9]:
ohe = OneHotEncoder()
df_view = ohe.fit_transform(df_views.values.reshape(-1, 1))
df_views = pd.DataFrame(df_view.todense())

In [10]:
df_views

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
549996,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
549997,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
549998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
dfs = [df_lyrics, df_titles, df_artists, df_age, df_tags, df_views]
data = pd.concat(dfs, axis=1)

In [12]:
# split up data into training and validation sets
train_idx, test_idx = train_test_split(list(range(0, len(df_views))), test_size=0.2, random_state=42)

Setup training for the Neural Network. We will try 3 different sets of hyper-parameters. 

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [13]:
data

Unnamed: 0,lyrics0,lyrics1,lyrics2,lyrics3,lyrics4,lyrics5,lyrics6,lyrics7,lyrics8,lyrics9,...,10,11,12,13,14,15,16,17,18,19
0,101,1031,7893,1015,1033,2004,2122,12028,11443,2149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,101,1031,7893,1015,1033,3398,1010,3398,1010,2157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,101,2017,2156,2023,3124,2023,3124,1005,1055,1999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,101,2092,2044,2035,1005,1055,2042,2056,1998,2589,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,101,1006,17174,1007,1024,1045,2064,1521,1056,2017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549995,101,1031,17174,1024,2720,1012,1047,2527,5910,1010,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
549996,101,1051,999,2360,1010,2064,2017,2156,1010,2011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
549997,101,2159,7249,2027,1005,2128,3666,2058,2296,2693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
549998,101,2057,3579,2006,2115,2331,2057,3579,2006,2115,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# torch.cuda.empty_cache()

In [18]:
# Get features in tensor form for neural network (ensure we only use training data here) => Also push to cuda

lyrics = torch.tensor(df_lyrics.loc[train_idx].values).type('torch.FloatTensor').to(device)
titles = torch.tensor(df_titles.loc[train_idx].values).type('torch.FloatTensor').to(device)
artists = torch.tensor(df_artists.loc[train_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)
age = torch.tensor(df_age.loc[train_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)
tags = torch.tensor(df_tags.loc[train_idx].values).type('torch.FloatTensor').to(device)
views = torch.tensor(df_views.loc[train_idx].values).type('torch.FloatTensor').to(device)

In [19]:
print(lyrics.shape[1] + titles.shape[1] + 1 + 1 + tags.shape[1])

555


In [16]:
# del lyrics, titles, artists, age, tags, views
# del model1

In [20]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Define the neural network architectures
class SongViewCountPredictor1(nn.Module):
    def __init__(self, input_size):
        super(SongViewCountPredictor1, self).__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 20)
        self.relu = nn.ReLU()
        self.sig = nn.Sigmoid()
        self.soft = nn.Softmax()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x


In [21]:
# Define a custom dataset
class SongDataset(Dataset):
    def __init__(self, lyrics, titles, artists, age, tags, views):
        self.lyrics = lyrics
        self.titles = titles
        self.artists = artists
        self.age = age
        self.tags = tags
        self.views = views

    def __len__(self):
        return len(self.views)

    def __getitem__(self, index):
        lyrics = self.lyrics[index]
        titles = self.titles[index]
        artists = self.artists[index]
        age = self.age[index]
        tags = self.tags[index]
        views = self.views[index]
        return lyrics, titles, artists, age, tags, views

# Set random seed for reproducibility
torch.manual_seed(42)

# dataset = SongDataset(lyrics, titles, artists, age, tags, views)
# dataloader = DataLoader(dataset, batch_size=2048, shuffle=True)

# Define the model and optimizer (push models to GPU)
input_size = lyrics.shape[1] + titles.shape[1] + 1 + 1 + tags.shape[1] # (artists and age are size 1)
# model1 = SongViewCountPredictor1(input_size).to(device)
# model1 = torch.Sequential(
#     nn.Linear(input_size, 1024),
#     nn.ReLU(),
#     nn.Linear(1024, 512),
#     nn.ReLU(),
#     nn.Dropout(0.2),
#     nn.Linear(512, 256),
#     nn.ReLU(),
#     nn.Linear(256, 20),
#     optim.Adam(model1.parameters(), lr=0.0001),
# ).to(device)
# model2 = SongViewCountPredictor2(input_size).to(device)
#model3 = SongViewCountPredictor3(input_size).to(device)
# optimizer1 = optim.Adam(model1.parameters(), lr=0.0001)
# optimizer2 = optim.SGD(model2.parameters(), lr=0.0025)
#optimizer3 = optim.Adam(model3.parameters(), lr=0.0025)
# criterion = nn.CrossEntropyLoss() # using MSE as lossbb

In [23]:
# #create torch sequential model with dropout
model1 = torch.nn.Sequential(
    nn.Linear(input_size, 1024),
    nn.ReLU(),
    nn.Linear(1024, 512),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Linear(256, 20),
    # nn.Softmax(dim=1) 
).to(device)

optimizer1 = optim.Adam(model1.parameters())
criterion = nn.CrossEntropyLoss()

In [24]:
dataset = SongDataset(lyrics, titles, artists, age, tags, views)
dataloader = DataLoader(dataset, batch_size=(128), shuffle=True)

In [25]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    epoch_loss1 = 0.0
    # epoch_loss2 = 0.0
    # epoch_loss3 = 0.0
    for lyrics, titles, artists, age, tags, views in dataloader:
        # move batch data to GPU
        # lyrics = lyrics.to(device)
        # titles = titles.to(device)
        # artists = artists.to(device)
        # age = age.to(device)
        # tags = tags.to(device)
        # views = views.to(device)
        # Forward pass
        inputs = torch.cat((lyrics, titles, artists, age, tags), dim=1)
        outputs1 = model1(inputs)
        
        # Compute loss
        loss1 = criterion(outputs1, torch.argmax(views, dim=1))

        # Backward pass and optimization
        optimizer1.zero_grad()
        loss1.backward()
        optimizer1.step()
        
        epoch_loss1 += loss1.item()

    print(f"Epoch [{epoch+1}/{num_epochs}]:\n Loss1: {epoch_loss1/len(dataloader):.4f}") #\n Loss2: {epoch_loss2/len(dataloader):.4f} \n Loss3: {epoch_loss3/len(dataloader):.4f}\n")

Epoch [1/10]:
 Loss1: 5.1556
Epoch [2/10]:
 Loss1: 2.3763
Epoch [3/10]:
 Loss1: 2.4105
Epoch [4/10]:
 Loss1: 2.4127
Epoch [5/10]:
 Loss1: 2.4127
Epoch [6/10]:
 Loss1: 2.6621
Epoch [7/10]:
 Loss1: 2.4117
Epoch [8/10]:
 Loss1: 2.4117
Epoch [9/10]:
 Loss1: 2.4118
Epoch [10/10]:
 Loss1: 2.4119


In [26]:
print(outputs1.shape)

torch.Size([64, 20])


In [27]:
# Transform testing data into tensors (ensure we're only locating testing data)

lyrics = torch.tensor(df_lyrics.loc[test_idx].values).type('torch.FloatTensor').to(device)
titles = torch.tensor(df_titles.loc[test_idx].values).type('torch.FloatTensor').to(device)
artists = torch.tensor(df_artists.loc[test_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)
age = torch.tensor(df_age.loc[test_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)
tags = torch.tensor(df_tags.loc[test_idx].values).type('torch.FloatTensor').to(device)
views = torch.tensor(df_views.loc[test_idx].values).type('torch.FloatTensor').to(device)

In [30]:
# Testing loop
# Set the model to evaluation mode - important for batch normalization and dropout layers
model1.eval()

dataset_test = SongDataset(lyrics, titles, artists, age, tags, views)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=True)

size = len(dataloader_test.dataset)
num_batches = len(dataloader_test)

test_loss1, correct1 = 0, 0

loss_fn = nn.CrossEntropyLoss()

# Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
# also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
with torch.no_grad():
    for lyrics, titles, artists, age, tags, views in dataloader_test:

        lyrics = lyrics.to(device)
        titles = titles.to(device)
        artists = artists.to(device)
        age = age.to(device)
        tags = tags.to(device)
        views = views.to(device)
        # predict the views count
        X = torch.cat((lyrics, titles, artists, age, tags), dim=1)
        pred1 = model1(X)
        # print(pred1)
        y = views
        test_loss1 += loss_fn(pred1, torch.argmax(y, dim = 1)).item()

        correct1 += (torch.argmax(nn.functional.softmax(pred1), dim=1) == torch.argmax(y, dim =1)).type(torch.float).sum().item()

test_loss1 /= num_batches
correct1 /= size
print (f"Test Error 1: \n Accuracy: {(100*correct1):>0.1f}%, Avg loss: {test_loss1:>8f}") 

Test Error 1: 
 Accuracy: 12.5%, Avg loss: 2.379799


  correct1 += (torch.argmax(nn.functional.softmax(pred1), dim=1) == torch.argmax(y, dim =1)).type(torch.float).sum().item()


Now we train and test our KNN. We will use a RandomizedSearch to find the right hyperparams for this type of algorithm. 

In [None]:
# Create a k-fold cross-validator
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

data_train = data.loc[train_idx]
X = data_train.drop(columns=['encoded_views'])
y = data_train['encoded_views']

parameters = {"n_neighbors": list(range(1, 30))}
    
model = KNeighborsClassifier()
random_search = RandomizedSearchCV(model, parameters, scoring="neg_mean_squared_error", cv=kfold, n_jobs=-1, verbose=100)
print(f"Fitting KNeighborsClassifer...")
random_search.fit(X, y)
print('\nKNN Regression Best Params: ', random_search.best_params_)

In [None]:
# Testing KNN
model = KNeighborsClassifier(n_neighbors=random_search.best_params_['n_neighbors'])
model.fit(X,y)

data_test = data.loc[test_idx]
X_test = data_test.drop(columns=['encoded_views'])
y_test = data_test['encoded_views']

y_test_hat = model.predict(X_test) 

test_accuracy = accuracy_score(y_test, y_test_hat) * 100

print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy))