# Neural Networks vs XGBoost for Regression vs Classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.feather as feather
import torch
from tqdm.notebook import tqdm
tqdm.pandas()
import sklearn
from sklearn.linear_model import LinearRegression
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb 

First, we organize all our data. All data has been preprocessed in this notebook: https://github.com/COGS118A/Group036-SP23/blob/main/Full_dataset_processing.ipynb

In [None]:
# Lyrics table: renumber the rows 0..N-1, then keep positional rows 500000-999999.
# The slice keeps its original index labels (they start at 500000, not 0).
df_lyrics = (
    pd.read_feather('Full_Processed_Data/data_lyrics1_training')
    .reset_index(drop=True)
    .iloc[500000:1000000]
)

In [None]:
# Titles table: renumber the rows 0..N-1, then keep positional rows 500000-999999.
# The slice keeps its original index labels (they start at 500000, not 0).
df_titles = (
    pd.read_feather('Full_Processed_Data/data_titles_training')
    .reset_index(drop=True)
    .iloc[500000:1000000]
)

In [None]:
# Remaining columns: same row window as the other tables (positional rows 500000-999999).
df_else = (
    pd.read_feather('Full_Processed_Data/data_else_training')
    .reset_index(drop=True)
    .iloc[500000:1000000]
)

# Pull the individual columns out of df_else for later use.
df_views = df_else['encoded_views']
df_age = df_else['age']
df_artists = df_else['artist']
df_tags = df_else[[f'tag{i}' for i in range(6)]]  # tag0 .. tag5

In [None]:
# One-hot encode the view labels (one column per distinct value in df_views).
ohe = OneHotEncoder()
view_ohe = ohe.fit_transform(df_views.values.reshape(-1, 1))

# Reuse df_views' index for the dense frame. df_views keeps the labels left by
# the .iloc[500000:1000000] slice, whereas a default DataFrame index would be
# 0..N-1; pd.concat(axis=1) aligns on index labels, so a mismatched index would
# pair one-hot rows with the wrong songs (or with NaN rows).
df_views_ohe = pd.DataFrame(view_ohe.toarray(), index=df_views.index)

In [None]:
# Assemble the feature matrix used as X by the XGBoost cells.
# The target is deliberately left out: those cells fit on `data` with df_views
# as y, so including df_views (or its one-hot encoding df_views_ohe) as columns
# would leak the label into the features. Dropping df_views_ohe also avoids
# concatenating a frame whose index (0..N-1) does not match the other frames
# (labels starting at 500000).
dfs = [df_lyrics, df_titles, df_artists, df_age, df_tags]
data = pd.concat(dfs, axis=1)

In [None]:
# Split the row positions 0..len(df_views)-1 into an 80% training part and a
# 20% held-out part; random_state pins the shuffle so the split is repeatable.
train_idx, test_idx = train_test_split(
    list(range(len(df_views))),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

### Neural Networks for Regression vs Classification

In [None]:
# Get features in tensor form for the neural network (training rows only) and push them to `device`.
#
# train_idx holds row *positions* (0..len-1, from train_test_split over range(len(df_views))),
# while the frames still carry index *labels* starting at 500000 after the
# .iloc[500000:1000000] slice. Rows therefore have to be selected by position
# with .iloc; label-based .loc would raise KeyError for these values.

lyrics = torch.tensor(df_lyrics.iloc[train_idx].values).type('torch.FloatTensor').to(device)
titles = torch.tensor(df_titles.iloc[train_idx].values).type('torch.FloatTensor').to(device)
# Single-column inputs are unsqueezed to (rows, 1) so they can be concatenated with the 2-D features.
artists = torch.tensor(df_artists.iloc[train_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)
age = torch.tensor(df_age.iloc[train_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)
tags = torch.tensor(df_tags.iloc[train_idx].values).type('torch.FloatTensor').to(device)
# Regression target, also shaped (rows, 1) to match the model's single output column.
views = torch.tensor(df_views.iloc[train_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)
#views_class = torch.tensor(df_views_ohe.iloc[train_idx].values).type('torch.FloatTensor').to(device)

In [None]:
# Sanity check: show the dimensions of the lyrics training tensor.
print(lyrics.size())

In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Regression network: feature vector in, one predicted value out
class SongViewCountPredictor1(nn.Module):
    """Small fully connected network with a single linear output unit.

    Layer widths are input_size -> 64 -> 32 -> 1. The first hidden layer is
    followed by ReLU, the second by a sigmoid, and the output layer has no
    activation.
    """

    def __init__(self, input_size):
        """Create the layers; ``input_size`` is the number of features per sample."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through the three layers and return the raw output."""
        hidden = self.relu(self.fc1(x))
        hidden = self.sig(self.fc2(hidden))
        return self.fc3(hidden)

# Define the neural network architecture for Classification
class SongViewCountPredictor2(nn.Module):
    """Fully connected classifier producing one score per class.

    Layer widths are input_size -> 1024 -> 512 -> 256 -> num_classes, with
    ReLU after each hidden layer and dropout (p=0.2) after the second one.

    ``forward`` returns raw logits rather than probabilities, because
    ``nn.CrossEntropyLoss`` applies log-softmax itself and expects
    unnormalized scores. Use ``predict_proba`` when probabilities are needed.

    Args:
        input_size: number of input features per sample.
        num_classes: number of output classes (defaults to 20).
    """

    def __init__(self, input_size, num_classes=20):
        super(SongViewCountPredictor2, self).__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(0.2)
        # nn.functional.softmax is a function that requires an input tensor, so
        # calling it here with no arguments raised TypeError. Store the module
        # form instead; dim=1 normalizes across the class axis of a 2-D batch.
        self.soft = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits for a 2-D batch ``x`` of shape (batch, input_size)."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.drop(x)
        x = self.relu(self.fc4(x))
        # No softmax here: the loss function consumes logits directly.
        return self.fc5(x)

    def predict_proba(self, x):
        """Return per-class probabilities (each row sums to 1) for a 2-D batch ``x``."""
        return self.soft(self.forward(x))

In [None]:
# Dataset that serves the six parallel feature/target containers row by row
class SongDataset(Dataset):
    """Index-aligned view over lyrics, titles, artists, age, tags and views.

    Each container is stored as given; item ``i`` is the tuple of every
    container's ``i``-th entry, in the order
    (lyrics, titles, artists, age, tags, views).
    """

    def __init__(self, lyrics, titles, artists, age, tags, views):
        self.lyrics, self.titles = lyrics, titles
        self.artists, self.age = artists, age
        self.tags, self.views = tags, views

    def __len__(self):
        # The dataset length is taken from the target container.
        return len(self.views)

    def __getitem__(self, index):
        columns = (
            self.lyrics,
            self.titles,
            self.artists,
            self.age,
            self.tags,
            self.views,
        )
        return tuple(column[index] for column in columns)

# Seed PyTorch's RNG before anything random happens (weight init, batch shuffling)
torch.manual_seed(42)

# Wrap the training tensors and serve them in shuffled mini-batches of 128
dataset = SongDataset(lyrics, titles, artists, age, tags, views)
dataloader = DataLoader(dataset, shuffle=True, batch_size=128)

# Width of the concatenated input: lyrics + titles + artists (1) + age (1) + tags
input_size = sum((lyrics.shape[1], titles.shape[1], 1, 1, tags.shape[1]))

# Regression model on the chosen device, trained with Adam (default settings) on MSE
model1 = SongViewCountPredictor1(input_size).to(device)
optimizer1 = optim.Adam(model1.parameters())
criterion = nn.MSELoss()

# Classification counterpart, currently disabled:
#model2 = SongViewCountPredictor2(input_size).to(device)
#optimizer2 = optim.Adam(model2.parameters())
#criterion2 = nn.CrossEntropyLoss() # using cross entropy as loss

In [None]:
# Train the regression model for a fixed number of passes over the data
num_epochs = 10
for epoch in range(num_epochs):
    # Running sums of the per-batch losses for this epoch.
    # epoch_loss2 stays at 0.0 while the classification path is commented out.
    epoch_loss1 = 0.0
    epoch_loss2 = 0.0
    for lyrics, titles, artists, age, tags, views in dataloader:
        # Make sure every piece of the batch lives on the training device
        lyrics, titles, artists, age, tags, views = (
            part.to(device)
            for part in (lyrics, titles, artists, age, tags, views)
        )

        # One input row per song: all feature groups side by side
        inputs = torch.cat([lyrics, titles, artists, age, tags], dim=1)

        # Clear old gradients, run the forward pass, then backpropagate the MSE
        optimizer1.zero_grad()
        outputs1 = model1(inputs)
        loss1 = criterion(outputs1, views)
        loss1.backward()
        optimizer1.step()
        epoch_loss1 += loss1.item()

        # Classification path (disabled):
        #outputs2 = model2(inputs)
        #loss2 = criterion(outputs2, views)
        #optimizer2.zero_grad()
        #loss2.backward()
        #optimizer2.step()
        #epoch_loss2 += loss2.item()

    # Report the mean per-batch loss of the epoch
    print(f"Epoch [{epoch+1}/{num_epochs}]:\n Loss1: {epoch_loss1/len(dataloader):.4f} \n Loss2: {epoch_loss2/len(dataloader):.4f}\n")

In [None]:
# Transform the held-out rows into tensors (test rows only) and push them to `device`.
#
# test_idx holds row *positions* (from train_test_split over range(len(df_views))),
# while the frames still carry index *labels* starting at 500000 after the
# .iloc[500000:1000000] slice. Rows therefore have to be selected by position
# with .iloc; label-based .loc would raise KeyError for these values.

lyrics = torch.tensor(df_lyrics.iloc[test_idx].values).type('torch.FloatTensor').to(device)
titles = torch.tensor(df_titles.iloc[test_idx].values).type('torch.FloatTensor').to(device)
# Single-column inputs are unsqueezed to (rows, 1) so they can be concatenated with the 2-D features.
artists = torch.tensor(df_artists.iloc[test_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)
age = torch.tensor(df_age.iloc[test_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)
tags = torch.tensor(df_tags.iloc[test_idx].values).type('torch.FloatTensor').to(device)
# Regression target, also shaped (rows, 1) to match the model's single output column.
views = torch.tensor(df_views.iloc[test_idx].values).unsqueeze(1).type('torch.FloatTensor').to(device)

In [None]:
# Testing loop: evaluate the regression model on the held-out tensors
# Set the model to evaluation mode - important for batch normalization and dropout layers
model1.eval()
#model2.eval()

dataset_test = SongDataset(lyrics, titles, artists, age, tags, views)
# No shuffling at test time: batch order does not change the metrics, and a
# fixed order keeps the collected predictions reproducible between runs.
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=False)

size = len(dataloader_test.dataset)
num_batches = len(dataloader_test)

test_loss1, correct1 = 0, 0
test_loss2, correct2 = 0, 0  # stay at 0 while the classification path is disabled

loss_fn = nn.MSELoss()
loss_fn2 = nn.CrossEntropyLoss()

# Flat lists of per-sample predictions and targets (used by the scatter plot)
all_predictions = []
all_labels = []

# Per-batch MSE and the matching x-axis values (used by the MSE plot).
# `epochs` keeps its original name because the plotting cell reads it, but it
# holds test-batch indices: there are no epochs during evaluation.
mse_values = []
mse_vals = mse_values  # original name, kept as an alias
epochs = []

# A prediction counts as "correct" when it is within this distance of the target.
# NOTE(review): 1000 looks sized for raw view counts; confirm it is meaningful
# for the 'encoded_views' target.
tolerance = 1000

# Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
# also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
with torch.no_grad():
    for batch_idx, (lyrics, titles, artists, age, tags, views) in enumerate(dataloader_test):
        # move batch data to GPU
        lyrics = lyrics.to(device)
        titles = titles.to(device)
        artists = artists.to(device)
        age = age.to(device)
        tags = tags.to(device)
        views = views.to(device)

        # predict the views count
        X = torch.cat((lyrics, titles, artists, age, tags), dim=1)
        pred1 = model1(X)
        #pred2 = model2(X)

        y = views
        batch_mse = loss_fn(pred1, y).item()
        test_loss1 += batch_mse

        # pred1 and y are both (batch, 1), so compare them element-wise.
        # argmax(1) must not be used here: on a single-column regression output
        # it is always 0, and its (batch,) shape would broadcast against y to
        # (batch, batch).
        correct1 += ((pred1 - y).abs() < tolerance).type(torch.float).sum().item()

        all_predictions.extend(pred1.squeeze(1).cpu().numpy())
        all_labels.extend(y.squeeze(1).cpu().numpy())

        # Store this batch's MSE together with its position in the test run
        mse_values.append(batch_mse)
        epochs.append(batch_idx)

        #test_loss2 += loss_fn(pred2, y).item()
        #correct2 += (pred2.argmax(1) == y).type(torch.float).sum().item()

test_loss1 /= num_batches
correct1 /= size

#test_loss2 /= num_batches
#correct2 /= size

print(f"Test Error 1: \n Accuracy: {(100*correct1):>0.1f}%, Avg loss: {test_loss1:>8f} \n Test Error 2: \n Accuracy: {(100*correct2):>0.1f}%, Avg loss: {test_loss2:>8f} \n")

In [None]:
# Scatter plot of observed vs predicted values for the neural network.
# The data comes from all_labels / all_predictions, the lists filled by the
# testing loop; the previous names (rf_target_test, rf_predictions) are not
# defined anywhere in this notebook. np.ravel flattens each list into a 1-D
# array so both axes have one value per sample regardless of element shape.
plt.scatter(np.ravel(all_labels), np.ravel(all_predictions), alpha=0.5)
plt.title('Scatter plot of Observed vs Predicted Values')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.show()

In [None]:
# Line chart of the recorded MSE values against the recorded `epochs` values
axes = plt.gca()
axes.plot(epochs, mse_values)
axes.set_xlabel('Epochs')
axes.set_ylabel('MSE')
axes.set_title('MSE over Epochs')
plt.show()

### XGBoost for Regression

In [None]:
# XGBoost classifier tuned with random search on the GPU
# NOTE(review): this section is headed "Regression", but the estimator below is
# a multi-class classifier (objective='multi:softmax'), identical to the one in
# the "Classification" section - confirm which was intended.

# Candidate values for each hyper-parameter sampled by the random search
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
}

# Import the class directly instead of going through the `xgb` alias: the
# assignment below rebinds `xgb` to an estimator instance, so
# `xgb.XGBClassifier` raises AttributeError as soon as this cell is re-run.
from xgboost import XGBClassifier

# The name `xgb` is kept because the search cell passes it as the estimator;
# be aware that from here on it no longer refers to the xgboost module.
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600,
                    objective='multi:softmax', nthread=1,
                    tree_method='gpu_hist', verbosity=3)

In [None]:
# Random search configuration: 5 sampled parameter combinations, 3-fold CV each
folds = 3
param_comb = 5

# Shuffled K-fold splitter; the fixed seed makes the folds repeatable
kf = KFold(n_splits=folds, shuffle=True, random_state=42)

# Pass the splitter itself as `cv` rather than the generator from kf.split(...):
# a generator can only be consumed once, so a second fit() on the same search
# object would find no splits. The splitter produces the same folds for the
# data given to fit() and avoids materialising data.iloc[train_idx] just to
# build the split generator.
random_search = RandomizedSearchCV(xgb, param_distributions=params,
                                   n_iter=param_comb, scoring='accuracy',
                                   n_jobs=1, cv=kf, verbose=3, random_state=42)

In [None]:
# Run the search on the training rows only (selected by position)
random_search.fit(
    data.iloc[train_idx],
    y=df_views.iloc[train_idx],
)

In [None]:
# Show the hyper-parameter combination that scored best in the random search
random_search.best_params_

In [None]:
# Accuracy of the search's predictions on the held-out rows
accuracy_score(
    y_true=df_views.iloc[test_idx],
    y_pred=random_search.predict(data.iloc[test_idx]),
)

In [None]:
# Build the final model that is trained on all training data.
# The hyper-parameters are hard-coded - presumably copied from an earlier
# random_search.best_params_ output; confirm they match the current search.
#
# Import the class directly: by this point `xgb` has been rebound to an
# estimator instance by the search-setup cell, so `xgb.XGBClassifier` would
# raise AttributeError.
from xgboost import XGBClassifier

xgb_model = XGBClassifier(learning_rate=0.02, n_estimators=600,
                          objective='multi:softmax', nthread=1,
                          tree_method='gpu_hist', verbosity=3,
                          min_child_weight=10, gamma=1.5, subsample=0.8,
                          colsample_bytree=0.6, max_depth=4)

In [None]:
# Fit the final model on the training rows (selected by position)
xgb_model.fit(
    data.iloc[train_idx],
    y=df_views.iloc[train_idx],
)

In [None]:
# Accuracy of the final model on the held-out rows
accuracy_score(
    y_true=df_views.iloc[test_idx],
    y_pred=xgb_model.predict(data.iloc[test_idx]),
)

### XGBoost for Classification

In [None]:
# XGBoost classifier tuned with random search on the GPU

# Candidate values for each hyper-parameter sampled by the random search
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
}

# Import the class directly instead of going through the `xgb` alias: an
# earlier cell (and the assignment below) rebinds `xgb` to an estimator
# instance, so `xgb.XGBClassifier` raises AttributeError here.
from xgboost import XGBClassifier

# The name `xgb` is kept because the search cell passes it as the estimator;
# be aware that it does not refer to the xgboost module.
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600,
                    objective='multi:softmax', nthread=1,
                    tree_method='gpu_hist', verbosity=3)

In [None]:
# Random search configuration: 5 sampled parameter combinations, 3-fold CV each
folds = 3
param_comb = 5

# Shuffled K-fold splitter; the fixed seed makes the folds repeatable
kf = KFold(n_splits=folds, shuffle=True, random_state=42)

# Pass the splitter itself as `cv` rather than the generator from kf.split(...):
# a generator can only be consumed once, so a second fit() on the same search
# object would find no splits. The splitter produces the same folds for the
# data given to fit() and avoids materialising data.iloc[train_idx] just to
# build the split generator.
random_search = RandomizedSearchCV(xgb, param_distributions=params,
                                   n_iter=param_comb, scoring='accuracy',
                                   n_jobs=1, cv=kf, verbose=3, random_state=42)

In [None]:
# Run the search on the training rows only (selected by position)
random_search.fit(
    data.iloc[train_idx],
    y=df_views.iloc[train_idx],
)

In [None]:
# Show the hyper-parameter combination that scored best in the random search
random_search.best_params_

In [None]:
# Accuracy of the search's predictions on the held-out rows
accuracy_score(
    y_true=df_views.iloc[test_idx],
    y_pred=random_search.predict(data.iloc[test_idx]),
)

In [None]:
# Build the final model that is trained on all training data.
# The hyper-parameters are hard-coded - presumably copied from an earlier
# random_search.best_params_ output; confirm they match the current search.
#
# Import the class directly: by this point `xgb` has been rebound to an
# estimator instance by the search-setup cell, so `xgb.XGBClassifier` would
# raise AttributeError.
from xgboost import XGBClassifier

xgb_model = XGBClassifier(learning_rate=0.02, n_estimators=600,
                          objective='multi:softmax', nthread=1,
                          tree_method='gpu_hist', verbosity=3,
                          min_child_weight=10, gamma=1.5, subsample=0.8,
                          colsample_bytree=0.6, max_depth=4)

In [None]:
# Fit the final model on the training rows (selected by position)
xgb_model.fit(
    data.iloc[train_idx],
    y=df_views.iloc[train_idx],
)

In [None]:
# Accuracy of the final model on the held-out rows
accuracy_score(
    y_true=df_views.iloc[test_idx],
    y_pred=xgb_model.predict(data.iloc[test_idx]),
)