In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.feather as feather
import torch
from tqdm.notebook import tqdm
tqdm.pandas()
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Read the feather file named below into the DataFrame `df`.
file_name = 'data_feather_embededAll'
df = pd.read_feather(file_name)

In [None]:
# Show the loaded frame as this cell's output.
df

In [None]:
# Keep only the first 10,000 rows for the small-scale experiments.
df_small = df.head(10000)

In [None]:
# Display the rows of the subset whose 'views' value is greater than 5000.
df_small.loc[df_small['views'].gt(5000)]

In [None]:
## Below we train a neural network on the small dataset

In [None]:
import torch

# Read each feature file and keep only its first 10,000 rows.
df_titles = pd.read_feather('data_feather_embededTitles').head(10000)
df_lyrics = pd.read_feather('data_feather_embededLyrics').head(10000)
df_else = pd.read_feather('data_feather_embededElse').head(10000)

# Split the remaining-features frame into its individual column groups.
df_artists = df_else['artist']
df_tags = df_else[['0', '1', '2', '3', '4', '5']]
df_age = df_else['song_age']
df_views = df_else['views']

# Convert every group to a float32 CPU tensor; single columns are reshaped
# from (N,) to (N, 1) so they can be concatenated with the 2-D groups.
lyrics = torch.tensor(df_lyrics.values).float()
titles = torch.tensor(df_titles.values).float()
artists = torch.tensor(df_artists.values).float().unsqueeze(1)
age = torch.tensor(df_age.values).float().unsqueeze(1)
tags = torch.tensor(df_tags.values).float()
views = torch.tensor(df_views.values).float().unsqueeze(1)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Define the neural network architecture
class SongViewCountPredictor(nn.Module):
    """Fully connected regressor: input_size -> 64 -> 32 -> 1 with ReLU between layers."""

    def __init__(self, input_size):
        """Create the three linear layers; `input_size` is the width of one input row."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the single-value prediction for every row of `x`."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        # The output layer is left linear (no activation).
        return self.fc3(hidden)

# Define a custom dataset
class SongDataset(Dataset):
    """Dataset that returns one song's feature groups and its view count per index."""

    def __init__(self, lyrics, titles, artists, age, tags, views):
        """Store the index-aligned containers without copying them."""
        self.lyrics, self.titles, self.artists = lyrics, titles, artists
        self.age, self.tags, self.views = age, tags, views

    def __len__(self):
        """Number of samples, taken from the length of `views`."""
        return len(self.views)

    def __getitem__(self, index):
        """Return (lyrics, titles, artists, age, tags, views) at `index`."""
        fields = (self.lyrics, self.titles, self.artists, self.age, self.tags, self.views)
        return tuple(field[index] for field in fields)

# Set random seed for reproducibility
torch.manual_seed(42)

# Wrap the tensors in the dataset and serve them in shuffled mini-batches of 32.
dataset = SongDataset(
    lyrics=lyrics,
    titles=titles,
    artists=artists,
    age=age,
    tags=tags,
    views=views,
)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

In [None]:
# Define the model and optimizer.
# The input width is the sum of the widths of all concatenated feature groups
# (artists and age are single columns).
input_size = lyrics.shape[1] + titles.shape[1] + artists.shape[1] + age.shape[1] + tags.shape[1]
model = SongViewCountPredictor(input_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training loop
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    # Batch variables carry a `batch_` prefix so the full-size global tensors
    # (lyrics, titles, ...) are not overwritten by the last mini-batch; this
    # keeps the dataset cell safe to re-run after training.
    for batch_lyrics, batch_titles, batch_artists, batch_age, batch_tags, batch_views in dataloader:
        # Forward pass on the concatenated feature row
        inputs = torch.cat((batch_lyrics, batch_titles, batch_artists, batch_age, batch_tags), dim=1)
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs, batch_views)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(dataloader):.4f}")

In [None]:
## Below we're doing KNN

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

# Separate the feature columns from the 'views' target column.
X_train = df_small.drop('views', axis=1)
y_train = df_small.loc[:, 'views']

# Create and train the KNN regressor
k = 11  # Number of neighbors
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train, y_train)

In [None]:
## Evaluate both models

In [None]:
# Row position used for the single-song spot checks in the following cells.
# NOTE: 9899 lies inside the first 10,000 rows that the models above were
# fitted on, so these spot checks are in-sample, not held-out.
tester = 9899

In [None]:
# Predict the view count of the song at row position `tester` with the KNN model.
# NOTE: when `tester` is below 10,000 the row is part of the data the KNN was
# fitted on, so this is an in-sample sanity check rather than a prediction
# for an unseen song.
# Selecting with a list keeps a one-row DataFrame (original column names and
# dtypes), which is what `predict` expects; no transpose is needed.
new_song_features = df.iloc[[tester]].drop(columns='views')
predicted_view_count = knn.predict(new_song_features)
print("Predicted view count:", predicted_view_count[0])
print("Actual view count: ", df.iloc[tester]['views'])

In [None]:
# Predict view counts for a new song
# Re-read each feature file and keep only the row at position `tester`.
# After this cell the df_* names hold a single row (or scalar), not full frames.
df_titles = pd.read_feather('data_feather_embededTitles').iloc[tester]
df_lyrics = pd.read_feather('data_feather_embededLyrics').iloc[tester]
df_else = pd.read_feather('data_feather_embededElse').iloc[tester]

# Pick the individual feature groups out of the remaining-features row.
df_artists = df_else['artist']
df_tags = df_else[['0', '1', '2', '3', '4', '5']]
df_age = df_else['song_age']
df_views = df_else['views']

In [None]:
# Build float32 tensors for the single selected song. Converting through an
# explicit float32 dtype also works if the row was upcast to object dtype.
lyrics = torch.tensor(df_lyrics.to_numpy(dtype='float32'))
titles = torch.tensor(df_titles.to_numpy(dtype='float32'))
artists = torch.tensor([float(df_artists)], dtype=torch.float32)
age = torch.tensor([float(df_age)], dtype=torch.float32)
tags = torch.tensor(df_tags.to_numpy(dtype='float32'))
views = torch.tensor([float(df_views)], dtype=torch.float32)

# Same feature order as during training, concatenated into one 1-D input row.
new_input = torch.cat((lyrics, titles, artists, age, tags), dim=0)

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    predicted_view_count = model(new_input)
print("Predicted view count:", predicted_view_count.item())
# Print the scalar value rather than the tensor repr, matching the line above.
print("Actual view count: ", views.item())

In [None]:
# Evaluate the model
# NOTE: the score is computed on the same X_train / y_train the KNN was fitted
# on, so it measures training fit only, not generalization.
train_score = knn.score(X_train, y_train)
print(f"Train R^2 score: {train_score:.4f}")

# Neural Networks vs KNN vs Random Forest Regression for 100000 datapoints

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.feather as feather
import torch
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.linear_model import LinearRegression
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn import neighbors

First, we organize all our data. This test uses 100,000 data points that have been pre-processed elsewhere: the lyrics and titles have been converted to embeddings, the tags have been one-hot encoded, the age and views have been transformed, and the artists have been label encoded (though we will probably re-encode the artists by ranking them first and then labeling them).

In [None]:
# First pull in all our data

# Full tables: everything in one frame, plus the per-group feature files.
df_all = pd.read_feather('data_feather_embededAll')
df_lyrics = pd.read_feather('data_feather_embededLyrics')
df_titles = pd.read_feather('data_feather_embededTitles')
df_else = pd.read_feather('data_feather_embededElse')

# Single-column groups and the tag columns come from the remaining-features frame.
df_artists, df_age, df_views = (df_else[column] for column in ('artist', 'song_age', 'views'))
df_tags = df_else[['0', '1', '2', '3', '4', '5']]

# Collect all frames/series in one list.
dfs = [
    df_all,
    df_lyrics,
    df_titles,
    df_artists,
    df_age,
    df_tags,
    df_views,
]

In [None]:
# Split the row labels into training and testing sets (80/20, fixed seed).
# The labels come from df_all's index instead of a hard-coded row count, so the
# split keeps working if the dataset size changes. For a default 0..N-1 index
# this yields exactly the same label list as list(range(N)).
# NOTE(review): the later `.loc[train_idx]` lookups assume the other frames
# (lyrics, titles, else) share df_all's index - confirm.
train_idx, test_idx = train_test_split(df_all.index.tolist(), test_size=0.2, random_state=42)

Set up training for the neural network. We will try three different network architectures (sets of hyper-parameters).

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Get features in tensor form for neural network (ensure we only use training data here) => Also push to cuda

# 2-D groups become float32 tensors directly; single columns are reshaped to
# (N, 1). Each tensor is cast on the CPU first and then moved to `device`.
lyrics = torch.tensor(df_lyrics.loc[train_idx].values).float().to(device)
titles = torch.tensor(df_titles.loc[train_idx].values).float().to(device)
artists = torch.tensor(df_artists.loc[train_idx].values).float().reshape(-1, 1).to(device)
age = torch.tensor(df_age.loc[train_idx].values).float().reshape(-1, 1).to(device)
tags = torch.tensor(df_tags.loc[train_idx].values).float().to(device)
views = torch.tensor(df_views.loc[train_idx].values).float().reshape(-1, 1).to(device)

In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Define the neural network architectures
class SongViewCountPredictor1(nn.Module):
    """Baseline regressor: input_size -> 64 -> 32 -> 1 with ReLU activations.

    The constructor also creates the extra layers (fc4, fc5, fc6, sigmoid)
    used by the deeper subclasses, so all three variants share one layer
    definition. Layers that a variant's `forward` does not call receive no
    gradients.
    """

    def __init__(self, input_size):
        """Create all layers; `input_size` is the width of one input row."""
        # Zero-argument super() binds to this class. The previous
        # super(SongViewCountPredictor, self) named a different class and
        # failed at construction time.
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.fc4 = nn.Linear(32, 16)
        self.fc5 = nn.Linear(16, 4)
        self.fc6 = nn.Linear(4, 1)
        self.relu = nn.ReLU()
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Return predictions of shape (batch, 1) using fc1 -> fc2 -> fc3."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class SongViewCountPredictor2(SongViewCountPredictor1):
    """Deeper variant: input_size -> 64 -> 32 -> 16 -> 4 -> 1.

    ReLU follows fc1, fc2 and fc4; fc5 and fc6 are applied without activation.
    """

    def __init__(self, input_size):
        """Build the shared layers by forwarding `input_size` to the base class."""
        super().__init__(input_size)

    def forward(self, x):
        """Return predictions of shape (batch, 1) using fc1, fc2, fc4, fc5, fc6."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc4(x))
        x = self.fc5(x)
        x = self.fc6(x)
        return x


class SongViewCountPredictor3(SongViewCountPredictor1):
    """Deeper variant with sigmoid activations after fc4 and fc5.

    Layout: input_size -> 64 -> 32 -> 16 -> 4 -> 1, with ReLU after fc1 and
    fc2, sigmoid after fc4 and fc5, and a linear output layer.
    """

    def __init__(self, input_size):
        """Build the shared layers by forwarding `input_size` to the base class."""
        super().__init__(input_size)

    def forward(self, x):
        """Return predictions of shape (batch, 1) using fc1, fc2, fc4, fc5, fc6."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sig(self.fc4(x))
        x = self.sig(self.fc5(x))
        x = self.fc6(x)
        return x

In [None]:
# Define a custom dataset
class SongDataset(Dataset):
    """Index-aligned access to the song feature groups and their view counts."""

    def __init__(self, lyrics, titles, artists, age, tags, views):
        """Keep references to the six parallel containers."""
        self.lyrics, self.titles = lyrics, titles
        self.artists, self.age = artists, age
        self.tags, self.views = tags, views

    def __len__(self):
        """Sample count, defined by the length of `views`."""
        return len(self.views)

    def __getitem__(self, index):
        """Return the tuple (lyrics, titles, artists, age, tags, views) for `index`."""
        sample = (
            self.lyrics[index],
            self.titles[index],
            self.artists[index],
            self.age[index],
            self.tags[index],
            self.views[index],
        )
        return sample

# Set random seed for reproducibility
torch.manual_seed(42)

# Training data served in shuffled mini-batches of 128.
dataset = SongDataset(
    lyrics=lyrics,
    titles=titles,
    artists=artists,
    age=age,
    tags=tags,
    views=views,
)
dataloader = DataLoader(dataset, shuffle=True, batch_size=128)

# Input width: the three 2-D feature groups plus one column each for artist and age.
input_size = sum(group.shape[1] for group in (lyrics, titles, tags)) + 1 + 1

# One model per architecture, each moved to `device`, with its own Adam optimizer.
model1 = SongViewCountPredictor1(input_size).to(device)
model2 = SongViewCountPredictor2(input_size).to(device)
model3 = SongViewCountPredictor3(input_size).to(device)
optimizer1 = optim.Adam(params=model1.parameters(), lr=0.005)
optimizer2 = optim.Adam(params=model2.parameters(), lr=0.005)
optimizer3 = optim.Adam(params=model3.parameters(), lr=0.005)
criterion = nn.MSELoss()  # mean squared error as the training loss

In [None]:
# Training loop: the three models are trained side by side on identical batches.
num_epochs = 10
networks = (model1, model2, model3)
network_optimizers = (optimizer1, optimizer2, optimizer3)

for net in networks:
    net.train()

for epoch in range(num_epochs):
    epoch_losses = [0.0 for _net in networks]
    # `batch` is a local tuple, so the full-size global tensors (lyrics,
    # titles, ...) are no longer overwritten by the last mini-batch.
    for batch in dataloader:
        # Move the batch to the target device (a no-op if it is already there).
        # The dataset yields (lyrics, titles, artists, age, tags, views).
        *batch_features, batch_views = (tensor.to(device) for tensor in batch)
        inputs = torch.cat(batch_features, dim=1)

        # The models are independent, so each one gets its own
        # forward / backward / optimizer step on the shared inputs.
        for position, (net, net_optimizer) in enumerate(zip(networks, network_optimizers)):
            loss = criterion(net(inputs), batch_views)
            net_optimizer.zero_grad()
            loss.backward()
            net_optimizer.step()
            epoch_losses[position] += loss.item()

    # Mean per-batch loss of every model for this epoch. The message is built
    # from adjacent f-strings; a single-quoted string cannot span lines.
    mean_losses = [total / len(dataloader) for total in epoch_losses]
    print(
        f"Epoch [{epoch+1}/{num_epochs}], "
        f"Loss1: {mean_losses[0]:.4f}, "
        f"Loss2: {mean_losses[1]:.4f}, "
        f"Loss3: {mean_losses[2]:.4f}"
    )

In [None]:
# Transform testing data into tensors (ensure we're only locating testing data)

# Same conversion as for the training split, applied to the held-out labels:
# float32 cast on the CPU, single columns reshaped to (N, 1), then moved to `device`.
lyrics = torch.tensor(df_lyrics.loc[test_idx].values).float().to(device)
titles = torch.tensor(df_titles.loc[test_idx].values).float().to(device)
artists = torch.tensor(df_artists.loc[test_idx].values).float().reshape(-1, 1).to(device)
age = torch.tensor(df_age.loc[test_idx].values).float().reshape(-1, 1).to(device)
tags = torch.tensor(df_tags.loc[test_idx].values).float().to(device)
views = torch.tensor(df_views.loc[test_idx].values).float().reshape(-1, 1).to(device)

In [None]:
# Testing loop
# Set the models to evaluation mode - important for batch normalization and dropout layers
model1.eval()
model2.eval()
model3.eval()

dataset_test = SongDataset(lyrics, titles, artists, age, tags, views)
# Shuffling is unnecessary for evaluation; a fixed order keeps runs comparable.
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=False)

size = len(dataloader_test.dataset)

# Squared errors are summed over every sample and divided by the sample count
# afterwards, so the result is the exact test MSE even if the last batch is
# smaller than the others.
loss_fn = nn.MSELoss(reduction="sum")
test_loss1, test_loss2, test_loss3 = 0.0, 0.0, 0.0

# torch.no_grad() disables gradient tracking, which is not needed for
# evaluation and would only cost memory and time.
with torch.no_grad():
    # The dataset yields six tensors per sample (five feature groups + target).
    for batch_lyrics, batch_titles, batch_artists, batch_age, batch_tags, batch_views in dataloader_test:
        inputs = torch.cat((batch_lyrics, batch_titles, batch_artists, batch_age, batch_tags), dim=1)

        test_loss1 += loss_fn(model1(inputs), batch_views).item()
        test_loss2 += loss_fn(model2(inputs), batch_views).item()
        test_loss3 += loss_fn(model3(inputs), batch_views).item()

test_loss1 /= size
test_loss2 /= size
test_loss3 /= size

# This is a regression task, so only the loss is reported. An argmax-based
# classification accuracy has no meaning for a single continuous output.
print(
    f"Test Error 1: \n Avg loss (MSE): {test_loss1:>8f} \n"
    f"Test Error 2: \n Avg loss (MSE): {test_loss2:>8f} \n"
    f"Test Error 3: \n Avg loss (MSE): {test_loss3:>8f} \n"
)

Now we train and test our KNN. We will use a GridSearch to find the right hyperparams for this type of algorithm. 

In [None]:
# Create a k-fold cross-validator
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Training split only: features are every column except the 'views' target.
data = df_all.loc[train_idx]
X = data.drop(columns=['views'])
y = data['views']

# Candidate neighbor counts 1..29.
parameters = {"n_neighbors": list(range(1, 30))}

model = neighbors.KNeighborsRegressor()
# Exhaustive search over the grid with 5-fold cross-validation, scored by
# negative MSE. GridSearchCV is the class imported at the top of the notebook;
# `RandomSearchCV` does not exist in scikit-learn.
grid_search = GridSearchCV(model, parameters, scoring="neg_mean_squared_error", cv=kfold, n_jobs=-1, verbose=100)
print("Fitting KNeighborsRegressor...")
grid_search.fit(X, y)
print('\nKNN Regression Best Params: ', grid_search.best_params_)

In [None]:
# Testing KNN
# Take the tuned neighbor count from the search result instead of relying on
# IPython's `_` output history, which breaks under Restart & Run All.
best_k = grid_search.best_params_["n_neighbors"]
model = neighbors.KNeighborsRegressor(n_neighbors=best_k)
model.fit(X, y)

# Held-out split: features are every column except the 'views' target.
data_test = df_all.loc[test_idx]
X_test = data_test.drop(columns=['views'])
y_test = data_test['views']

y_test_hat = model.predict(X_test)

# Regression metrics: classification accuracy is undefined for continuous
# targets, so report the mean squared error and the R^2 score instead.
test_mse = float(np.mean((y_test.to_numpy() - y_test_hat) ** 2))
test_r2 = model.score(X_test, y_test)

print("MSE for our testing dataset with tuning is : {:.4f}".format(test_mse))
print("R^2 for our testing dataset with tuning is : {:.4f}".format(test_r2))