## Prepare the libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

## Upload the data

In [4]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


## Load the data

In [5]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,id,name,release_date,artists,total_tracks,t_name0,t_name1,t_name2,t_dur0,t_dur1,...,t_val0,t_val1,t_val2,t_tempo0,t_tempo1,t_tempo2,t_sig0,t_sig1,t_sig2,popularity
0,0,Ya Solo Eres Mi Ex,2021-09-09,"La Adictiva Banda San José de Mesillas,",7,Ya Solo Eres Mi Ex,La Malagueña,México Lindo y Querido,222133,189133.0,...,0.626,0.878,0.833,74.924,126.97,197.836,4.0,3.0,3.0,51
1,1,Nibiru,2019-11-29,"Ozuna,",18,Nibiru,Hasta Que Salga el Sol,Temporal,158826,188480.0,...,0.158,0.598,0.484,173.023,96.042,148.019,4.0,4.0,4.0,67
2,2,DAMN.,2017-04-14,"Kendrick Lamar,",14,BLOOD.,DNA.,YAH.,118066,185946.0,...,0.494,0.422,0.648,156.907,139.913,69.986,4.0,4.0,4.0,81
3,3,Did I Shave My Legs For This?,1996-01-01,"Deana Carter,",11,I've Loved Enough To Know,We Danced Anyway,Count Me In,203693,202533.0,...,0.755,0.651,0.263,120.285,94.054,130.011,4.0,4.0,4.0,53
4,4,glisten,2018-05-04,"Jeremy Zucker,",4,all the kids are depressed,wildfire,glisten (interlude),169973,163000.0,...,0.707,0.426,0.466,103.861,129.17,116.349,4.0,4.0,4.0,70


## Preprocessing the data
We will standardize the data to make all features in same level to improve our neural network.

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

features = ['total_tracks', 't_dur0', 't_dur1', 't_dur2', 't_key0', 't_key1', 't_key2',
            't_mode0', 't_mode1', 't_mode2', 't_acous0', 't_acous1', 't_acous2',
            't_dance0', 't_dance1', 't_dance2', 't_energy0', 't_energy1', 't_energy2',
            't_ins0', 't_ins1', 't_ins2', 't_live0', 't_live1', 't_live2',
            't_speech0', 't_speech1', 't_speech2', 't_val0', 't_val1', 't_val2', 't_tempo0',
            't_tempo1', 't_tempo2', 'popularity']

data = data[features]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = MinMaxScaler(feature_range=(-1, 1))

data = imputer.fit_transform(data)
data = scaler.fit_transform(data)

In [7]:
print(len(features))
print(len(data[0]))
data.shape

35
35


(160000, 35)

## Split data into train set and test set

In [8]:
propential = int(len(data) * 0.8)
train_set = data[:propential]
test_set = data[propential:]
X_train = train_set[:, :-1]
y_train = train_set[:, -1]
X_test = test_set[:, :-1]
y_test = test_set[:, -1]

In [9]:
class ANN_model(nn.Module):
  def __init__(self, output_shape, hidden_units, input_shape=34):
    super().__init__()
    self.linear1 = nn.Linear(input_shape, hidden_units)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(hidden_units, output_shape)
    self.dropout = nn.Dropout(p=0.2)

  def forward(self, x):
    x = self.linear1(x)
    x = self.relu(x)
    x = self.dropout(x)
    x = self.linear2(x)

    return x

In [10]:
batch_size = 1000
epochs = 30
learning_rate = 0.001

torch.manual_seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = ANN_model(1, 34).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
device

'cuda'

## Train Loop

In [12]:
for epoch in range(epochs):
    total_loss = 0
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        X_batch = torch.from_numpy(X_batch).float().to(device)
        y_batch = torch.from_numpy(y_batch).float().to(device)
        y_pred = model(X_batch)
        y_pred = torch.squeeze(y_pred)
        loss = criterion(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if i % 8000 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Batch {i//batch_size}/{len(X_train)//batch_size}, Loss: {loss.item():.4f}')

    average_loss = total_loss / (len(X_train) / batch_size)
    print(f'Epoch {epoch+1}/{epochs}, Average Loss: {average_loss:.4f}')


Epoch 1/30, Batch 0/128, Loss: 0.2430
Epoch 1/30, Batch 8/128, Loss: 0.2376
Epoch 1/30, Batch 16/128, Loss: 0.2137
Epoch 1/30, Batch 24/128, Loss: 0.2169
Epoch 1/30, Batch 32/128, Loss: 0.1886
Epoch 1/30, Batch 40/128, Loss: 0.1869
Epoch 1/30, Batch 48/128, Loss: 0.2012
Epoch 1/30, Batch 56/128, Loss: 0.1882
Epoch 1/30, Batch 64/128, Loss: 0.1882
Epoch 1/30, Batch 72/128, Loss: 0.1763
Epoch 1/30, Batch 80/128, Loss: 0.1809
Epoch 1/30, Batch 88/128, Loss: 0.1933
Epoch 1/30, Batch 96/128, Loss: 0.1807
Epoch 1/30, Batch 104/128, Loss: 0.1849
Epoch 1/30, Batch 112/128, Loss: 0.1760
Epoch 1/30, Batch 120/128, Loss: 0.1916
Epoch 1/30, Average Loss: 0.1943
Epoch 2/30, Batch 0/128, Loss: 0.1866
Epoch 2/30, Batch 8/128, Loss: 0.1974
Epoch 2/30, Batch 16/128, Loss: 0.1820
Epoch 2/30, Batch 24/128, Loss: 0.1933
Epoch 2/30, Batch 32/128, Loss: 0.1705
Epoch 2/30, Batch 40/128, Loss: 0.1723
Epoch 2/30, Batch 48/128, Loss: 0.1826
Epoch 2/30, Batch 56/128, Loss: 0.1762
Epoch 2/30, Batch 64/128, Loss: 

## Evaluate the model

In [13]:
model.eval()
total_test_loss = 0
with torch.inference_mode():
    for i in range(0, len(X_test), batch_size):
        X_test_batch = X_test[i:i+batch_size]
        y_test_batch = y_test[i:i+batch_size]
        X_test_batch = torch.from_numpy(X_test_batch).float().to(device)
        y_test_batch = torch.from_numpy(y_test_batch).float().to(device)
        y_test_pred = model(X_test_batch)
        y_test_pred = torch.squeeze(y_test_pred)
        test_loss = criterion(y_test_pred, y_test_batch)
        total_test_loss += test_loss.item()

average_test_loss = total_test_loss / (len(X_test) / batch_size)
print(f'Test Average Loss: {average_test_loss:.4f}')

Test Average Loss: 0.1482


## Make an one-time prediction

In [14]:
def make_prediction(model: torch.nn.Module, data: np.ndarray):

    device = torch.device('cpu')
    model.to(device)
    model.eval()

    with torch.inference_mode():
        data_tensor = torch.from_numpy(data).float().to(device)
        prediction = model(data_tensor)
        prediction = torch.squeeze(prediction).item()
    return prediction

In [38]:
import random
random.seed(70)
test_data = X_test[random.randint(0, len(X_test)-1)]
test_label = y_test[random.randint(0, len(y_test)-1)]
print(f'Test data: {test_data}, Test label: {test_label}')

Test data: [-1.         -0.97111494 -0.94059864 -0.93159311 -1.         -0.05337634
 -0.04003724  1.          0.26327802  0.2537688  -0.77710843 -0.46438082
 -0.44719595  0.68016194  0.22702953  0.24159615  0.358       0.25744354
  0.23285038 -0.99426    -0.73076417 -0.73389246 -0.87859296 -0.60528969
 -0.6178564  -0.75206612 -0.75665616 -0.75311165 -0.03       -0.01380054
 -0.02989168 -0.21566206 -0.0253843  -0.02078887], Test label: 0.07070707070707094


In [39]:
pred = make_prediction(model, test_data)

print(f'Predicted Label: {pred} Real Value: {test_label}')

Predicted Label: -0.18883253633975983 Real Value: 0.07070707070707094


In [40]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler(feature_range=(-1, 1))

data_min = 0
data_max = 100

scaler.min_ = -1 - data_min * 2 / (data_max - data_min)
scaler.scale_ = 2 / (data_max - data_min)

normalized_data = np.array([[pred], [test_label]])

original_values = scaler.inverse_transform(normalized_data)

print(f"Predicted label (original scale): {original_values[0][0]:.2f}")
print(f"Real value (original scale): {original_values[1][0]:.2f}")

Predicted label (original scale): 40.56
Real value (original scale): 53.54


# Save the model

In [27]:
torch.save(model.state_dict(), 'ANN_model.pth')

# Construct Random Forest Model

In [28]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error



# Create Random Forest Regrassion Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make a pred based on test set.
y_pred = rf.predict(X_test)

# Calculate the Mean Square
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.04270162491050353


## Make a one time prediction

In [41]:
sample_data = test_data.reshape(1, -1)
predicted_value = rf.predict(sample_data)[0]

print(f'Predicted Label: {predicted_value} Real Value: {test_label}')

Predicted Label: -0.050505050505050254 Real Value: 0.07070707070707094


In [42]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler(feature_range=(-1, 1))

data_min = 0
data_max = 100

scaler.min_ = -1 - data_min * 2 / (data_max - data_min)
scaler.scale_ = 2 / (data_max - data_min)

normalized_data = np.array([[predicted_value], [test_label]])

original_values = scaler.inverse_transform(normalized_data)

print(f"Predicted label (original scale): {original_values[0][0]:.2f}")
print(f"Real value (original scale): {original_values[1][0]:.2f}")

Predicted label (original scale): 47.47
Real value (original scale): 53.54
