In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np

In [None]:
from sklearn.impute import SimpleImputer
def get_data_loader(file, features, features_to_encode):
    df = pd.read_csv(file)
    imputer = SimpleImputer(strategy="median")
    df[["Episode_Length_minutes"]] = imputer.fit_transform(df[["Episode_Length_minutes"]])
    imputer = SimpleImputer(strategy="most_frequent")
    df[["Number_of_Ads"]] = imputer.fit_transform(df[["Number_of_Ads"]])
    def one_hot(df, feature):
        encoded = pd.get_dummies(df[[feature]])
        result = pd.concat([df, encoded], axis=1)
        result = result.drop([feature], axis=1)
        return(result) 
    y = df[["Listening_Time_minutes"]]
    df = df[features]
    for to_encode in features_to_encode:
        df = one_hot(df, to_encode)
    x = df
    x_tensor = torch.from_numpy(x.to_numpy().astype(np.float32))
    y_tensor = torch.from_numpy(y.to_numpy().astype(np.float32))
    dataset = TensorDataset(x_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    return dataloader

features=["Episode_Length_minutes", "Number_of_Ads", "Episode_Sentiment"]
features_to_encode = ["Episode_Sentiment"]
train_dataloader = get_data_loader("train.csv", 
    features=features,
    features_to_encode=features_to_encode )

In [2]:
class PodcastPredictor(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(5, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024,  2048),
            nn.ReLU(),
            nn.Linear(2048, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )
    def forward(self, x):
        return self.net(x)

In [4]:
def rmse_loss(y_hat, y):
    return torch.sqrt(F.mse_loss(y_hat, y))

In [9]:
lr = 1e-4
lam = 0
num_epochs = 5

In [6]:
from torchinfo import summary

In [7]:
model = PodcastPredictor()
print(summary(model))

Layer (type:depth-idx)                   Param #
PodcastPredictor                         --
├─Sequential: 1-1                        --
│    └─Linear: 2-1                       1,536
│    └─ReLU: 2-2                         --
│    └─Linear: 2-3                       131,584
│    └─ReLU: 2-4                         --
│    └─Linear: 2-5                       525,312
│    └─ReLU: 2-6                         --
│    └─Linear: 2-7                       2,099,200
│    └─ReLU: 2-8                         --
│    └─Linear: 2-9                       8,392,704
│    └─ReLU: 2-10                        --
│    └─Linear: 2-11                      8,390,656
│    └─ReLU: 2-12                        --
│    └─Linear: 2-13                      2,098,176
│    └─ReLU: 2-14                        --
│    └─Linear: 2-15                      524,800
│    └─ReLU: 2-16                        --
│    └─Linear: 2-17                      131,328
│    └─ReLU: 2-18                        --
│    └─Linear: 2-19 

In [None]:

optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=lam)


for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    i = 0
    for xb, yb in train_dataloader:
        y_hat = model(xb)
        loss = rmse_loss(y_hat, yb)
        optim.zero_grad()
        loss.backward()
        optim.step()
        total_loss += loss.item() * xb.size(0)
        if i % 1000 == 0:
            print(i)
        i += 1
    print(f"Epoch {epoch+1}, RMSE: {total_loss / len(train_dataloader.dataset):.4f}")


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
Epoch 1, RMSE: 10.9282
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
Epoch 2, RMSE: 10.7488
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
Epoch 3, RMSE: 10.7059
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
Epoch 4, RMSE: 10.6783
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
Epoch 5, RMSE: 10.6660


In [11]:
torch.save(model, "model.pth")

In [3]:
model = torch.load("model1.pth", weights_only=False)

In [7]:
from sklearn.impute import SimpleImputer
def get_test_loader(file, features, features_to_encode):
    df = pd.read_csv(file)
    def one_hot(df, feature):
        encoded = pd.get_dummies(df[[feature]])
        result = pd.concat([df, encoded], axis=1)
        result = result.drop([feature], axis=1)
        return(result) 
    df = df[features]
    imputer = SimpleImputer(strategy="median")
    df[["Episode_Length_minutes"]] = imputer.fit_transform(df[["Episode_Length_minutes"]])
    imputer = SimpleImputer(strategy="most_frequent")
    df[["Number_of_Ads"]] = imputer.fit_transform(df[["Number_of_Ads"]])
    for to_encode in features_to_encode:
        df = one_hot(df, to_encode)
    x = df
    x_tensor = torch.from_numpy(x.to_numpy().astype(np.float32))
    y_tensor = torch.from_numpy(np.zeros(250000).astype(np.float32))
    dataset = TensorDataset(x_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=False)
    return dataloader

features=["Episode_Length_minutes", "Number_of_Ads", "Episode_Sentiment"]
features_to_encode = ["Episode_Sentiment"]
test_dataloader = get_test_loader("test.csv", 
    features=features,
    features_to_encode=features_to_encode )

In [9]:
from csv import writer

id = 750000
rows = []

model.eval()

with torch.no_grad():
    for i, (xb, yb) in enumerate(test_dataloader):
        y_hat = model(xb)
        for pred in y_hat:
            rows.append([id, pred.item()])
            id += 1
        if (i+1) % 25000 == 0:
            print(f"Processed {i+1} batches")

with open("submission.csv", "w", newline='') as f:
    writer_object = writer(f)
    writer_object.writerow(["id", "prediction"])  # Add header
    writer_object.writerows(rows)
    f.close()