In [1]:
import torch
import re
import pandas as pd
import numpy as np
import plotly.express as px 
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn



## Define The Dataset Class

In [2]:
class Data(Dataset):
    """In-memory dataset that pairs each feature row with its target row.

    Both inputs are converted to tensors once, at construction time, and
    stored on a single device, so ``__getitem__`` is a plain index lookup.

    Args:
        X: pandas object exposing ``.values``; one row per sample.
        y: pandas object exposing ``.values``; row-aligned with ``X``.
        device: device on which the tensors are stored. ``None`` (the
            default) selects ``"cuda"`` when it is available and falls back
            to ``"cpu"`` otherwise, so the class also works without a GPU.

    Raises:
        ValueError: if ``X`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, X, y, device=None):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
            )
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # dtypes are inherited from the underlying arrays; callers cast as needed.
        self.X = torch.tensor(X.values, device=device)
        self.y = torch.tensor(y.values, device=device)

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        # Indexing the first axis only is equivalent to ``[idx, :]`` for 2-D
        # tensors and additionally supports 1-D targets.
        return self.X[idx], self.y[idx]

## Define The Model

In [3]:
class CNN1D(nn.Module):
    """1-D convolutional regressor producing one scalar per input row.

    The input is a ``(batch, length)`` tensor that is given a single channel
    and passed through seven unpadded convolutions (kernel sizes 3, 2, 2, 2,
    2, 2, 2). Together they shorten the sequence by 8 positions, so a length-9
    input collapses to 64 channels x 1 position, which is exactly the 64
    features ``fc1`` expects. Five Linear+BatchNorm+ReLU blocks and a final
    linear layer then map those features to a single output.
    """

    def get_conv_layer(self, in_channels, out_channels, kernel_size):
        """Build a Conv1d -> BatchNorm1d -> ReLU block."""
        return nn.Sequential(
            nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size),
            nn.BatchNorm1d(out_channels),
            nn.ReLU()
        )

    def get_fc_layer(self, in_channels, out_channels):
        """Build a Linear -> BatchNorm1d -> ReLU block."""
        return nn.Sequential(
            nn.Linear(in_channels, out_channels),
            nn.BatchNorm1d(out_channels),
            nn.ReLU()
        )

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept stable: they determine
        # the state_dict keys and the order in which seeded random
        # initialisation is consumed.
        self.conv1 = self.get_conv_layer(in_channels=1, out_channels=4096, kernel_size=3)
        self.conv2 = self.get_conv_layer(in_channels=4096, out_channels=2048, kernel_size=2)
        self.conv3 = self.get_conv_layer(in_channels=2048, out_channels=1024, kernel_size=2)
        self.conv4 = self.get_conv_layer(in_channels=1024, out_channels=512, kernel_size=2)
        self.conv5 = self.get_conv_layer(in_channels=512, out_channels=256, kernel_size=2)
        self.conv6 = self.get_conv_layer(in_channels=256, out_channels=128, kernel_size=2)
        self.conv7 = self.get_conv_layer(in_channels=128, out_channels=64, kernel_size=2)
        self.fc1 = self.get_fc_layer(in_channels=64, out_channels=1024)
        self.fc2 = self.get_fc_layer(in_channels=1024, out_channels=512)
        self.fc3 = self.get_fc_layer(in_channels=512, out_channels=256)
        self.fc4 = self.get_fc_layer(in_channels=256, out_channels=128)
        self.fc5 = self.get_fc_layer(in_channels=128, out_channels=64)
        self.fc6 = nn.Linear(64, 1)

    def forward(self, x):
        """Run the network on a ``(batch, length)`` float tensor.

        Returns:
            Tensor of shape ``(batch, 1)``.

        Raises:
            RuntimeError: if the input length does not reduce to exactly one
                position after the convolutions (i.e. it is not 9).
        """
        # Conv1d expects (batch, channels, length); add the single channel.
        x = x.unsqueeze(1)
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4,
                     self.conv5, self.conv6, self.conv7):
            x = conv(x)
        # Flatten per sample. Unlike ``view(-1, 64)``, this never moves extra
        # sequence positions into the batch axis: a wrong input length yields a
        # feature-size mismatch in ``fc1`` instead of silently returning more
        # rows than there were samples.
        x = x.flatten(start_dim=1)
        for fc in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6):
            x = fc(x)
        return x

## The Function that Splits Digits

In [4]:
def extract_digits(phone_number):
    """Return every digit character found in ``phone_number`` as an int.

    Non-digit characters (spaces, dashes, letters, ...) are skipped and the
    left-to-right order of the digits is preserved. A string without digits
    yields an empty list.
    """
    return [int(digit) for digit in re.findall(r'\d', phone_number)]

## Read Data and Preprocess

In [5]:
# Load the preprocessed listings and replace the raw price with log(1 + price)
# in the same expression, so re-running this cell always starts from the file.
df = pd.read_csv("/kaggle/input/rond-ir/rond.ir_full_preprocessed.csv").assign(
    price=lambda frame: np.log1p(frame["price"])
)

In [6]:
# Target: the (already log-transformed) price, kept as a one-column DataFrame
# so that it later becomes a 2-D tensor.
y = df[["price"]]
# Feature source: the phone number, forced to text so it can be split into digits.
X = df["phone_number"].astype(str)

In [7]:
# One row per phone number, one column per extracted digit.
X = pd.DataFrame([extract_digits(number) for number in X])

In [8]:
# Label the ten digit positions "0".."9", then discard position "0";
# the nine remaining columns are the model's input features.
X = X.set_axis([f"{position}" for position in range(10)], axis=1).drop(columns="0")

## Split Train and Test Dataset

In [9]:
# Hold out 10% of the rows for testing (fixed seed), and give every resulting
# frame a fresh 0..n-1 index so positional and label indexing agree.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.1, random_state=42)
)

In [10]:
# Wrap both splits as tensor datasets.
train_data, test_data = Data(X_train, y_train), Data(X_test, y_test)
# Training batches are reshuffled every epoch; test batches keep their order.
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=128)
test_loader = DataLoader(dataset=test_data, shuffle=False, batch_size=128)

## Start Training The Model 

In [11]:
# Seed before building the model so weight initialisation and the DataLoader's
# shuffling are reproducible.
torch.manual_seed(42)

# Train on whatever device the training tensors already live on instead of
# hard-coding one; model and batches are then guaranteed to match.
device = train_data.X.device

# Move the model to its device *before* constructing the optimizer, so the
# optimizer is built from the parameters that are actually trained.
model = CNN1D().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0015)

# Explicit training mode: BatchNorm must use batch statistics here, even if a
# previous run of the evaluation cell left a model in eval mode.
model.train()
for epoch in range(30):
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # The dataset stores tensors in their source dtype; cast to float32.
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Reported value is the mean of the per-batch mean losses for this epoch.
    print("Epoch {} loss: {}".format(epoch + 1, running_loss / len(train_loader)))

Epoch 1 loss: 5.960980793252482
Epoch 2 loss: 0.6372964584764014
Epoch 3 loss: 0.5032952586281384
Epoch 4 loss: 0.4289962944356685
Epoch 5 loss: 0.380533770773005
Epoch 6 loss: 0.31886418766163765
Epoch 7 loss: 0.2838455259350809
Epoch 8 loss: 0.25592014694203297
Epoch 9 loss: 0.23629213690440706
Epoch 10 loss: 0.21836443754005516
Epoch 11 loss: 0.20073091955525232
Epoch 12 loss: 0.18874163638192712
Epoch 13 loss: 0.17721679209077612
Epoch 14 loss: 0.1688331425137131
Epoch 15 loss: 0.15813574005473166
Epoch 16 loss: 0.15009797873332145
Epoch 17 loss: 0.14542332555279663
Epoch 18 loss: 0.1378957875020115
Epoch 19 loss: 0.13168722774093666
Epoch 20 loss: 0.12626095941514834
Epoch 21 loss: 0.12077083461884912
Epoch 23 loss: 0.11160917208523403
Epoch 24 loss: 0.10824839741070853
Epoch 25 loss: 0.1029738080483062
Epoch 26 loss: 0.10022827796241704
Epoch 27 loss: 0.096385100041351
Epoch 28 loss: 0.09261734434238351
Epoch 29 loss: 0.08928793354739323
Epoch 30 loss: 0.08592209938438015


## Print Test Loss

In [12]:
# Inference mode: the BatchNorm layers switch to their running statistics.
model.eval()
# Build the evaluation tensors on the model's own device rather than a
# hard-coded one, so this cell works wherever the model was trained.
eval_device = next(model.parameters()).device
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32, device=eval_device)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32, device=eval_device)
    # Single forward pass over the whole test split; the loss is the MSE on
    # the log1p-transformed price, the same criterion used for training.
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)
    print(f"Loss of the Model on Test set is :{test_loss.item()}")

Loss of the Model on Test set is :0.13569098711013794
