In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [3]:
df_train = pd.read_csv('train.csv')

In [4]:
df_train.dropna(inplace=True)

In [5]:
df_train.isnull().sum()

id                      0
Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Occupation              0
Health Score            0
Location                0
Policy Type             0
Previous Claims         0
Vehicle Age             0
Credit Score            0
Insurance Duration      0
Policy Start Date       0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
Premium Amount          0
dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
objs = ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']
df_train[objs] = df_train[objs].apply(label_encoder.fit_transform)


In [7]:
X = df_train.drop(['id', 'Premium Amount'], axis=1)
y = df_train[['Premium Amount']]

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
X_train = torch.from_numpy(X_train.to_numpy()).float()
X_test = torch.from_numpy(X_test.to_numpy()).float()
y_train = torch.from_numpy(y_train.to_numpy()).float()
y_test = torch.from_numpy(y_test.to_numpy()).float()



In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
batch_size = 1024
dataset = TensorDataset(X_train, y_train)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [13]:
class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [58]:
input_size = X_train.shape[1]
model = SimpleNN(input_size).to(device)
criterion = nn.SmoothL1Loss()  # For regression
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [60]:
num_epochs = 1000
for epoch in range(num_epochs):
    for batch_idx, (X_batch, y_batch) in enumerate(data_loader):
        # Forward pass
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print loss every 50 epochs
    if (epoch + 1) % 50 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [50/1000], Loss: 641.3911
Epoch [100/1000], Loss: 636.8344
Epoch [150/1000], Loss: 632.3499
Epoch [200/1000], Loss: 631.5598
Epoch [250/1000], Loss: 631.9533
Epoch [300/1000], Loss: 627.0450
Epoch [350/1000], Loss: 630.9145
Epoch [400/1000], Loss: 628.1098
Epoch [450/1000], Loss: 627.4093
Epoch [500/1000], Loss: 633.1374
Epoch [550/1000], Loss: 631.7408
Epoch [600/1000], Loss: 631.1218
Epoch [650/1000], Loss: 625.0931
Epoch [700/1000], Loss: 628.6318
Epoch [750/1000], Loss: 622.1443
Epoch [800/1000], Loss: 631.8849
Epoch [850/1000], Loss: 615.1452
Epoch [900/1000], Loss: 618.3213
Epoch [950/1000], Loss: 624.0907
Epoch [1000/1000], Loss: 619.5024


In [34]:
df_test = pd.read_csv('test.csv')

In [35]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
objs = ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']
df_test[objs] = df_test[objs].apply(label_encoder.fit_transform)


In [37]:
def calRMSLE(y_test, y_pred):
    from sklearn.metrics import mean_squared_error

    y_test = np.log1p(y_test)
    y_pred = np.clip(y_pred, 0, None)
    y_pred = np.log1p(y_pred)

    mse = mean_squared_error(y_test, y_pred)
    rmsle = np.sqrt(mse)

    return rmsle




In [62]:
# Switch to evaluation mode
model.eval()

# Assume X_test is the data you want to make predictions on
# Move X_test to the same device as the model
X_test = X_test.to(device)

# Use torch.no_grad() to turn off gradients
with torch.no_grad():
    # Pass the test data through the model
    predictions = model(X_test)

# Convert predictions if necessary, e.g., from tensors to NumPy arrays
predictions = predictions.cpu().numpy()  # Move to CPU before converting to NumPy

# If your model is for regression, predictions will be continuous values.
# For classification, you may need to apply a threshold or use the softmax function.


In [64]:
calRMSLE(y_test, predictions)

1.0880315

In [91]:
df_test.isnull().sum()

id                      0
Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Occupation              0
Health Score            0
Location                0
Policy Type             0
Previous Claims         0
Vehicle Age             0
Credit Score            0
Insurance Duration      0
Policy Start Date       0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
dtype: int64

In [65]:
test_df = pd.read_csv('test.csv')

In [66]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
objs = ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']
df_test[objs] = df_test[objs].apply(label_encoder.fit_transform)


In [70]:
# Apply the logic to fill missing values
proc_df = df_test.apply(
    lambda col: col.fillna(col.mean()) if col.dtypes in ['int64', 'float64'] 
    else col.fillna(col.mode().iloc[0]) if col.dtypes == 'object' or col.nunique() < 10
    else col,
    axis=0
)


In [90]:
df_test = df_test.apply(lambda col: col.fillna(col.mean()) if col.dtype != 'object' else col, axis=0)

In [93]:
df1 = df_test.drop('id', axis=1)

In [94]:
# Switch to evaluation mode
model.eval()

# Assume X_test is the data you want to make predictions on
# Move X_test to the same device as the model


df1 = torch.from_numpy(df1.to_numpy()).float()
df1 = df1.to(device)

# Use torch.no_grad() to turn off gradients
with torch.no_grad():
    # Pass the test data through the model
    predictions = model(df1)


predictions = predictions.cpu().numpy()


In [95]:
predictions = predictions.flatten()


In [96]:
final_df = pd.DataFrame({
    'id' : df_test['id'].tolist(),
    'Premium Amount': predictions
})

In [97]:
final_df.isnull().sum()

id                0
Premium Amount    0
dtype: int64

In [99]:
final_df.to_csv('submission1.csv', index=False)