In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn import preprocessing 
import torch.nn as nn

label_encoder = preprocessing.LabelEncoder()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print("Table names: ", filename)
        fullPath = os.path.join(dirname, filename)
        
        
        if filename == "train.csv":
            trainDS = pd.read_csv(fullPath)
        if filename == "test.csv":
            testDS = pd.read_csv(fullPath)
        
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Table names:  sample_submission.csv
Table names:  train.csv
Table names:  test.csv


In [2]:
## using hardware accelerators
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using GPU: ", torch.cuda.get_device_name(0))
elif 'XLA_USE_BF16' in os.environ:  # This is how TPUs are usually detected in Kaggle
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
    print("Using TPU")
else:
    device = torch.device('cpu')
    print("Using CPU")

Using CPU


In [3]:
## The plan
#Split up engine type into the following columns
#HP, engine size (without the Liter), cylinder number (sometimes says X cylinder, sometimes VX or IX),fuel type

#encode brand, model, fuel_type, ext_col, int_col, accident, clean_title

#label is obviously price
print(trainDS.head(1))

   id brand          model  model_year  milage fuel_type  \
0   0  MINI  Cooper S Base        2007  213000  Gasoline   

                                         engine transmission ext_col int_col  \
0  172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel          A/T  Yellow    Gray   

        accident clean_title  price  
0  None reported         Yes   4200  


In [4]:
#Taking a look at what engines have HP in them or not
'''## Data preprocessing block (from above plan)
#print(trainDS.head(20))
print(trainDS["engine"].value_counts())
print(len(trainDS) - trainDS['engine'].str.contains("HP").sum())
print(trainDS['engine'].str.contains("HP").sum())

unique_non_occurrences = trainDS[~trainDS['engine'].str.contains('HP')]['engine'].unique()
print(unique_non_occurrences)'''

'## Data preprocessing block (from above plan)\n#print(trainDS.head(20))\nprint(trainDS["engine"].value_counts())\nprint(len(trainDS) - trainDS[\'engine\'].str.contains("HP").sum())\nprint(trainDS[\'engine\'].str.contains("HP").sum())\n\nunique_non_occurrences = trainDS[~trainDS[\'engine\'].str.contains(\'HP\')][\'engine\'].unique()\nprint(unique_non_occurrences)'

In [5]:
def preprocess(df):
    # Ensure the input is a DataFrame
    if not isinstance(df, pd.DataFrame):
        print("Variable is not a dataframe.")
        raise SystemExit("Stopping execution due to incorrect input.")

    # Check if 'engine' column exists
    columnSet = ["engine"]
    if set(columnSet).issubset(df.columns):
        print("'engine' column found, processing...")
        
        # Split the 'engine' column
        try:
            columnAdditions = df["engine"].str.split('HP', expand=True)
            intermVar = columnAdditions[1].str.split("L", expand=True)
            columnAdditions = pd.concat([columnAdditions, intermVar], axis=1)

            # Cleanup column names
            columnAdditions.columns = [0, 1, 2, 3]  # Give temporary names
            columnAdditions = columnAdditions.rename(columns={0: "HP", 2: "engine_size"})
            columnAdditions = columnAdditions.drop([1, 3], axis=1)  # Drop unnecessary columns

            # Convert to numeric values
            columnAdditions["HP"] = pd.to_numeric(columnAdditions["HP"], errors='coerce')
            columnAdditions["engine_size"] = pd.to_numeric(columnAdditions["engine_size"], errors='coerce')

            # Merge back with the original DataFrame
            newdf = pd.concat([df, columnAdditions], axis=1)
            newdf = newdf.drop("engine", axis=1)  # Drop the original 'engine' column

            print("Processing complete. Returning new DataFrame.")
            return newdf

        except KeyError as e:
            print(f"KeyError: {e} - One of the expected columns is missing.")
        except Exception as e:
            print(f"An error occurred: {e}")

    else:
        print("'engine' column is not in the DataFrame (it might have already been processed).")
        return df  # Return the original DataFrame if no processing was done


In [6]:
def encodeData(df, columnSet):
    #make sure ds is dataframe
    if isinstance(df, pd.DataFrame):
        pass
    else:
        print("Variable is not dataframe")
        #raise SystemExit("Stop right there!")
       
    #make sure column names are all valid 
    if set(columnSet).issubset(df.columns):
        pass
    else:
        print("Col list invalid")
        #raise SystemExit("Stop right there!")
    
    #encode all specified columns
    for column in columnSet:
        df[column] = label_encoder.fit_transform(df[column])
    
    return df

In [7]:
#load and process data
batch_size = 32

class carDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
        
        #call engine preprocess method to preprocess the engines (split into HP and engine size)
        self.df = preprocess(self.df)
        self.df = encodeData(self.df, ["model", "brand", "fuel_type", "transmission", "ext_col", "int_col", "accident", "clean_title"])
        
        self.df = self.df.replace(np.nan,0)
        
        self.labels = self.df[["price"]]   #y is just the price (this for )
        self.features = self.df.drop(["price"], axis = 1)  #x is dataframe with price column dropped
        
        self.features = (self.features - self.features.mean())/self.features.std() 
            
    def __getitem__(self, idx):
        # Convert the features and labels to PyTorch tensors
        features_tensor = torch.tensor(self.features.iloc[idx], dtype=torch.float32).to(device)
        labels_tensor = torch.tensor(self.labels.iloc[idx], dtype=torch.float32).to(device)
        
        return features_tensor, labels_tensor
 
    
    def __len__(self):
        return len(self.features)
    
carDataset = carDataset(trainDS)
trainer = DataLoader(carDataset, batch_size = batch_size, shuffle = True)

'engine' column found, processing...
Processing complete. Returning new DataFrame.


In [8]:
'''#initial random trees 

import ydf
learner = ydf.GradientBoostedTreesLearner(
    num_trees=15,
    label="price",
    min_examples = 3,
    random_seed = 1,
)

trainDS = encodeData(preprocess(trainDS), ["model", "brand", "fuel_type", "transmission", "ext_col", "int_col", "accident", "clean_title"])
#print(trainDS.head)
model = learner.train(trainDS)
testDS = encodeData(preprocess(testDS), ["model", "brand", "fuel_type", "transmission", "ext_col", "int_col", "accident", "clean_title"])
predict = model.predict(testDS)
model.describe()'''

'#initial random trees \n\nimport ydf\nlearner = ydf.GradientBoostedTreesLearner(\n    num_trees=15,\n    label="price",\n    min_examples = 3,\n    random_seed = 1,\n)\n\ntrainDS = encodeData(preprocess(trainDS), ["model", "brand", "fuel_type", "transmission", "ext_col", "int_col", "accident", "clean_title"])\n#print(trainDS.head)\nmodel = learner.train(trainDS)\ntestDS = encodeData(preprocess(testDS), ["model", "brand", "fuel_type", "transmission", "ext_col", "int_col", "accident", "clean_title"])\npredict = model.predict(testDS)\nmodel.describe()'

In [9]:
seed = 0
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [10]:
class usedCarModel(nn.Module):
    def __init__(self, input_size, batch_size):
        super(usedCarModel, self).__init__()
        self.input_size = input_size
        self.batch_size = batch_size
        
        # Define the layers
        self.fc1 = nn.Linear(input_size, batch_size)  # First hidden layer
        self.fc2 = nn.Linear(batch_size, 100)          # Second hidden layer
        self.fc3 = nn.Linear(100, 1) # Output layer
        
        # Activation function
        self.relu = nn.ReLU()

    def forward(self, x):
        # Forward pass
        x = x.to(device)
        #x = torch.Flatten(input)
        x = self.relu(self.fc1(x))  # First layer + activation
        x = self.relu(self.fc2(x))  # Second layer + activation
        x = self.fc3(x)             # Output layer
        #print("Average X output", sum(x)/len(x))
        return x


#model / hyperparameter definitions
num_epochs = 18
learning_rate = 0.01
input_size = trainDS.shape[1]
model = usedCarModel(input_size, batch_size).to(device)
criterion = nn.MSELoss()
opt = optim.Adam(model.parameters(), lr = learning_rate)

In [11]:
#Reduce LR Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, 'min', factor = 0.7, patience = 2)

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    total_output = []
    
    
    for batch_idx, (inputs, labels) in enumerate(trainer):
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs).to(device)
       
        total_output.append(outputs.cpu().detach().numpy())
        
        loss = criterion(outputs, labels.float())
                         
        # Backward pass and optimization
        opt.zero_grad()
        loss.backward()
        #opt.step()
        scheduler.step(learning_rate)

        # Accumulate loss
        running_loss += loss.item()

        # Calculate precision and recall

    # Calculate average loss and accuracy for the epoch
    epoch_loss = running_loss / len(trainer)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, learning rate {learning_rate}')
    
    mean_epoch_output = sum(outputs.cpu().detach().numpy()) / len(outputs.detach().numpy())
    
total_output = np.concatenate(total_output)

print("Mean epoch output: ",mean_epoch_output)

  features_tensor = torch.tensor(self.features.iloc[idx], dtype=torch.float32).to(device)
  labels_tensor = torch.tensor(self.labels.iloc[idx], dtype=torch.float32).to(device)


Epoch [1/18], Loss: 8137427825.2926, learning rate 0.01
Epoch [2/18], Loss: 8137478846.6857, learning rate 0.01
Epoch [3/18], Loss: 8137602058.5906, learning rate 0.01
Epoch [4/18], Loss: 8137450637.5451, learning rate 0.01
Epoch [5/18], Loss: 8137575809.7271, learning rate 0.01
Epoch [6/18], Loss: 8137668742.1697, learning rate 0.01
Epoch [7/18], Loss: 8137550388.6273, learning rate 0.01
Epoch [8/18], Loss: 8137843853.1107, learning rate 0.01
Epoch [9/18], Loss: 8148037753.9498, learning rate 0.01
Epoch [10/18], Loss: 8137640875.3510, learning rate 0.01
Epoch [11/18], Loss: 8137349224.1684, learning rate 0.01
Epoch [12/18], Loss: 8137528237.9036, learning rate 0.01
Epoch [13/18], Loss: 8137408061.9688, learning rate 0.01
Epoch [14/18], Loss: 8138949988.6056, learning rate 0.01
Epoch [15/18], Loss: 8144464065.1731, learning rate 0.01
Epoch [16/18], Loss: 8137672116.2580, learning rate 0.01
Epoch [17/18], Loss: 8137507881.6022, learning rate 0.01
Epoch [18/18], Loss: 8137394683.0794, le