In [29]:
import subprocess
import os

def check_colab():
    """Report whether the notebook runs on Google Colab; install requirements if so.

    When the ``google.colab`` package can be imported, the packages listed in
    ``requirements.txt`` are installed with pip and the outcome is printed.

    Returns:
        bool: True when ``google.colab`` is importable, False otherwise.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to detect Colab
    except ImportError:
        print("Not running on Colab")
        return False

    print("Running on Colab")
    try:
        # check=False: a failing install is reported below instead of raising.
        # (Previously the resulting CalledProcessError was swallowed by a bare
        # ``except`` and misreported as "Not running on Colab".)
        results = subprocess.run(
            ["pip", "install", "-r", "requirements.txt"],
            check=False,
            capture_output=True,
            text=True,
        )
    except OSError as err:
        # pip itself could not be launched (e.g. not on PATH).
        print("Installation failed")
        print(err)
        return True

    if results.returncode == 0:
        print("Installation successful")
        print("You may need to restart the runtime for the changes to take effect")
    else:
        print("Installation failed")
        print(results.stdout)
        print(results.stderr)
    return True


# Detect Colab (installing requirements there); the trailing ';' hides the returned bool.
check_colab();

Not running on Colab


In [57]:
import numpy as np 
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms, utils
import matplotlib.pyplot as plt




# Name of the label column in train.csv (excluded from the feature list).
TARGET = 'MedHouseVal'
# Kaggle competition slug.
COMPETITION = 'playground-series-s3e1'

def train_val_split(input_tensors,targets, val_size=0.1, random_seed=0, true_random=False):
    """Shuffle the samples and split them into train and validation parts.

    Args:
        input_tensors: Feature tensor; dim 0 indexes the samples.
        targets: Target tensor with the same number of samples.
        val_size: Fraction of samples (0..1) placed in the validation part;
            the count is ``floor(val_size * num_samples)``.
        random_seed: Seed of the shuffle. Ignored when ``true_random`` is set.
        true_random: When true, a seed in [0, 1000) is drawn from NumPy's
            global generator instead of using ``random_seed``.

    Returns:
        tuple: ``(train_tensors, val_tensors, train_targets, val_targets)``.

    Raises:
        ValueError: If features and targets disagree on the sample count,
            ``val_size`` lies outside [0, 1], or the resulting parts are
            inconsistent.
    """
    num_samples = input_tensors.shape[0]
    if targets.shape[0] != num_samples:
        raise ValueError("Input tensors and targets don't have the same number of samples")
    if not 0 <= val_size <= 1:
        raise ValueError("val_size must be between 0 and 1")

    split = int(np.floor(val_size * num_samples))
    if true_random:
        random_seed = np.random.randint(0, 1000)

    # A private RandomState yields the same permutation as seeding the global
    # generator with ``random_seed``, but leaves the global NumPy RNG state
    # untouched for the rest of the notebook.
    indices = list(range(num_samples))
    np.random.RandomState(random_seed).shuffle(indices)

    train_indices, val_indices = indices[split:], indices[:split]
    train_tensors = input_tensors[train_indices]
    val_tensors = input_tensors[val_indices]
    train_targets = targets[train_indices]
    val_targets = targets[val_indices]

    print("Train Size:", train_tensors.shape)
    print("Val Size:", val_tensors.shape)

    if train_tensors.shape[0] + val_tensors.shape[0] != input_tensors.shape[0]:
        raise ValueError("Train and val sizes don't add up to input size")

    if train_tensors.shape[0] != train_targets.shape[0]:
        raise ValueError("Train tensors and targets don't match")

    if val_tensors.shape[0] != val_targets.shape[0]:
        raise ValueError("Val tensors and targets don't match")

    return train_tensors, val_tensors, train_targets, val_targets

def load_data(target):
    """Read train.csv and test.csv from the working directory as float32 tensors.

    Every column of train.csv except ``'id'`` and ``target`` is treated as a
    feature; the same columns are read from test.csv.

    Args:
        target: Name of the label column in train.csv.

    Returns:
        tuple: ``(train_tensors, target_tensors, test_tensors)``.
    """
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    print("Training Size:", train_df.shape)
    print("Test Size:", test_df.shape)

    # Honour the ``target`` argument; the previous version silently ignored it
    # and always used the module-level TARGET constant.
    features = [col for col in train_df.columns if col not in ('id', target)]
    print("Num Features:", len(features))

    train_tensors = torch.tensor(train_df[features].values, dtype=torch.float32)
    target_tensors = torch.tensor(train_df[target].values, dtype=torch.float32)
    test_tensors = torch.tensor(test_df[features].values, dtype=torch.float32)
    return train_tensors, target_tensors, test_tensors

def get_device(whacky_mode = False):
    """Pick the torch device to run on.

    CUDA wins whenever it is available. The MPS backend is chosen only when it
    is available and ``whacky_mode`` is truthy; otherwise the CPU is used.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    use_mps = torch.backends.mps.is_available() and whacky_mode
    return torch.device("mps" if use_mps else "cpu")

def torch_standardize(x):
    """Standardize ``x`` along dim 0 to zero mean and unit standard deviation.

    Args:
        x: Tensor whose dim 0 indexes the samples.

    Returns:
        tuple: ``(standardized, mean, std)``. Entries of ``std`` that are zero
        or NaN are returned as 1 so the statistics can be reused safely on
        other tensors.
    """
    mean = torch.mean(x, dim=0)
    std = torch.std(x, dim=0)
    # A constant column has std == 0 and a single-row input has std == NaN;
    # dividing by either fills the column with NaN/inf. ``std > 0`` is False
    # for both, so they fall back to 1 and the column becomes all zeros.
    std = torch.where(std > 0, std, torch.ones_like(std))
    return ((x - mean) / std, mean, std)

def plot_loss(**kwargs):
    """Plot loss values against epochs on a log-scaled y axis.

    Keyword Args:
        loss_vals: Sequence of loss values to plot. When omitted, the
            notebook-level ``loss_vals`` variable is used.
        epochs: Number of trailing epochs to show. The default 0 shows all
            of them (``values[-0:]`` is the whole sequence).
    """
    # Resolve the default lazily: ``kwargs.get('loss_vals', loss_vals)``
    # evaluates the global even when the caller passes values, which raises
    # NameError if no training run has defined it yet.
    if 'loss_vals' in kwargs:
        loss_vals_ = kwargs['loss_vals']
    else:
        loss_vals_ = loss_vals

    # Keep only the requested number of trailing epochs
    total_epochs_shown = kwargs.get('epochs', 0)
    loss_vals_ = loss_vals_[-total_epochs_shown:]

    plt.plot(loss_vals_, label="Loss", color="green", linestyle="dashed", marker="o")
    plt.legend()
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Loss vs Epochs")
    # Losses span several orders of magnitude, so use a log-scaled y axis
    plt.yscale("log")

    # Open a fresh figure so later pyplot calls do not draw onto the loss
    # plot. (An accuracy plot was planned here but is not implemented.)
    plt.figure()

# Preprocess the data
def preprocess(train_tensors, val_tensors, test_tensors, include_original=True):
    """Standardize the three feature tensors with training-split statistics.

    The training tensor is standardized through ``torch_standardize``; the
    mean and std it returns are then applied to the validation and test
    tensors. With ``include_original`` the unscaled features are appended
    after the standardized ones along dim 1.

    Returns:
        tuple: ``(train_tensors, val_tensors, test_tensors)`` after processing.
    """
    raw_splits = (train_tensors, val_tensors, test_tensors)

    scaled_train, train_mean, train_std = torch_standardize(train_tensors)
    scaled_splits = [scaled_train]
    for raw in raw_splits[1:]:
        scaled_splits.append((raw - train_mean) / train_std)

    if include_original:
        scaled_splits = [
            torch.cat((scaled, raw), dim=1)
            for scaled, raw in zip(scaled_splits, raw_splits)
        ]

    return tuple(scaled_splits)

def submit_kaggle(**kwargs):
    """Submit a file to a Kaggle competition by running ``./submit_kaggle.sh``.

    The script is invoked from the current working directory with the CLI
    path, competition, submission file and message as its arguments. Its
    stderr and stdout are printed.

    Keyword Args:
        kaggle_cli: Path to the kaggle executable. Defaults to the historical
            per-user install path when that file exists, otherwise to the
            ``kaggle`` found on PATH.
        competition: Competition slug (default "playground-series-s3e1").
        submission: File to upload (default "submission.csv").
        message: Submission message.

    Returns:
        subprocess.CompletedProcess: Result of the script run (not a string).
    """
    import shutil  # local import: only needed to locate the CLI

    # Keep the original hard-coded location working where it exists, but do
    # not require every machine to share one user's home directory layout.
    legacy_cli = '/Users/dbless/Library/Python/3.11/bin/kaggle'
    if os.path.exists(legacy_cli):
        default_cli = legacy_cli
    else:
        default_cli = shutil.which("kaggle") or legacy_cli

    kaggle_cli = kwargs.get("kaggle_cli", default_cli)
    competition = kwargs.get("competition","playground-series-s3e1")
    submission = kwargs.get("submission","submission.csv")
    message = kwargs.get("message","Statistics may be dull, but it has its moments.")

    result = subprocess.run(['./submit_kaggle.sh',kaggle_cli,competition,submission,message],cwd=os.getcwd(), capture_output=True, text=True)

    # CompletedProcess always has stderr/stdout; print them only when non-empty.
    if result.stderr:
        print(result.stderr)

    if result.returncode == 0:
        print("Submission successful")
    else:
        # Previously a failed submission was not reported at all.
        print(f"Submission failed (exit code {result.returncode})")

    if result.stdout:
        print(result.stdout)

    return result

class HousingDataset(Dataset):
    """Map-style dataset pairing feature rows with optional targets."""

    def __init__(self, x, y=None):
        # Targets are optional; ``None`` marks their absence.
        self.x, self.y = x, y

    def __len__(self):
        # Length follows the feature container.
        return len(self.x)

    def __getitem__(self, idx):
        features = self.x[idx]
        if self.y is None:
            # No targets: return an uninitialized (1, 1) float32 placeholder.
            return features, torch.empty((1, 1), dtype=torch.float32)
        return features, self.y[idx]

def _make_loaders(train_dataset, val_dataset, test_dataset, batch_size):
    """Wrap the three datasets in DataLoaders; only the training loader shuffles."""
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    print("Train dataloader batch size", train_loader.batch_size)
    return train_loader, val_loader, test_loader


def main_load(*args,**kwargs):
    """Build datasets and data loaders for training, validation and test.

    Keyword Args:
        whacky_mode: When truthy, use the pre-engineered pandas objects
            instead of reading the CSV files. They are taken from the
            ``X``, ``y`` and ``X_test`` keyword arguments, falling back to
            the notebook-level variables of the same names.
        target: Label column for the CSV path (default: ``TARGET``).
        val_size: Validation fraction (default 0.2).
        random_seed: Seed of the train/validation split (default 42).
        true_random: When truthy, the split is not tied to ``random_seed``.
        include_original: Forwarded to ``preprocess`` (default False).
        batch_size: Batch size of all three loaders (default 2500).

    Returns:
        dict: Tensors, datasets and loaders. In whacky mode the legacy key
        ``'val_tensors'`` still holds the full target object ``y``; the same
        object is also available under ``'target_tensors'``.
    """
    batch_size = kwargs.get('batch_size', 2500)
    val_size = kwargs.get('val_size', 0.2)
    random_seed = kwargs.get('random_seed', 42)
    true_random = kwargs.get('true_random', False)

    if kwargs.get("whacky_mode",False):
        print("Whacky mode activated")

        # Look the frames up lazily so callers can pass them explicitly and
        # the notebook globals are only touched when nothing was passed.
        features = kwargs['X'] if 'X' in kwargs else X
        labels = kwargs['y'] if 'y' in kwargs else y
        test_features = kwargs['X_test'] if 'X_test' in kwargs else X_test

        full_dataset = TensorDataset(torch.tensor(features.values, dtype=torch.float32), torch.tensor(labels.values, dtype=torch.float32))
        test_dataset = TensorDataset(torch.tensor(test_features.values, dtype=torch.float32))

        # Same sizing rule as before (0.8 / 0.2 by default), now driven by val_size.
        train_size = int((1 - val_size) * len(full_dataset))
        held_out = len(full_dataset) - train_size
        if true_random:
            train_dataset, val_dataset = random_split(full_dataset, [train_size, held_out])
        else:
            # Seeded generator: the split is reproducible across runs, as in
            # the CSV path below.
            split_rng = torch.Generator().manual_seed(random_seed)
            train_dataset, val_dataset = random_split(full_dataset, [train_size, held_out], generator=split_rng)

        train_loader, val_loader, test_loader = _make_loaders(train_dataset, val_dataset, test_dataset, batch_size)

        # Wrap outputs in a dictionary
        return {
            'train_tensors': features,
            # Legacy key: holds the targets, not validation features. Kept so
            # existing callers do not break.
            'val_tensors': labels,
            'target_tensors': labels,
            'test_tensors': test_features,
            'train_dataset': train_dataset,
            'val_dataset': val_dataset,
            'test_dataset': test_dataset,
            "train_loader": train_loader,
            "val_loader": val_loader,
            "test_loader": test_loader
        }

    # CSV path: load, split, standardize
    target = kwargs.get('target', TARGET)
    train_tensors, target_tensors, test_tensors = load_data(target)

    train_tensors, val_tensors, train_targets, val_targets = train_val_split(train_tensors, target_tensors, val_size, random_seed, true_random)
    include_original = kwargs.get('include_original', False)
    train_tensors, val_tensors, test_tensors = preprocess(train_tensors, val_tensors, test_tensors,include_original=include_original)
    train_dataset = TensorDataset(train_tensors, train_targets)
    val_dataset = TensorDataset(val_tensors, val_targets)
    test_dataset = TensorDataset(test_tensors)

    train_loader, val_loader, test_loader = _make_loaders(train_dataset, val_dataset, test_dataset, batch_size)

    # Wrap outputs in a dictionary
    return {
        'train_tensors': train_tensors,
        'val_tensors': val_tensors,
        'train_targets': train_targets,
        'val_targets': val_targets,
        'test_tensors': test_tensors,
        'train_dataset': train_dataset,
        'val_dataset': val_dataset,
        'test_dataset': test_dataset,
        "train_loader": train_loader,
        "val_loader": val_loader,
        "test_loader": test_loader
    }


In [53]:
import pandas as pd
import numpy as np
from umap import UMAP
from sklearn.decomposition import PCA


# Load the raw competition frames.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Polar view of the coordinates: norm of (Latitude, Longitude) and its angle.
for _frame in (train_df, test_df):
    _frame['r'] = np.sqrt(_frame['Latitude']**2 + _frame['Longitude']**2)
    _frame['theta'] = np.arctan2(_frame['Latitude'], _frame['Longitude'])

# Sinusoidal encoding of the coordinates with `emb_size` geometrically spaced
# frequencies m**0 .. m**(emb_size - 1), where m**emb_size == precision.
emb_size = 20
precision = 1e6
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

m = np.exp(np.log(precision) / emb_size)
angle_freq = (m ** np.arange(emb_size)).reshape(1, 1, emb_size)

# Shape (rows, 2, emb_size): each coordinate multiplied by each frequency.
latlon = df[['Latitude', 'Longitude']].values[..., np.newaxis] * angle_freq
# Even frequency slots take the cosine, odd slots the sine.
latlon[..., 0::2] = np.cos(latlon[..., 0::2])
latlon[..., 1::2] = np.sin(latlon[..., 1::2])
latlon = latlon.reshape(-1, 2 * emb_size)

# Only the first two encoded columns are kept as features.
df['exp_latlon1'] = latlon[:, 0]
df['exp_latlon2'] = latlon[:, 1]


def pca(data):
    '''
    Project the coordinates onto their two principal components.

    input: dataframe containing Latitude(x) and Longitude(y)
    output: tuple (pca_x, pca_y) with the first and second component
            of every row
    '''
    # Fit on Latitude AND Longitude. The previous version fitted on
    # ['Latitude', 'Latitude'], so the components never saw Longitude.
    coordinates = data[['Latitude', 'Longitude']].values
    pca_obj = PCA().fit(coordinates)
    # Transform once and slice, instead of transforming twice.
    projected = pca_obj.transform(coordinates)
    return projected[:, 0], projected[:, 1]

# Earlier per-split variant, kept for reference:
# train_df['pca_x'], train_df['pca_y'] = pca(train_df)
# test_df['pca_x'], test_df['pca_y'] = pca(test_df)
# The PCA is fitted on the concatenated train + test frame.
df['pca_x'], df['pca_y'] = pca(df)
def crt_crds(df):
    """Add coordinate features mixed by 15, 30 and 45 degree angles.

    Writes ``rot_15_x``, ``rot_15_y``, ``rot_30_x``, ``rot_30_y`` and
    ``rot_45_x`` into ``df`` in place and returns the same frame.

    NOTE(review): the ``_y`` columns add the sine term, whereas a plane
    rotation would subtract it -- confirm this is intended.
    """
    for degrees in (15, 30, 45):
        cos_a = np.cos(np.radians(degrees))
        sin_a = np.sin(np.radians(degrees))
        df[f'rot_{degrees}_x'] = (cos_a * df['Longitude']) + (sin_a * df['Latitude'])
        # Only an x column is produced for the 45 degree angle.
        if degrees != 45:
            df[f'rot_{degrees}_y'] = (cos_a * df['Latitude']) + (sin_a * df['Longitude'])
    return df

# Earlier per-split variant, kept for reference:
# train_df = crt_crds(train_df)
# test_df = crt_crds(test_df)
# Add the angle-mixed coordinate columns to the combined frame.
df = crt_crds(df)
import reverse_geocoder as rg
from sklearn.preprocessing import LabelEncoder

def geocoder(df):
    """Reverse-geocode the (Latitude, Longitude) pair of every row.

    Returns whatever ``rg.search`` yields for the list of coordinate tuples.
    """
    lat_lon_pairs = [(lat, lon) for lat, lon in zip(df['Latitude'], df['Longitude'])]
    return rg.search(lat_lon_pairs)

# Earlier per-split variant, kept for reference:
# results = geocoder(train_df)
# train_df['place'] = [x['admin2'] for x in results]
# results = geocoder(test_df)
# test_df['place'] = [x['admin2'] for x in results]

# Reverse-geocode the combined frame; each match's 'admin2' field becomes the place name.
results = geocoder(df)
df['place'] = [x['admin2'] for x in results]

# Place names that keep their own category; replace() maps every other name to 'Other'.
places = ['Los Angeles County', 'Orange County', 'Kern County',
          'Alameda County', 'San Francisco County', 'Ventura County',
          'Santa Clara County', 'Fresno County', 'Santa Barbara County',
          'Contra Costa County', 'Yolo County', 'Monterey County',
          'Riverside County', 'Napa County']

def replace(x):
    """Return ``x`` when it is listed in ``places``; otherwise return 'Other'."""
    return x if x in places else 'Other'
    
# Earlier per-split variant, kept for reference:
# train_df['place'] = train_df['place'].apply(lambda x: replace(x))
# test_df['place'] = test_df['place'].apply(lambda x: replace(x))

# Collapse every place that is not listed in `places` into 'Other'.
df['place'] = df['place'].apply(replace)

# le = LabelEncoder()
# train_df['place'] = le.fit_transform(train_df['place'])
# test_df['place'] = le.transform(test_df['place'])

# test_df = pd.get_dummies(test_df)
# train_df = pd.get_dummies(train_df)

# One-hot encode the remaining non-numeric columns. dtype=float keeps the
# frame fully numeric: pandas >= 2.0 emits bool dummies by default, which
# turns `X.values` into an object array that torch.tensor cannot convert.
df = pd.get_dummies(df, dtype=float)
from haversine import haversine

# (latitude, longitude) reference points used for the distance features.
Sac = (38.576931, -121.494949)
SF = (37.780080, -122.420160)
SJ = (37.334789, -121.888138)
LA = (34.052235, -118.243683)
SD = (32.715759, -117.163818)

# One haversine distance column per reference point (unit 'ft').
for _label, _city in (('Sac', Sac), ('SF', SF), ('SJ', SJ), ('LA', LA), ('SD', SD)):
    df['dist_' + _label] = df.apply(
        lambda x, _city=_city: haversine((x['Latitude'], x['Longitude']), _city, unit='ft'),
        axis=1,
    )

# Distance to whichever reference point is closest.
_city_columns = ['dist_Sac', 'dist_SF', 'dist_SJ', 'dist_LA', 'dist_SD']
df['dist_nearest_city'] = df[_city_columns].min(axis=1)
from shapely.geometry import LineString, Point

# Polyline given as (latitude, longitude) vertices -- presumably tracing the
# coastline, judging by the name; confirm against the data source.
_coast_vertices = [
    (32.6644, -117.1613), (33.2064, -117.3831),
    (33.7772, -118.2024), (34.4634, -120.0144),
    (35.4273, -120.8819), (35.9284, -121.4892),
    (36.9827, -122.0289), (37.6114, -122.4916),
    (38.3556, -123.0603), (39.7926, -123.8217),
    (40.7997, -124.1881), (41.7558, -124.1976),
]
coast_points = LineString(_coast_vertices)


def _dist_to_coast(row):
    """Distance from the row's (Latitude, Longitude) point to the polyline."""
    return Point(row['Latitude'], row['Longitude']).distance(coast_points)


df['dist_to_coast'] = df.apply(_dist_to_coast, axis=1)
# combine latitude and longitude
# codes from 
# https://datascience.stackexchange.com/questions/49553/combining-latitude-longitude-position-into-single-feature
from math import radians, cos, sin, asin, sqrt

def single_pt_haversine(lat, lng, degrees=True):
    """
    Great-circle distance in kilometres between a point on Earth and the
    (0, 0) lat-long coordinate ('single-point' Haversine).

    lat, lng are read as decimal degrees when `degrees` is true and as
    radians otherwise.
    """
    earth_radius_km = 6371  # use 3956 to get miles instead

    if degrees:
        lat = radians(lat)
        lng = radians(lng)

    # Haversine term with the second point fixed at (0, 0)
    half_chord_sq = sin(lat/2)**2 + cos(lat) * sin(lng/2)**2
    return 2 * earth_radius_km * asin(sqrt(half_chord_sq))
# add more metric 
# referred to this discussion
# https://www.kaggle.com/competitions/playground-series-s3e1/discussion/376210

def manhattan(lat,lng):
    """L1 norm of the coordinate pair: |lat| + |lng|."""
    abs_lat = np.abs(lat)
    abs_lng = np.abs(lng)
    return abs_lat + abs_lng
def euclidean(lat,lng):
    """L2 norm of the coordinate pair: sqrt(lat**2 + lng**2)."""
    sum_of_squares = lat**2 + lng**2
    return sum_of_squares ** 0.5

def add_combine(df):
    """Attach three single-number summaries of (Latitude, Longitude) to ``df``.

    Adds the columns 'haversine', 'manhattan' and 'euclidean' in place and
    returns the same frame.
    """
    coords = list(zip(df.Latitude, df.Longitude))
    df['haversine'] = [single_pt_haversine(lat, lng) for lat, lng in coords]
    df['manhattan'] = [manhattan(lat, lng) for lat, lng in coords]
    df['euclidean'] = [euclidean(lat, lng) for lat, lng in coords]
    return df

df = add_combine(df)

# Ratio / product features built from the existing columns.
df = df.assign(
    number_houses_per_block=df['Population'] / df['AveOccup'],
    total_income_of_block=df['MedInc'] * df['Population'],
    occupants_to_bedrooms=df['AveOccup'] / df['AveBedrms'],
    total_number_of_rooms=df['AveBedrms'] + df['AveRooms'],
    bedrooms_to_rooms=df['AveBedrms'] / df['AveRooms'],
    occupants_to_rooms=df['AveOccup'] / df['AveRooms'],
)

# The test rows were concatenated last, so they are the trailing block of df.
_n_test = len(test_df)
train_df = df.iloc[:-_n_test, :]
test_df = df.iloc[-_n_test:, :].drop('MedHouseVal', axis=1).reset_index(drop=True)

# Model inputs: features without the id (and, for training, without the label).
X = train_df.drop(['MedHouseVal', 'id'], axis=1)
y = train_df['MedHouseVal']
X_test = test_df.drop('id', axis=1)

In [60]:
# Define model
class HousePricesModel(nn.Module):
    """Fully connected regressor: in_features -> 32 -> 64 -> 32 -> 1.

    ReLU follows every hidden layer; dropout (p=0.2) follows the second and
    third hidden layers.

    Args:
        in_features: Number of input features per sample. Defaults to 50,
            the width that was previously hard-coded.
    """

    def __init__(self, in_features=50):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(in_features, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        """Return one prediction per input row, shape (batch, 1)."""
        return self.head(x)
    
def train(model, train_dl, val_dl, optimizer, loss_func, epochs, device):
    """Train ``model`` and record the validation loss after every epoch.

    Args:
        model: Module to train; it is not moved to ``device`` here.
        train_dl: Iterable of ``(features, targets)`` training batches.
        val_dl: Sized iterable of ``(features, targets)`` validation batches.
        optimizer: Optimizer stepping the model's parameters.
        loss_func: Callable ``loss_func(predictions, targets)``.
        epochs: Number of passes over ``train_dl``.
        device: Device the batches are moved to.

    Returns:
        list: Mean per-batch validation loss of each epoch (NaN for an epoch
        when ``val_dl`` has no batches).
    """
    loss_vals = []
    print("-"*30)
    print(f"Training model: {model.__class__.__name__}")
    print(f"Optimizer: {optimizer.__class__.__name__}")
    print()
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            xb = xb.to(device)
            yb = yb.to(device)
            # Clear gradients before the backward pass so stale gradients on
            # a reused optimizer/model cannot leak into the first step.
            optimizer.zero_grad()
            # Align predictions with the targets' shape. A bare squeeze()
            # turns a one-row batch into a 0-d tensor and relies on silent
            # broadcasting inside the loss.
            y_hat = model(xb).reshape(yb.shape)
            loss = loss_func(y_hat, yb)
            loss.backward()
            optimizer.step()
        model.eval()
        with torch.no_grad():
            tot_loss = 0
            for xb, yb in val_dl:
                xb = xb.to(device)
                yb = yb.to(device)

                y_hat = model(xb).reshape(yb.shape)
                loss = loss_func(y_hat, yb)
                tot_loss += loss.item()
            num_batches = len(val_dl)
            # Avoid ZeroDivisionError on an empty validation loader.
            val_loss = tot_loss / num_batches if num_batches else float('nan')
            print(f"Epoch: {epoch}")
            print(f"Validation loss: {val_loss}")
            print()
            loss_vals.append(val_loss)

    return loss_vals

def train_model(*args, **kwargs):
    """Train a model, filling in defaults for anything not supplied.

    Keyword Args:
        dev / device: Device to train on (default: ``get_device()``). 'dev'
            is the historical key; 'device' is accepted as an alias.
        model: Module to train (default: a new ``HousePricesModel``).
        lr: Learning rate of the default optimizer (default 0.001).
        weight_decay: Weight decay of the default optimizer (default 0).
        optimizer: Optimizer (default: Adam over the model's parameters).
        loss_func: Loss callable (default: mean-reduced ``nn.MSELoss``).
        train_dl: Training loader (default: ``data_dict['train_loader']``).
        val_dl: Validation loader (default: ``data_dict['val_loader']``).
        epochs: Number of epochs (default 100).

    Returns:
        tuple: ``(loss_vals, model)`` -- the per-epoch validation losses
        returned by ``train`` and the trained model.
    """
    def _pick(key, make_default):
        """Return ``kwargs[key]`` when given, else build the default lazily."""
        return kwargs[key] if key in kwargs else make_default()

    # Defaults are built lazily: ``kwargs.get(key, default)`` would construct
    # a throw-away model and touch the global ``data_dict`` even when the
    # caller supplies every argument.
    if 'dev' in kwargs:
        device = kwargs['dev']
    elif 'device' in kwargs:
        device = kwargs['device']
    else:
        device = get_device()

    model = _pick('model', HousePricesModel).to(device)
    lr = kwargs.get('lr', 0.001)
    weight_decay = kwargs.get('weight_decay', 0)
    optimizer = _pick('optimizer', lambda: optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay))
    loss_func = _pick('loss_func', lambda: nn.MSELoss(reduction='mean'))
    train_dl = _pick('train_dl', lambda: data_dict['train_loader'])
    val_dl = _pick('val_dl', lambda: data_dict['val_loader'])
    epochs = kwargs.get('epochs', 100)

    # Train model
    input_model_dict = dict(model=model, optimizer=optimizer, loss_func=loss_func, train_dl=train_dl, val_dl=val_dl, epochs=epochs, device=device)

    return train(**input_model_dict),model

# Get model predictions
def make_predictions(model,**kwargs):
    """Run ``model`` over the test loader and return the predictions.

    Keyword Args:
        device: Device the batches are moved to (default: ``get_device()``).
        test_dl: Loader whose batches carry the features as their first
            element (default: ``data_dict['test_loader']``).
        correct_num_of_preds: Expected number of predictions (default: the
            number of rows of ``data_dict['test_tensors']``).

    Returns:
        numpy.ndarray: The predictions of all batches, concatenated.

    Raises:
        ValueError: If the number of predictions differs from the expected one.
    """
    # Defaults are resolved lazily so explicit arguments work even when the
    # notebook-level ``data_dict`` does not exist.
    device = kwargs['device'] if 'device' in kwargs else get_device()
    test_dl = kwargs['test_dl'] if 'test_dl' in kwargs else data_dict['test_loader']
    model.eval()
    with torch.no_grad():
        preds = []
        for xb in test_dl:
            xb = xb[0]
            xb = xb.to(device)
            y_hat = model(xb)
            y_hat = y_hat.squeeze()
            # A one-row batch squeezes down to a 0-d tensor, which torch.cat
            # refuses to concatenate; restore the batch dimension.
            if y_hat.dim() == 0:
                y_hat = y_hat.reshape(1)
            preds.append(y_hat)
        preds = torch.cat(preds)
        preds = preds.cpu().numpy()

    # Check preds are correct shape
    if 'correct_num_of_preds' in kwargs:
        correct_num_of_preds = kwargs['correct_num_of_preds']
    else:
        correct_num_of_preds = data_dict['test_tensors'].shape[0]
    if preds.shape[0] != correct_num_of_preds:
        raise ValueError(f"Expected {correct_num_of_preds} predictions, but got {preds.shape[0]} predictions")
    return preds

def save_predictions(preds, **kwargs):
    """Write the predictions into a submission CSV.

    The frame is read from sample_submission.csv, the ``target`` column
    (default: ``TARGET``) is set to ``preds`` and the result is written to
    ``filename`` (default 'submission.csv') without the index.
    """
    out_path = kwargs.get('filename', 'submission.csv')
    target_col = kwargs.get('target', TARGET)

    # The sample submission supplies the ids
    submission = pd.read_csv('sample_submission.csv')
    submission[target_col] = preds

    # A valid submission has exactly two columns
    n_cols = submission.shape[1]
    assert n_cols == 2, f"Expected df to have 2 columns, but got {n_cols} columns"
    submission.to_csv(out_path, index=False)
    return

def make_and_save_predictions(model, **kwargs):
    """Predict with ``model`` and save the result; returns the predictions.

    The same keyword arguments are forwarded to both ``make_predictions``
    and ``save_predictions``.
    """
    predictions = make_predictions(model, **kwargs)
    save_predictions(predictions, **kwargs)
    return predictions

def main_submit(model,**kwargs):
    """Predict, save and submit in one call; returns None.

    The keyword arguments are forwarded unchanged to
    ``make_and_save_predictions`` and to ``submit_kaggle``.
    """
    make_and_save_predictions(model, **kwargs)
    submit_kaggle(**kwargs)

# Train model
# whacky_mode=True builds the loaders from the notebook-level X, y and X_test
# frames, so the feature-engineering cell must have been run before this one.
data_dict = main_load(whacky_mode=True)
# No arguments: train_model falls back to its defaults and to data_dict's loaders.
loss_vals,model = train_model()

Whacky mode activated
Train dataloader batch size 2500
------------------------------
Training model: HousePricesModel
Optimizer: Adam

Epoch: 0
Validation loss: 1883747.3333333333

Epoch: 1
Validation loss: 6482593.333333333

Epoch: 2
Validation loss: 566046.4583333334

Epoch: 3
Validation loss: 101689.7421875

Epoch: 4
Validation loss: 116101.6328125

Epoch: 5
Validation loss: 128595.66145833333

Epoch: 6
Validation loss: 149471.54166666666

Epoch: 7
Validation loss: 156472.453125

Epoch: 8
Validation loss: 48035.817708333336

Epoch: 9
Validation loss: 36201.2421875

Epoch: 10
Validation loss: 16737.3330078125

Epoch: 11
Validation loss: 4497.2529296875

Epoch: 12
Validation loss: 1126.2189127604167

Epoch: 13
Validation loss: 2450.98095703125

Epoch: 14
Validation loss: 1032.3584594726562

Epoch: 15
Validation loss: 572.4054768880209

Epoch: 16
Validation loss: 392.62005615234375

Epoch: 17
Validation loss: 251.44475301106772

Epoch: 18
Validation loss: 148.00963846842447

Epoch: 19

In [61]:
# Predict on the test loader, write the submission file and submit it to Kaggle.
main_submit(model)


  0%|          | 0.00/411k [00:00<?, ?B/s]
  2%|▏         | 8.00k/411k [00:00<00:07, 52.9kB/s]
 76%|███████▌  | 312k/411k [00:00<00:00, 1.49MB/s] 
100%|██████████| 411k/411k [00:02<00:00, 194kB/s] 

Submission successful
Successfully submitted to Playground Series - Season 3, Episode 1
