# Imports

In [1]:
import os

import pandas as pd
import numpy as np

In [4]:
from sklearn import model_selection
from sklearn.datasets import fetch_california_housing

In [5]:
SEED = 42

# Paths

Here we set a relative path so that we can easily export this notebook from Kaggle, change the path and run it locally.

In [6]:
RELATIVE_PATH = "../data"

# Parameters

In [16]:
MERGED_DF_PATH = "../data/train_folds.csv"

# Data

The dataset for this competition (both `train` and `test`) was generated from a deep learning model trained on the California Housing Dataset.
Feature distributions are close to, but not exactly the same, as the original.

We will use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.

## Loading the Data

### Data from the Competition

In [8]:
data = pd.read_csv(os.path.join(RELATIVE_PATH, "train.csv"))
data = data.drop(columns="id", inplace=False)

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37137 entries, 0 to 37136
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       37137 non-null  float64
 1   HouseAge     37137 non-null  float64
 2   AveRooms     37137 non-null  float64
 3   AveBedrms    37137 non-null  float64
 4   Population   37137 non-null  float64
 5   AveOccup     37137 non-null  float64
 6   Latitude     37137 non-null  float64
 7   Longitude    37137 non-null  float64
 8   MedHouseVal  37137 non-null  float64
dtypes: float64(9)
memory usage: 2.6 MB


### Original

In [9]:
original_data = fetch_california_housing()
original_data = pd.DataFrame(
    data=np.hstack([original_data["data"], original_data["target"].reshape(-1, 1)]),
    columns=data.columns,
)

original_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


## Merging Data

In [10]:
data.insert(0, "Source", "competition")
original_data.insert(0, "Source", "original")

In [11]:
merged_data = pd.concat([data, original_data]).reset_index(drop=True)
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57777 entries, 0 to 57776
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Source       57777 non-null  object 
 1   MedInc       57777 non-null  float64
 2   HouseAge     57777 non-null  float64
 3   AveRooms     57777 non-null  float64
 4   AveBedrms    57777 non-null  float64
 5   Population   57777 non-null  float64
 6   AveOccup     57777 non-null  float64
 7   Latitude     57777 non-null  float64
 8   Longitude    57777 non-null  float64
 9   MedHouseVal  57777 non-null  float64
dtypes: float64(9), object(1)
memory usage: 4.4+ MB


# Creating Folds

In [12]:
merged_data.insert(1, "KFold", "0")

kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=SEED)

for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X=merged_data)):
    merged_data.loc[valid_indicies, "KFold"] = fold

merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57777 entries, 0 to 57776
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Source       57777 non-null  object 
 1   KFold        57777 non-null  object 
 2   MedInc       57777 non-null  float64
 3   HouseAge     57777 non-null  float64
 4   AveRooms     57777 non-null  float64
 5   AveBedrms    57777 non-null  float64
 6   Population   57777 non-null  float64
 7   AveOccup     57777 non-null  float64
 8   Latitude     57777 non-null  float64
 9   Longitude    57777 non-null  float64
 10  MedHouseVal  57777 non-null  float64
dtypes: float64(9), object(2)
memory usage: 4.8+ MB


# Saving New Dataframe

In [17]:
merged_data.to_csv(MERGED_DF_PATH, index=False)