In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the housing dataset from the CSV file under the project's datasets folder.
df = pd.read_csv(filepath_or_buffer="datasets/housing/housing.csv")

In [3]:
# Preview the first five rows to check that the CSV loaded with the expected columns.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
def shuffle_split_data(data, test_ratio, random_state=None):
    """Randomly partition ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. Any object supporting ``len()`` and ``.iloc`` works.
    test_ratio : float
        Fraction of rows, within ``[0, 1]``, to place in the test set. The
        test size is ``int(len(data) * test_ratio)``, i.e. rounded down.
    random_state : int, optional
        Seed for a dedicated NumPy generator, giving a reproducible split
        without touching the global random state. When ``None`` (default)
        the global ``np.random`` state is used, exactly as before.

    Returns
    -------
    tuple
        ``(train_set, test_set)``, both selected from ``data`` with ``.iloc``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]`` (including NaN).
    """
    # A ratio outside [0, 1] would otherwise produce a silent, meaningless
    # split (e.g. a negative ratio turns into a negative slice bound).
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        # Legacy path: honours a prior np.random.seed(...) call.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)
    print(f"shuffled_indices = {shuffled_indices}")

    test_set_size = int(n_rows * test_ratio)
    print(f"test_set_size = {test_set_size}")

    # The first `test_set_size` shuffled positions form the test set,
    # everything after them forms the training set.
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [5]:
# Seed NumPy's global RNG so the shuffle, and therefore which rows land in the
# test set, is identical on every Restart & Run All.
np.random.seed(42)
train_set, test_set = shuffle_split_data(df, 0.2)

shuffled_indices = [17231 15916    74 ...   157  2781  3175]
test_set_size = 4128


In [6]:
# Number of rows that ended up in the training split.
len(train_set)

16512

In [7]:
# Number of rows held out in the test split (matches the test_set_size
# printed by shuffle_split_data).
len(test_set)

4128

In [8]:
# Quick look at what np.random.permutation(n) returns: the integers 0..n-1 in
# shuffled order.
np.random.permutation(5)

array([1, 2, 4, 0, 3], dtype=int32)

In [9]:
from zlib import crc32

In [10]:
import zlib

# zlib.crc32 works on bytes, hence the b"..." literal.
data = b"This is some data to calculate the CRC-32 checksum."

# Hash the payload and show the resulting integer checksum.
checksum = zlib.crc32(data)
print(f"CRC-32 checksum: {checksum}")

CRC-32 checksum: 524592177


In [11]:
def is_id_in_test_set(identifier, test_ratio):
    """Decide from a hash of ``identifier`` whether a row belongs to the test set.

    The identifier is converted with ``np.int64`` and hashed with CRC-32; the
    row is assigned to the test set when the hash is below
    ``test_ratio * 2**32``. The same identifier therefore always gets the same
    answer for a given ``test_ratio``.

    Parameters
    ----------
    identifier : int-like
        Row identifier; it is passed through ``np.int64`` before hashing.
    test_ratio : float
        Fraction of the 32-bit hash range that maps to the test set.

    Returns
    -------
    bool
        ``True`` if the identifier falls in the test set.
    """
    # Hash once and reuse the value. The per-call debug print was removed: it
    # ran once per row and printed the boolean result, not the CRC.
    id_hash = crc32(np.int64(identifier))
    return id_hash < test_ratio * 2**32

def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into train and test parts using a hash of ``id_column``.

    Each value of ``data[id_column]`` is passed to ``is_id_in_test_set``; rows
    for which it returns ``True`` go to the test part, all others to the
    training part. The ids and the resulting boolean mask are printed.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)`` selected from ``data`` with ``.loc``.
    """
    ids = data[id_column]
    print(f"ids = {ids}")

    def _belongs_to_test(id_):
        # Bind the ratio so the helper can be applied element-wise.
        return is_id_in_test_set(id_, test_ratio)

    in_test_set = ids.apply(_belongs_to_test)
    print(f"in_test_set = {in_test_set}")

    train_rows = data.loc[~in_test_set]
    test_rows = data.loc[in_test_set]
    return train_rows, test_rows

In [12]:
# Keep the old row index as a regular "index" column so it can act as an id.
housing_with_id = df.reset_index(drop=False)

In [13]:
# Hash-based split keyed on the "index" column of housing_with_id.
# NOTE(review): this call is commented out, so train_set and test_set still
# hold the earlier random split produced by shuffle_split_data.
#train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2,"index")

In [14]:
# NOTE(review): the hash-based split in the preceding cell is commented out, so
# this is still the size of the random split made with shuffle_split_data.
len(train_set)

16512

In [15]:
# NOTE(review): same caveat as for train_set, this is the test split from
# shuffle_split_data, not from split_data_with_id_hash.
len(test_set)

4128

In [16]:
# Display the frame; pandas elides the middle rows in the rendered output.
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [17]:
# Build an id from the coordinates: longitude * 1000 + latitude.
# NOTE(review): is_id_in_test_set passes identifiers through np.int64 before
# hashing, so the fractional part of this float id is presumably discarded.
# Confirm that the resulting coarser ids are acceptable.
housing_with_id["id"] = df["longitude"] * 1000 + df["latitude"]
# Hash-based split on the coordinate id (currently disabled):
#train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2 ,"id")

In [18]:
# Toy data for the train_test_split demo: five samples with two features each,
# and one label per sample.
X = np.arange(10).reshape(5, 2)
y = range(5)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Split the toy data; random_state pins the shuffle so the printed result is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Show the inputs next to the four resulting pieces.
print(f"x = {X}")
print(f"y = {y}")
print(f"x_train = {X_train}")
print(f"x_test = {X_test}")
print(f"y_train = {y_train}")
print(f"y_test = {y_test}")

x = [[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
y = range(0, 5)
x_train = [[4 5]
 [0 1]
 [6 7]]
x_test = [[2 3]
 [8 9]]
y_train = [2, 0, 3]
y_test = [1, 4]
