# Standardise Data

In [2]:
import pandas as pd
import numpy as np

def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Any error raised while reading is printed to stdout and reported to the
    caller as a ``None`` return value rather than as an exception.

    :param file_path: Path to the CSV file
    :return: DataFrame with the file's contents, or None if loading failed
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        # Best-effort load: report the problem and fall through to None.
        print(f"Error loading data: {err}")
        frame = None
    return frame

In [14]:
# Read the feature table from features.csv (path is relative to the working directory).
raw_data = pd.read_csv("features.csv")

In [15]:
# Preview the first rows of the loaded table.
raw_data.head()

Unnamed: 0,zip,stabbr,cty_name,est_per_capita,MEDFAMINC
0,1001,MA,HAMPDEN,0.02923,101722.0
1,1002,MA,HAMPSHIRE,0.021877,138520.0
2,1005,MA,WORCESTER,0.022121,111000.0
3,1007,MA,HAMPSHIRE,0.015292,123932.0
4,1008,MA,HAMPDEN,0.017059,112083.0


In [16]:
# Work on a copy so raw_data keeps the original, unscaled values.
std_data = raw_data.copy()

# Z-score the two numeric feature columns: subtract the column mean and divide
# by the column standard deviation (pandas' default Series.std(), i.e. ddof=1).
for column in ('est_per_capita', 'MEDFAMINC'):
    values = raw_data[column]
    std_data[column] = (values - values.mean()) / values.std()

In [17]:
# Preview the first rows after standardization to check the rescaled columns.
std_data.head()

Unnamed: 0,zip,stabbr,cty_name,est_per_capita,MEDFAMINC
0,1001,MA,HAMPDEN,0.137154,0.318444
1,1002,MA,HAMPSHIRE,-0.095429,1.3408
2,1005,MA,WORCESTER,-0.087712,0.576214
3,1007,MA,HAMPSHIRE,-0.303737,0.935503
4,1008,MA,HAMPDEN,-0.247821,0.606303


In [21]:
# Save the standardized table; index=True also writes the row index as the first CSV column.
std_data.to_csv("features_std.csv", index=True)

# Now let's select k random rows to serve as the initial points for clustering, for each k from 1 to 10:

In [24]:
# Cluster counts to prepare starting points for: 1 through 10.
k_values = list(range(1, 11))
for k in k_values:
    # Draw k rows of the standardized data (seeded with k, so a rerun picks
    # the same rows) and renumber them 0..k-1.
    selected_data = std_data.sample(n=k, random_state=k).reset_index(drop=True)
    # One output file per k, with the fresh 0..k-1 index written as well.
    selected_data.to_csv(f"features_std_k{k}.csv", index=True)

# There are too many data points, so let's create smaller random subsets to run the algorithm on:

In [None]:
# Subset sizes to generate; each one gets its own CSV file.
sizes = [50, 100, 200, 500, 1000]
for size in sizes:
    subset_path = f"features_std_sample_{size}.csv"
    # Seeding with the size itself keeps each subset reproducible across runs.
    sampled_data = std_data.sample(n=size, random_state=size)
    # The original row labels are kept and written out as the index column.
    sampled_data.to_csv(subset_path, index=True)