# Preliminary Imports

In [24]:
import pandas as pd
import numpy as np
from random import randrange
from itertools import product

# Checking Out Some Statistics of the Data Set

In [18]:
# Load the childcare-cost data set.
df = pd.read_csv("childcare_costs.csv")
# Write the fraction of missing cells in every row to nancount.csv.
# The mean of the boolean NaN mask along axis=1 is exactly
# (number of NaNs in the row) / (number of columns). Unlike mapping a lambda
# over df.iloc[i], it does not feed index *labels* to a *positional* indexer
# and does not materialise one Series per row.
df.isna().mean(axis=1).to_csv("nancount.csv")

In [20]:
# Total number of missing cells, accumulated column by column, reported as a
# share of all cells in the frame.
count = sum(df[col].isna().sum() for col in df.columns)
print(count / (df.shape[0] * df.shape[1]))
    

0.042035732933950555


Grouping the attributes by datatype, we can count the number of attributes for each datatype.

In [None]:
# Put each column's dtype in a one-column frame, then count how many columns
# share each dtype.
(
    pd.DataFrame(df.dtypes, columns=["Type"])
    .groupby(by=["Type"])["Type"]
    .count()
)

# k-Means Clustering Algorithm

## Preliminaries

To test the algorithm, I use a small soybean data set (`soybean_data.csv`).

In [2]:
# Load the soybean test data; the first CSV column is used as the row index.
# NOTE: this rebinds `df`, replacing the childcare frame loaded earlier.
df = pd.read_csv("soybean_data.csv", index_col=0)

## Adding Clusters

In [3]:
def addClusters(df, k, seed=None):
    """Return a copy of ``df`` with a random ``Cluster`` label on every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to label. It is not modified.
    k : int
        Number of clusters; each label is drawn uniformly from ``range(k)``.
    seed : int, optional
        When given, labels are drawn from a private ``random.Random(seed)``
        so the assignment is reproducible. When ``None`` (the default) the
        module-level ``randrange`` is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df`` plus a
        ``Cluster`` column (an existing ``Cluster`` column is overwritten).

    Raises
    ------
    ValueError
        Propagated from ``randrange`` when ``k < 1`` and ``df`` has rows.
    """
    # Local import: the notebook's import cell only brings in `randrange`.
    from random import Random

    draw = randrange if seed is None else Random(seed).randrange
    labels = [draw(k) for _ in range(len(df))]
    # `assign` copies the frame and keeps index, column order and dtypes. It
    # avoids the dict round-trip, which needed Python 3.9+ for `|` and
    # silently dropped duplicate-named columns.
    return df.assign(Cluster=labels)

In [4]:
# Smoke-test addClusters: label every row with one of 4 random clusters.
df_1 = addClusters(df, 4)
# Optional export for manual inspection (left disabled):
# df_1.to_csv("WithClusters.csv")

## Find Centroids

In [5]:
def centroids(df):
    """Compute the centroid of every cluster.

    Rows are grouped by their ``Cluster`` column and every other column is
    averaged within each group. The returned frame has one row per cluster
    label present in ``df``, indexed by that label.
    """
    per_cluster = df.groupby(["Cluster"])
    cluster_means = per_cluster.mean()
    return cluster_means

In [6]:
# Average every column within each random cluster of df_1, then preview the
# first rows of the resulting centroid table.
df_2 = centroids(df_1)
print(df_2.head())

             Date  Plant-Stand    Precip      Temp      Hail  Crop-Hist  \
Cluster                                                                   
0        2.230769     0.538462  1.461538  0.538462  0.153846   1.230769   
1        3.111111     0.555556  1.555556  0.777778  0.111111   1.555556   
2        2.833333     0.500000  1.500000  1.000000  0.666667   2.166667   
3        2.769231     0.538462  1.461538  0.769231  0.307692   1.923077   

         Area-Damaged  Severity  Seed-TMT  Germination  ...  Int-Discolor  \
Cluster                                                 ...                 
0            1.153846  1.461538  0.538462     1.000000  ...      0.461538   
1            0.777778  1.555556  0.666667     1.222222  ...      0.222222   
2            1.583333  1.250000  0.416667     1.250000  ...      0.500000   
3            1.230769  1.384615  0.384615     1.076923  ...      0.461538   

         Sclerotia  Fruit-Pods  Fruit Spots  Seed  Mold-Growth  Seed-Discolor  \
Clust

## Distance Matrix

In [49]:
def distMatrix(df, cluster_df):
    """Euclidean distance from every row of ``df`` to every row of ``cluster_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data points, one per row; all columns must be numeric.
    cluster_df : pandas.DataFrame
        Centroids, one per row, with the same number of columns as ``df``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), len(cluster_df))`` whose ``[i, j]`` entry
        is the distance between data row ``i`` and centroid ``j``. The result
        is always 2-D, including when ``df`` has no rows.

    Raises
    ------
    ValueError
        If the two frames have a different number of columns, or a column
        cannot be converted to float.
    """
    points = df.to_numpy(dtype=float)
    centers = cluster_df.to_numpy(dtype=float)
    # Check explicitly: broadcasting would otherwise silently accept a
    # one-column centroid frame against a wider data frame.
    if points.shape[1] != centers.shape[1]:
        raise ValueError(
            "df and cluster_df must have the same number of columns "
            f"(got {points.shape[1]} and {centers.shape[1]})"
        )
    # Broadcast to (n_points, n_clusters, n_features) and reduce over the
    # feature axis; this replaces the nested Python-level loops.
    diffs = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    return np.linalg.norm(diffs, axis=2)

In [50]:
# Print the distance from every row of df to every centroid in df_2
# (one output row per data row, one column per cluster).
print(distMatrix(df, df_2))

[[4.01844858 3.14466038 3.86400771 3.64797099]
 [4.97627507 4.01386486 4.35092583 4.23356551]
 [3.83228268 3.09120617 3.94933187 3.53009043]
 [5.10597747 4.24264069 4.78858597 4.574175  ]
 [4.3138692  3.2829526  3.88551441 3.66899693]
 [4.61538462 3.65148372 4.31244968 4.03827838]
 [3.77158482 3.23178657 3.59590817 3.28165062]
 [3.89203404 3.19722102 4.17499168 3.76215977]
 [5.34158632 4.18993503 4.78858597 4.64095481]
 [5.15097509 4.0824829  4.89188671 4.67398693]
 [5.85020103 5.79271573 5.48609353 5.56085218]
 [4.51427715 4.71404521 4.05346217 4.22447084]
 [5.2690904  5.07718207 4.46436508 4.69041576]
 [5.97380871 5.8214164  5.18946583 5.42784842]
 [4.33166407 4.77260702 4.29308229 4.37651073]
 [4.62371044 4.91030662 4.19490432 4.39405192]
 [3.96060482 4.45969605 3.99131    3.95163063]
 [5.17332713 5.06622805 4.37003687 4.63265998]
 [5.81723617 5.72518801 5.37871319 5.49825147]
 [5.57413724 5.44671155 4.64728116 5.01536102]
 [3.11229667 4.2031734  4.17499168 3.81293346]
 [2.7021797  