# Preliminary Imports

In [57]:
import pandas as pd
import numpy as np
from random import randrange
from functools import reduce
from tail_recursive import tail_recursive

# Checking Out Some Statistics of the Data Set

In [18]:
# Load the childcare-cost table (default 0..n-1 row index).
df = pd.read_csv("childcare_costs.csv")
# For every row, write the fraction of its cells that are missing to
# "nancount.csv". The mean of the boolean NaN mask along axis=1 is that
# fraction; computing it on the whole frame avoids two `df.iloc[i]` lookups
# per row and does not rely on index labels doubling as positions.
df.isna().mean(axis=1).to_csv("nancount.csv")

In [20]:
# Total number of missing cells: per-column NaN counts, summed over columns.
count = df.isna().sum().sum()
# Share of missing cells in the whole table (df.size == rows * columns).
print(count / df.size)
    

0.042035732933950555


Grouping the attributes by datatype, we can count the number of attributes for each datatype.

In [None]:
# Put each column's dtype into a one-column frame named "Type", then count how
# many columns share each dtype. The bare expression is the cell's output.
(
    pd.DataFrame({"Type": df.dtypes})
    .groupby(by=["Type"])["Type"]
    .count()
)

# k-Means Clustering Algorithm

## Preliminaries

To test the algorithm, I use a small soybean data set.

In [3]:
# Load the soybean table, using the first CSV column as the row index.
# NOTE: this rebinds `df`, replacing the childcare table loaded earlier in
# the notebook; every cell below operates on the soybean data.
df = pd.read_csv("soybean_data.csv", index_col=0)

## Create Distance Matrix Function

In [8]:
def distMatrix(df, centroids):
    """Return the Euclidean distance from every row of ``df`` to every centroid.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations, one per row.
    centroids : pandas.DataFrame
        Centroids, one per row, with the same columns (in the same order)
        as ``df``; rows are compared positionally, column by column.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), len(centroids))`` where entry
        ``[i, j]`` is the Euclidean distance between row ``i`` of ``df`` and
        row ``j`` of ``centroids``. The shape is 2-D even when either input
        has no rows.
    """
    points, centers = df.to_numpy(), centroids.to_numpy()
    distances = np.empty((points.shape[0], centers.shape[0]), dtype=float)
    # One vectorised pass per centroid: broadcasting subtracts the centroid
    # from every row at once, and norm(axis=1) reduces each row to a scalar.
    # This keeps the Python-level loop at k iterations instead of n * k and
    # avoids materialising an (n, k, d) intermediate.
    for j, center in enumerate(centers):
        distances[:, j] = np.linalg.norm(points - center, axis=1)
    return distances

## Assign New Clusters

In [31]:
def cluster(dm):
    """Build a lookup from a row position to the index of its nearest centroid.

    Parameters
    ----------
    dm : numpy.ndarray
        2-D distance matrix; row ``i`` holds the distances from observation
        ``i`` to each centroid (one column per centroid).

    Returns
    -------
    callable
        ``f(i)`` returns, as a plain ``int``, the column index of the
        smallest entry in row ``i`` of ``dm``. When several columns tie for
        the minimum, the lowest column index wins.
    """
    def f(i):
        # np.argmin reports the first occurrence of the minimum, which is the
        # same tie-breaking as a left-to-right "keep x if x <= y" fold.
        return int(np.argmin(dm[i, :]))
    return f

## Bringing It All Together

In [66]:
def kmeans(df, k, eps, seed=None):
    """Cluster the rows of ``df`` with k-means and return the final centroids.

    Every row starts in a cluster drawn uniformly at random from
    ``range(k)``. Each iteration recomputes the per-cluster column means and
    then reassigns every row to its nearest centroid (distances come from
    ``distMatrix``). Iteration stops as soon as the fraction of rows whose
    label changed in the latest reassignment is below ``eps``.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations, one per row. Columns are averaged and passed to
        ``distMatrix``, so they are expected to be numeric.
    k : int
        Number of clusters to draw initial labels from. Clusters that end up
        with no rows are dropped, so fewer than ``k`` centroids may be
        returned.
    eps : float
        Convergence threshold on the fraction of rows that changed cluster;
        must be positive, otherwise the stopping test could never succeed.
    seed : optional
        When given, initial labels come from a private ``random.Random(seed)``
        so the run is reproducible. When ``None`` (default) the module-level
        ``randrange`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty cluster, indexed by cluster label (index name
        ``"Cluster"``), holding the mean of each column of ``df``.

    Raises
    ------
    ValueError
        If ``df`` has no rows, ``k`` is smaller than 1, or ``eps`` is not
        positive.

    Notes
    -----
    Prints ``"Iteration Number <r>"`` once per iteration.
    """
    n_rows = len(df)
    if n_rows == 0:
        raise ValueError("kmeans needs at least one row to cluster")
    if k < 1:
        raise ValueError("k must be at least 1")
    if eps <= 0:
        raise ValueError("eps must be positive")

    if seed is None:
        draw = randrange
    else:
        from random import Random
        draw = Random(seed).randrange

    labels = np.array([draw(k) for _ in range(n_rows)])
    previous = None
    iteration = 0
    while True:
        print("Iteration Number {}".format(iteration))
        # Group by the label array itself rather than by an added "Cluster"
        # column, so a feature column that happens to be called "Cluster" is
        # not overwritten. The array is matched to rows by position.
        centroids = df.groupby(labels).mean()
        centroids.index.name = "Cluster"

        if previous is not None:
            changed = np.count_nonzero(labels != previous) / n_rows
            if changed < eps:
                return centroids

        dm = distMatrix(df, centroids)
        # argmin works on row *positions* of dm, so it is independent of the
        # labels in df.index. It yields positions into `centroids`; mapping
        # them back through centroids.index keeps label identity stable even
        # when an empty cluster has been dropped from the centroid table.
        nearest = dm.argmin(axis=1)
        previous, labels = labels, centroids.index.to_numpy()[nearest]
        iteration += 1

In [67]:
# Fit four clusters to the soybean rows; stop once fewer than 0.1% of the
# rows change cluster between consecutive iterations. The returned centroid
# table is the cell's output.
kmeans(df, k=4, eps=0.001)

Unnamed: 0_level_0,Date,Plant-Stand,Precip,Temp,Hail,Crop-Hist,Area-Damaged,Severity,Seed-TMT,Germination,...,Int-Discolor,Sclerotia,Fruit-Pods,Fruit Spots,Seed,Mold-Growth,Seed-Discolor,Seed-Size,Shriveling,Roots
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,0.333333,2.0,0.333333,1.0,1.333333,1.0,2.0,0.0,1.666667,...,0.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.333333
1,1.083333,1.0,1.833333,0.375,0.208333,1.75,1.083333,1.541667,0.541667,1.083333,...,0.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.708333
2,4.5,0.0,2.0,1.0,0.1,1.9,0.3,1.3,0.5,1.3,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.7,0.0,0.0,1.6,0.6,1.6,2.5,1.0,0.5,0.9,...,2.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
