# Data Cleaning Process

In [None]:
import pandas as pd
import numpy as np
import glob as gl
import matplotlib.pyplot as plt

In [None]:
extension = "csv"

# Raw CSV files to combine; each one is sorted on its 'State' column below.
file_names = np.array([
    "air_quality.csv", "car_insurance_fees.csv", "car_registration_fees.csv",
    "cost_of_living.csv", "crime_arsen.csv", "debt.csv", "education_rank.csv",
    "employment.csv", "ethnicity.csv", "homeless.csv", "internet_speed.csv",
    "labor_force.csv", "population.csv", "recreation.csv", "taxes.csv",
    "test_scores.csv",
])

# Every file lives in the same MessyData folder on GitHub.
base_url = 'https://raw.githubusercontent.com/Erivera96/MachineLearning/master/ClassifyingStates/MessyData/'
files = np.array([base_url + name for name in file_names])

# Read each file, order it by state, and relabel its rows with the state name
# so that the column-wise concatenation below lines rows up by state.
df = []
for url in files:
    frame = pd.read_csv(url).sort_values(by=['State'], ascending=True, axis=0)
    label_to_state = dict(zip(frame.index, frame['State']))
    df.append(frame.rename(index=label_to_state))

states = (
    pd.concat(df, axis=1, sort=False)
    .drop("District of Columbia", axis=0)
    .reset_index()
)

# Each file contributes its own 'State' column; keep only the first copy of
# any repeated column name.
states = states.loc[:, ~states.columns.duplicated()]

# Columns left out of the combined table (including the 'index' column that
# reset_index() just created).
unused_columns = ['index', 'avgCost', 'registrationNote', 'American Indian/Alaska Native',
                  'Native Hawaiian/Other Pacific Islander', 'Total', 'rank', 'Percent']
states = states.drop(unused_columns, axis=1)

#states

In [None]:
# Raw CSV header -> column name used in the rest of the notebook.
column_renames = {
    'Pop': 'Population',
    'registrationCost': 'CarRegistrationCost',
    'costIndex': 'CostIndex',
    'costRank': 'CostRank',
    'groceryCost': 'GroceryCost',
    'housingCost': 'HousingCost',
    'utilitiesCost': 'UtilitiesCost',
    'transportationCost': 'TransportationCost',
    'miscCost': 'MiscCost',
    'homicideRate2017': 'HomicideRate',
    'firearmDeathRate': 'FirearmDeathRate',
    'firearmDeaths': 'FirearmDeaths',
    'debt': 'Debt',
    'overallRank': 'EducationRank',
    'higherEducationRank': 'HigherEducationRank',
    'prek12Rank': 'PreK12Rank',
    'employmentRate': 'EmploymentRate',
    'Two Or More Races': 'MultiRace',
    'totalHomeless': 'TotalHomeless',
    'totalHouseholds': 'TotalHouseholds',
    'totalVeterans': 'TotalVeterans',
    'totalYoungAdults': 'TotalYoungAdults',
    'Pop2018': 'Population2018',
    'Pop2010': 'Population2010',
    'growthSince2010': 'GrowthSince2010',
    'density': 'Density',
    'taxRank': 'TaxRank',
    'incomeTax': 'IncomeTax',
    'salesTax': 'SalesTax',
    'propertyTax': 'PropertyTax',
    'averageACTScore': 'AvgACTScore',
    'meetingEnglish': 'MetEnglish',
    'meetingReading': 'MetReading',
    'meetingMath': 'MetMath',
    'meetingScience': 'MetScience',
}

# Left-to-right layout of the final table. reindex() keeps exactly these
# columns, in this order; a name missing from the frame becomes an all-NaN column.
column_order = [
    'State', 'Population', 'CostIndex', 'Debt', 'Growth', 'EmploymentRate',
    'Density', 'CarRegistrationCost', 'CostRank', 'GroceryCost', 'HousingCost',
    'UtilitiesCost', 'TransportationCost', 'MiscCost', 'HomicideRate',
    'FirearmDeathRate', 'FirearmDeaths', 'PerCapita', 'EducationRank',
    'HigherEducationRank', 'PreK12Rank', 'White', 'Black', 'Hispanic', 'Asian',
    'MultiRace', 'TotalHomeless', 'TotalHouseholds', 'TotalVeterans',
    'TotalYoungAdults', 'Population2018', 'Population2010', 'GrowthSince2010',
    'AirQualityIndex', 'OutdoorRecreation', 'Retail', 'IndoorRecreation', 'Food',
    'TaxRank', 'IncomeTax', 'SalesTax', 'PropertyTax', 'AvgACTScore',
    'MetEnglish', 'MetReading', 'MetMath', 'MetScience',
]

states = states.rename(columns=column_renames).reindex(columns=column_order)

# states.to_csv("US_states_data.csv", index=False, encoding='utf-8-sig')
states

In [None]:
# Total number of missing cells: NaN count per column, then summed over columns.
states.isna().sum().sum()

# Data Visualization

In [None]:
# Display the column labels of the cleaned table.
states.columns

In [None]:
import plotly.express as px

# Histogram of the 'Population' column of the cleaned table.
# (The stray `df = px.data.tips()` sample-dataset load was dropped: nothing
# used it and it overwrote the list of raw frames built while loading.)
fig = px.histogram(states, x='Population')
fig.show()

In [None]:
# Population per state; marker colour follows 'Debt' and marker size follows 'Density'.
fig = px.scatter(
    states,
    x="State",
    y="Population",
    color="Debt",
    size='Density',
)
fig.show()

In [None]:
import plotly.graph_objects as go

# One violin per recreation column, each with its box plot and mean line shown.
fig = go.Figure()
for recreation_column in ('IndoorRecreation', 'OutdoorRecreation'):
    violin = go.Violin(
        y=states[recreation_column],
        box_visible=True,
        meanline_visible=True,
    )
    fig.add_trace(violin)

fig.show()

# Data Analysis

In [None]:
# Build the feature matrix for the analysis: work on a deep copy and leave out
# the state label and the rank columns.
excluded_columns = ['State','CostRank','EducationRank','HigherEducationRank','PreK12Rank','TaxRank']
states_copy = states.copy(deep=True).drop(excluded_columns, axis=1)
states_array = states_copy.to_numpy()
states_array.shape

## K-Means Clustering Algorithm

In [None]:
def euclid_dist(p1, p2):
    """Return the Euclidean distance between the points ``p1`` and ``p2``.

    The number of coordinates compared is ``p1.shape[0]``; ``p2`` is indexed
    at those same positions.
    """
    n_coords = p1.shape[0]
    squared_total = 0

    # Accumulate the squared coordinate differences one axis at a time.
    for axis in range(n_coords):
        difference = p1[axis] - p2[axis]
        squared_total += difference ** 2

    return np.sqrt(squared_total)


def assign_to_cluster(P_row, centroids):
    """Return the row index of the centroid nearest to ``P_row``.

    Distances are measured with ``euclid_dist``; when several centroids are
    equally close, ``np.argmin`` reports the first of them.
    """
    distances = []
    for c in range(centroids.shape[0]):
        distances.append(euclid_dist(P_row, centroids[c, :]))
    return np.argmin(distances)


def calc_centroids(P, bins, k, bounds):
    """Recompute the ``k`` centroids from the current cluster assignment.

    P      -- 2-D array of points, one point per row.
    bins   -- cluster index of each row of ``P``.
    k      -- number of clusters; clusters are numbered 0 .. k-1.
    bounds -- per-dimension (low, high) pairs, indexed ``bounds[dim, 0/1]``.

    A cluster with at least one member gets the column-wise mean of its
    members. A cluster with no members is re-drawn uniformly at random inside
    ``bounds``. Returns the centroids stacked into one array, one per row.
    """
    n_points, n_dims = P.shape[0], P.shape[1]
    new_centroids = []

    for cluster in range(k):
        # Rows of P currently assigned to this cluster.
        members = np.array([P[row, :] for row in range(n_points) if bins[row] == cluster])

        if members.size:
            centre = [np.mean(members[:, d]) for d in range(n_dims)]
        else:
            # Empty cluster: pick a fresh random position within the data bounds.
            centre = [bounds[d, 0] + (bounds[d, 1] - bounds[d, 0]) * np.random.rand()
                      for d in range(n_dims)]
        new_centroids.append(np.array(centre))

    return np.array(new_centroids)

# The K-means algorithm function
def kmeans(P,k,eps=1e-4,max_iter=1e4):
    """Cluster the rows of ``P`` into ``k`` groups.

    P        -- 2-D array of points, one point per row.
    k        -- number of clusters.
    eps      -- stop once no centroid coordinate moved by more than this.
    max_iter -- upper limit on the number of iterations (converted with int()).

    Returns ``(centroids, bins)``: the final centroid array and the cluster
    index of every row of ``P``. ``bins`` is ``None`` if no iteration ran.
    """
    n_points, dims = P.shape[0], P.shape[1]

    # Per-dimension (min, max) of the data; random centroids are drawn inside this box.
    bounds = np.array([(np.min(P[:, d]), np.max(P[:, d])) for d in range(dims)])

    # Initial centroids: k points drawn uniformly at random within the bounds.
    centroids = np.array([
        np.array([bounds[d, 0] + (bounds[d, 1] - bounds[d, 0]) * np.random.rand()
                  for d in range(dims)])
        for _ in range(k)
    ])

    bins = None
    centroid_max_diff = np.inf
    iteration_cap = int(max_iter)
    n_iterations = 0

    while n_iterations < iteration_cap and centroid_max_diff > eps:
        # Step 1) assign every point to its nearest centroid
        bins = np.array([assign_to_cluster(P[row, :], centroids) for row in range(n_points)])

        # Step 2) move the centroids and record the largest coordinate shift
        updated_centroids = calc_centroids(P, bins, k, bounds)
        centroid_max_diff = np.max(np.abs(updated_centroids - centroids))
        centroids = updated_centroids

        n_iterations += 1

    return centroids, bins

## Testing My K-Means Implementation

In [None]:
# Cluster the full feature matrix into 3 groups with the kmeans() function from this notebook.
centroids, bins = kmeans(states_array, 3)

In [None]:
import matplotlib.pyplot as plt

def myplot(k, bins, centroids, data, plt_type='hist'):
    """Plot every column of ``data`` in its own subplot, one overlay per cluster.

    k         -- number of clusters to draw (clusters 0 .. k-1).
    bins      -- cluster index of each row of ``data``.
    centroids -- 2-D array of cluster centres, one per row.
    data      -- 2-D array of points, one per row.
    plt_type  -- 'scatter' plots column 0 against each column; any other
                 value draws histograms (the default).

    The centroids are drawn in black on top of each cluster's points.
    """
    # Accept lists as well as arrays so that `bins == cluster` is a row mask.
    bins = np.asarray(bins)
    n_features = data.shape[1]

    # Keep the 7x7 subplot grid; enlarge it only when there are more than 49
    # columns, which the fixed grid could not hold.
    grid = max(7, int(np.ceil(np.sqrt(n_features))))

    plt.figure(figsize=(20,15))
    # Honour the caller's k (it used to be overwritten with a hard-coded 3).
    for cluster in range(0, k):
        points = data[bins == cluster]
        for j in range(0, n_features):
            plt.subplot(grid, grid, j+1)
            if plt_type == 'scatter':
                plt.scatter(points[:,0],points[:,j])
                plt.scatter(centroids[:,0],centroids[:,j],color='black')
            else:
                plt.hist(points[:,j],alpha=0.5,bins=10)
                plt.hist(centroids[:,j],color='black')

## Dimensionality Reduction

In [None]:
import numpy as np
from numpy.linalg import eig

def PCA(A): #Principal Component Analysis
    """Project the rows of ``A`` onto the principal components of its columns.

    A -- 2-D array-like, one observation per row and one feature per column.

    The columns are mean-centred, their covariance matrix is eigen-decomposed,
    and the centred data is expressed in the eigenvector basis. Components are
    ordered by decreasing eigenvalue, so column 0 of the result carries the
    most variance. The eigenvalues are printed in that same order.

    Returns an array with the same shape as ``A``.
    """
    A = np.asarray(A, dtype=float)

    # centre every column on its mean
    Centered_A = A - A.mean(axis=0)

    # covariance between the columns (features); atleast_2d covers the
    # single-feature case, where np.cov returns a 0-d array
    Cov_A = np.atleast_2d(np.cov(Centered_A, rowvar=False))

    # The covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues/eigenvectors, whereas the general eig solver can produce
    # complex output from round-off and gives no ordering.
    eigvals, eigvecs = np.linalg.eigh(Cov_A)

    # eigh sorts ascending; flip so the largest-variance component comes first
    order = np.argsort(eigvals)[::-1]
    eigvals = eigvals[order]
    eigvecs = eigvecs[:, order]
    print('\nValues:\n', eigvals)

    # coordinates of each centred row in the eigenvector basis
    return np.dot(Centered_A, eigvecs)

In [None]:
# Run PCA() on the feature matrix and display the transformed array.
states_reduced = PCA(states_array)
states_reduced

In [None]:
def clean_up(A, threshold=1e-2, max_bad=2):
    """Drop the columns of ``A`` that contain too many near-zero values.

    A         -- 2-D array-like; rows are observations (the states), columns
                 are features.
    threshold -- an entry counts as "bad" when its absolute value is
                 <= ``threshold`` (default 0.01).
    max_bad   -- a column is kept only if it has at most this many bad
                 entries (default 2).

    Bad entries are counted separately for each column. (The count used to
    carry over from one column to the next, so once the running total passed
    the limit every remaining column was discarded.)

    Returns a new array with the kept columns in their original order; if no
    column qualifies the result has shape ``(n_rows, 0)``.
    """
    A = np.asarray(A)

    # number of near-zero entries in each column
    bad_per_column = (np.abs(A) <= threshold).sum(axis=0)

    return A[:, bad_per_column <= max_bad]

In [None]:
# Filter the PCA output through clean_up() and display the array that remains.
states_reduced_cleaned = clean_up(states_reduced)
states_reduced_cleaned

In [None]:
# Print the shape of the cleaned, reduced array, then of the original feature matrix.
for matrix in (states_reduced_cleaned, states_array):
    print(matrix.shape)

## Retesting My K-Means with the Reduced Data

In [None]:
# Re-run the notebook's own kmeans() on the reduced data with K clusters, then
# draw the per-column histograms (myplot's default plot type) of the result.
K = 3
centroids2, bins2 = kmeans(states_reduced_cleaned,K)
myplot(K,bins2, centroids2, states_reduced_cleaned)

## Testing Against scikit-learn's KMeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit scikit-learn's KMeans (K clusters, init='random') on the same reduced data
# and plot its labels and cluster centres with myplot() for comparison.
kmeans_test = KMeans(n_clusters=K,init='random')
python_version = kmeans_test.fit(states_reduced_cleaned)
myplot(K, python_version.labels_, python_version.cluster_centers_,states_reduced_cleaned)