In [22]:
import pandas as pd
import numpy as np 
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [23]:
pd.options.display.max_columns = None
directory=os.getcwd().replace('notebooks','data\\output\\census_elections\\')
filename="census_election_totals_and_percent_data.csv"
full_name=f"{directory}{filename}"
full_df=pd.read_csv(full_name).set_index(['District_Name_x','Electoral_District_Number'])
del full_df['employment_employed']
del full_df['employment_unemployed']
del full_df['employment_not_in_the_labour_force']
del full_df['District_Name_y']
# del full_df['District_Name_x']
del full_df['employment_employment_rate']
del full_df['employment_participation_rate']
del full_df['ALT_GEO_CODE_y']
del full_df['ALT_GEO_CODE_x']
y = full_df['employment_unemployment_rate']
X = full_df.drop('employment_unemployment_rate',1).fillna(0)
categorical_fields=X.select_dtypes(include='object').columns
X=pd.get_dummies(data=X,columns=categorical_fields,drop_first=True)
black_districts_df=X.nlargest(120,'ethnic_percent_black')
black_districts_df=black_districts_df.filter(like='percent')

In [24]:
class PFA(object):
    def __init__(self, n_features,check_optimal_features=False, q=None):
        self.q = q
        self.n_features = n_features
        self.check_optimal_features=check_optimal_features
        self.indices_= None
        self.features_= None

    def fit(self, X):
        if not self.q:
            self.q = X.shape[1]
#             print(self.q)

        sc = StandardScaler()
        X = sc.fit_transform(X)

#         pca = PCA(n_components=self.q).fit(X) # calculation Cov matrix is embeded in PCA
        pca = PCA(n_components=0.95).fit(X)
        A_q = pca.components_.T
        kmeans = KMeans(n_clusters=self.n_features,random_state=12).fit(A_q)
        clusters = kmeans.predict(A_q)
        cluster_centers = kmeans.cluster_centers_

        dists = defaultdict(list)
        for i, c in enumerate(clusters):
            dist = euclidean_distances([A_q[i, :]], [cluster_centers[c, :]])[0][0]
            dists[c].append((i, dist))
        self.indices_ = [sorted(f, key=lambda x: x[1])[0][0] for f in dists.values()]
        self.features_ = X[:, self.indices_]
        

# Usage
pfa = PFA(n_features=5)
pfa.fit(black_districts_df)
# To get the transformed matrix
x = pfa.features_
# To get the column indices of the kept features
column_indices = pfa.indices_
column_indices

[116, 137, 3, 136, 23]

In [25]:
column_names=list(black_districts_df.columns[column_indices])
column_indices

[116, 137, 3, 136, 23]

In [26]:
scaled_df=pd.DataFrame(x,columns=column_names,index=black_districts_df.index)
kmeans = KMeans(n_clusters=5,random_state=12).fit(scaled_df)
black_districts_df2=black_districts_df[column_names]
black_districts_df2['clusters']=kmeans.predict(scaled_df)
black_districts_df2.groupby('clusters').describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0_level_0,industry_percent_professional_scientific_and_technical_services,industry_percent_professional_scientific_and_technical_services,industry_percent_professional_scientific_and_technical_services,industry_percent_professional_scientific_and_technical_services,industry_percent_professional_scientific_and_technical_services,industry_percent_professional_scientific_and_technical_services,industry_percent_professional_scientific_and_technical_services,industry_percent_professional_scientific_and_technical_services,immigrant_category_percent_immigrants,immigrant_category_percent_immigrants,immigrant_category_percent_immigrants,immigrant_category_percent_immigrants,immigrant_category_percent_immigrants,immigrant_category_percent_immigrants,immigrant_category_percent_immigrants,immigrant_category_percent_immigrants,employment_type_percent_3_health_occupations,employment_type_percent_3_health_occupations,employment_type_percent_3_health_occupations,employment_type_percent_3_health_occupations,employment_type_percent_3_health_occupations,employment_type_percent_3_health_occupations,employment_type_percent_3_health_occupations,employment_type_percent_3_health_occupations,immigrant_category_percent_non-immigrants,immigrant_category_percent_non-immigrants,immigrant_category_percent_non-immigrants,immigrant_category_percent_non-immigrants,immigrant_category_percent_non-immigrants,immigrant_category_percent_non-immigrants,immigrant_category_percent_non-immigrants,immigrant_category_percent_non-immigrants,education_level_percent_apprenticeship_or_trades_certificate_or_diploma,education_level_percent_apprenticeship_or_trades_certificate_or_diploma,education_level_percent_apprenticeship_or_trades_certificate_or_diploma,education_level_percent_apprenticeship_or_trades_certificate_or_diploma,education_level_percent_apprenticeship_or_trades_certificate_or_diploma,education_level_percent_apprenticeship_or_trades_certificate_or_diploma,education_level_percent_apprenticeship_or_trades_certificate_or_diploma,education_level_percent_apprenticeship_or_trades_certificate_or_diploma
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
clusters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2
0,30.0,0.134263,0.030145,0.094181,0.109249,0.128327,0.149072,0.226073,30.0,0.330344,0.062361,0.203772,0.281756,0.344448,0.376593,0.424046,30.0,0.061197,0.00887,0.04769,0.053362,0.060831,0.064536,0.082767,30.0,0.639176,0.065557,0.528417,0.585773,0.629342,0.686589,0.770903,30.0,0.067547,0.032903,0.024492,0.046217,0.059963,0.072557,0.151812
1,29.0,0.067518,0.018454,0.036334,0.05101,0.066054,0.081335,0.108428,29.0,0.231266,0.081681,0.039886,0.188659,0.22514,0.278896,0.392637,29.0,0.06213,0.008619,0.035322,0.057227,0.06317,0.067638,0.076194,29.0,0.756629,0.084504,0.58936,0.713973,0.758658,0.786149,0.957791,29.0,0.164855,0.048684,0.07113,0.126698,0.162215,0.196868,0.246516
2,30.0,0.083065,0.024591,0.045703,0.064403,0.078755,0.100287,0.149431,30.0,0.533994,0.076098,0.38884,0.494341,0.538318,0.580094,0.666735,30.0,0.056626,0.008561,0.036914,0.05152,0.055379,0.062888,0.074308,30.0,0.442482,0.081576,0.30043,0.393161,0.439212,0.493498,0.606755,30.0,0.091838,0.045693,0.037671,0.071176,0.081899,0.101603,0.277412
3,18.0,0.078274,0.023647,0.044646,0.059764,0.076086,0.097287,0.122224,18.0,0.259849,0.089116,0.114724,0.194424,0.260254,0.319677,0.468175,18.0,0.084655,0.008515,0.069838,0.079251,0.083029,0.090637,0.098154,18.0,0.714713,0.092667,0.493763,0.656647,0.709564,0.797424,0.866846,18.0,0.096664,0.025104,0.061961,0.075255,0.092432,0.116048,0.13979
4,13.0,0.059886,0.011984,0.039379,0.053405,0.05853,0.066694,0.086933,13.0,0.162073,0.107498,0.073441,0.088598,0.099311,0.196765,0.408205,13.0,0.077429,0.012502,0.061109,0.068826,0.075915,0.081697,0.108999,13.0,0.831774,0.109208,0.579192,0.793681,0.885199,0.908304,0.924636,13.0,0.308744,0.049314,0.230637,0.284063,0.293973,0.345491,0.392903
