In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [2]:
# Read the source CSV into a DataFrame and preview its first rows.
csv_path = "pv_open_2020.csv"
sun_df = pd.read_csv(csv_path)
sun_df.head()

Unnamed: 0,sc_gid,capacity_factor,global_horizontal_irradiance,capacity_mw,area_sq_km,latitude,longitude,distance_to_transmission_km
0,0,0.139177,3.354821,591.452887,18.482903,48.994,-122.735,0.673354
1,1,0.137451,3.306946,596.647266,18.645227,49.022,-122.575,6.875389
2,2,0.136,3.293,2.8512,0.0891,49.049,-122.414,7.901461
3,3,0.141827,3.417382,2014.383178,62.949474,48.9,-122.688,2.910129
4,4,0.137829,3.327924,3557.400778,111.168774,48.927,-122.529,102.926423


In [3]:
# Set the coordinates aside before they are dropped from the feature table;
# they are joined back to the cluster labels later for the map-style plot.
coord_cols = ["latitude", "longitude"]
sun_graph = sun_df.loc[:, coord_cols]
sun_graph

Unnamed: 0,latitude,longitude
0,48.994,-122.735
1,49.022,-122.575
2,49.049,-122.414
3,48.900,-122.688
4,48.927,-122.529
...,...,...
55514,25.982,-97.349
55515,25.983,-97.231
55516,25.877,-97.583
55517,25.879,-97.465


In [4]:
# Drop the columns that are not used as clustering features.
# NOTE: this rebinds `sun_df`, so the cell is not idempotent -- re-running it
# without reloading the CSV raises a KeyError for the already-removed columns.
non_feature_cols = ["latitude", "longitude", "sc_gid", "area_sq_km"]
sun_df = sun_df.drop(columns=non_feature_cols)
sun_df

Unnamed: 0,capacity_factor,global_horizontal_irradiance,capacity_mw,distance_to_transmission_km
0,0.139177,3.354821,591.452887,0.673354
1,0.137451,3.306946,596.647266,6.875389
2,0.136000,3.293000,2.851200,7.901461
3,0.141827,3.417382,2014.383178,2.910129
4,0.137829,3.327924,3557.400778,102.926423
...,...,...,...,...
55514,0.206913,5.196184,965.520000,22.586464
55515,0.205765,5.174338,238.679140,9.374740
55516,0.208000,5.208001,25.093151,4.844146
55517,0.207652,5.203368,330.091200,5.204726


In [5]:
# Standardize every remaining column; the result is a NumPy array
# (column labels are not carried over).
scaler = StandardScaler()
sun_df_scaled = scaler.fit_transform(sun_df)
sun_df_scaled

array([[-2.15484061, -2.08699882, -2.2205423 , -1.57339162],
       [-2.22390264, -2.16670573, -2.21620447, -1.43090917],
       [-2.28196766, -2.18992348, -2.71208366, -1.40733671],
       ...,
       [ 0.59941917,  0.9983273 , -2.69350941, -1.47757394],
       [ 0.58550255,  0.99061398, -2.43880548, -1.46929014],
       [ 0.62434414,  1.03458846, -2.0529995 , -1.33171333]])

In [6]:
# Elbow search: fit K-Means for k = 1..10 and record the inertia of each fit.
# The models are fitted on the standardized features (`sun_df_scaled`), not the
# raw frame: K-Means is distance based, so on unscaled data the column with the
# largest numeric range (capacity_mw) would dominate the inertia and the elbow
# would not describe the space that PCA and the final clustering work in.
inertia = []
k = list(range(1, 11))
for i in k:
    # Fixed seed so the curve is reproducible across runs.
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(sun_df_scaled)
    inertia.append(km.inertia_)

In [7]:
# Tabulate inertia against the candidate cluster counts and plot the elbow curve.
# `k` and `inertia` come from the K-Means loop in the previous cell.
elbow_data = {"k": k, "inertia": inertia}
elbow_df = pd.DataFrame(elbow_data)
# Title corrected from the typo "L-bow Curve".
elbow_df.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [8]:
# PCA reducer configured to keep three principal components.
pca = PCA(n_components=3)

In [9]:
# Fit the PCA on the standardized feature array and project it onto the fitted components.
sun_pca = pca.fit_transform(sun_df_scaled)

In [10]:
# Wrap the PCA output in a DataFrame with one labelled column per component.
pc_names = ["PC 1", "PC 2", "PC 3"]
sun_pca_df = pd.DataFrame(sun_pca, columns=pc_names)
sun_pca_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,3.618901,1.704801,0.634209
1,3.680423,1.580717,0.744912
2,3.806596,1.905688,1.103321
3,3.277867,0.863052,-0.147916
4,2.674066,-1.574093,0.672937


In [11]:
# Cluster the sites in principal-component space with K-Means (2 clusters).
#
# Only the PC columns are passed to the model. This cell writes a "class"
# column into `sun_pca_df`, so fitting on the whole frame would silently use
# the previous labels as a fourth feature whenever the cell is re-run.
pc_features = sun_pca_df[["PC 1", "PC 2", "PC 3"]]

# Seed pinned (same value as the elbow search) so the labels are reproducible.
model = KMeans(n_clusters=2, random_state=0)

# fit_predict fits the model and returns the training labels (model.labels_)
# in one pass, replacing the separate fit + predict calls.
predictions = model.fit_predict(pc_features)

sun_pca_df["class"] = predictions
sun_pca_df.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
0,3.618901,1.704801,0.634209,0
1,3.680423,1.580717,0.744912,0
2,3.806596,1.905688,1.103321,0
3,3.277867,0.863052,-0.147916,0
4,2.674066,-1.574093,0.672937,0


In [12]:
# Scatter of the first two principal components, one colour per K-Means cluster.
# A title is set so the figure stands on its own when the notebook is skimmed.
sun_pca_df.hvplot.scatter(
    x="PC 1",
    y="PC 2",
    hover_cols=["class"],
    by="class",
    title="K-Means clusters in PCA space (PC 1 vs PC 2)",
)

In [13]:
# Re-attach the site coordinates to the PCA scores and cluster labels.
# The join is index-aligned and keeps every row of `sun_graph` (left join,
# which is also DataFrame.join's default).
sun_class = sun_graph.join(sun_pca_df, how="left")
sun_class

Unnamed: 0,latitude,longitude,PC 1,PC 2,PC 3,class
0,48.994,-122.735,3.618901,1.704801,0.634209,0
1,49.022,-122.575,3.680423,1.580717,0.744912,0
2,49.049,-122.414,3.806596,1.905688,1.103321,0
3,48.900,-122.688,3.277867,0.863052,-0.147916,0
4,48.927,-122.529,2.674066,-1.574093,0.672937,0
...,...,...,...,...,...,...
55514,25.982,-97.349,-0.447798,2.372477,0.409901,1
55515,25.983,-97.231,-0.216796,2.982585,0.605587,0
55516,25.877,-97.583,-0.259264,3.205770,0.641597,1
55517,25.879,-97.465,-0.286748,3.013336,0.476225,1


In [14]:
# Plot every site at its longitude/latitude, coloured by K-Means cluster, to
# show how the clusters are distributed geographically.
# A title is set so the figure stands on its own when the notebook is skimmed.
sun_class.hvplot.scatter(
    x="longitude",
    y="latitude",
    hover_cols=["class"],
    by="class",
    title="K-Means clusters by site location",
)