In [None]:
# only run this on Google Colab
!pip install geopandas
!pip install folium

In [None]:
import pandas as pd
import seaborn as sns
import folium

In [None]:
# NOTE(review): this URL embeds a GitHub access token (`?token=...`). Credentials
# should not be stored in a shared notebook; presumably the token is short-lived
# and the link will stop working — confirm, and load it from an environment
# variable (or drop it if the repository is public).
DATA_URL = 'https://raw.githubusercontent.com/Dynovski/BESTHL-AI/main/train_data.csv?token=GHSAT0AAAAAABTLWN7I6LMAI3BK5ZSQSE7WYSZQ4DA'

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its replacement: malformed rows are dropped and a
# warning is emitted for each of them, instead of raising a parser error.
main_df = pd.read_csv(DATA_URL, on_bad_lines='warn')

# Last expression of the cell -> rendered as a rich HTML table.
main_df.head()

# Basic exploration

In [None]:
# (number of rows, number of columns) of the loaded frame
main_df.shape

In [None]:
# Column labels available in the dataset
main_df.columns

In [None]:
# Per-column dtype and non-null count, plus memory usage of the frame
main_df.info()

In [None]:
# Summary statistics (describe() defaults to the numeric columns when there are any)
main_df.describe()

In [None]:
# How many records fall into each postcode, most frequent first
main_df.loc[:, 'POSTCODE'].value_counts()

In [None]:
# FULLVAL of every record plotted against its row index
sns.scatterplot(x=main_df.index, y="FULLVAL", data=main_df)

In [None]:
# Python type of the TAXCLASS value stored in the row labelled 0
type(main_df.loc[0, 'TAXCLASS'])

In [None]:
# First 19 TAXCLASS values
main_df['TAXCLASS'].iloc[:19]

In [None]:
# TAXCLASS of every record against its row index.
# NOTE(review): catplot treats x as categorical, i.e. one category per row —
# this may be slow and hard to read on a large frame; confirm this is intended.
sns.catplot(x=main_df.index, y="TAXCLASS", data=main_df)

# Map

In [None]:
# The 30 most frequent latitude values
main_df['Latitude'].value_counts().iloc[:30]

In [None]:
# The 30 most frequent longitude values
main_df['Longitude'].value_counts().iloc[:30]

In [None]:
# Number of locations to put on the map; too large a value makes the map slow.
location_number = 1000

# Keep only rows that have BOTH coordinates, and do so before sampling and
# re-indexing: filtering after reset_index would leave gaps in the index and
# make label-based lookups such as `.loc[0]` fail.
located_df = main_df.dropna(subset=['Latitude', 'Longitude'])

# Random sample of at most `location_number` rows. The fixed random_state keeps
# the sample (and every map/cluster derived from it) reproducible across runs.
# Watch out: the index is reset, so it no longer matches main_df's index.
map_df = (
    located_df
    .sample(n=min(location_number, len(located_df)), random_state=42)
    .reset_index(drop=True)
)
print(map_df.shape)

In [None]:
# Inspect the first sampled latitude: its value and its Python type.
# Only the last expression of a cell is displayed, so both are shown as a tuple.
# .iloc[0] is positional, so it works even if label 0 is absent from the index.
first_latitude = map_df['Latitude'].iloc[0]
first_latitude, type(first_latitude)

In [None]:
# Inspect the first sampled longitude: its value and its Python type
# (the type is taken from the Longitude column, not Latitude).
# .iloc[0] is positional, so it works even if label 0 is absent from the index.
first_longitude = map_df['Longitude'].iloc[0]
first_longitude, type(first_longitude)

In [None]:
# Empty base map for the sampled locations; `location` is the [latitude, longitude]
# the view is initially centred on.
map_full = folium.Map(zoom_start=9, location=[40.719232, -74.00809])

In [None]:
# Add one marker per sampled location to the base map.
# Iterating the two coordinate columns directly avoids the per-row Series that
# iterrows() builds and the repeated label lookups through .loc.
for latitude, longitude in zip(map_df['Latitude'], map_df['Longitude']):
    folium.Marker([latitude, longitude]).add_to(map_full)

In [None]:
# Render the map as the cell output
map_full

## Separating clusters

### k-means

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans2, whiten

In [None]:
# (n_points, 2) array of [latitude, longitude] pairs, taken straight from the
# frame instead of being assembled row by row with iterrows().
coordinates_kmeans = map_df[['Latitude', 'Longitude']].to_numpy()

# Sanity checks: container type, number of points, and one sample row.
print(type(coordinates_kmeans))
print(len(coordinates_kmeans))
print(coordinates_kmeans[19])
print(type(coordinates_kmeans[19]))

In [None]:
# Split the locations into 5 groups with k-means.
# whiten() rescales each coordinate column to unit variance, so neither latitude
# nor longitude dominates the distance computation.
# kmeans2 initialises its centroids randomly; the fixed seed makes the
# clustering reproducible across runs.
centroids, kmeans_labels = kmeans2(whiten(coordinates_kmeans), 5, iter=20, seed=42)
x, y = centroids, kmeans_labels  # original names, kept in case other cells use them

# Longitude on the horizontal axis and latitude on the vertical one, so the
# scatter has the same orientation as a map. Points are coloured by cluster label.
fig, ax = plt.subplots()
ax.scatter(coordinates_kmeans[:, 1], coordinates_kmeans[:, 0], c=kmeans_labels)
ax.set(xlabel='Longitude', ylabel='Latitude', title='k-means clusters (k=5)')
plt.show()

### DBSCAN

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

In [None]:
# Two-column array of [latitude, longitude] pairs used as DBSCAN input
coordinates_dbscan = map_df.loc[:, ['Latitude', 'Longitude']].to_numpy()

In [None]:
# The haversine metric works on coordinates in radians, so the neighbourhood
# radius is converted from kilometres to radians as well.
kms_per_radian = 6371.0088  # Earth radius in km (1 radian of arc on the surface)
epsilon = 1.7 / kms_per_radian  # 1.7 km search radius expressed in radians
db = DBSCAN(eps=epsilon, min_samples=10, algorithm='ball_tree', metric='haversine').fit(np.radians(coordinates_dbscan))
cluster_labels = db.labels_

# DBSCAN labels noise points with -1. Noise is not a cluster, so it is excluded
# from the count; the real clusters are labelled 0 .. num_clusters - 1, which is
# exactly the range used to collect the points of each cluster below.
num_clusters = len(np.unique(cluster_labels[cluster_labels >= 0]))

# One entry per cluster, each an (n_points, 2) array of [latitude, longitude].
# dtype=object keeps the Series well-defined even when no cluster is found.
clusters = pd.Series(
    [coordinates_dbscan[cluster_labels == n] for n in range(num_clusters)],
    dtype=object,
)
print('Number of clusters: {}'.format(num_clusters))

In [None]:
print(clusters.shape)

# Series.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
clusters.info()

# Peek at one cluster: its container type and number of points.
# Guarded because how many clusters exist depends on the data.
if len(clusters) > 2:
    print(type(clusters[2]))
    print(len(clusters[2]))

In [None]:
# Empty base map for the DBSCAN clusters; `location` is the [latitude, longitude]
# the view is initially centred on.
map_dbscan = folium.Map(zoom_start=9, location=[40.719232, -74.00809])

In [None]:
# One marker colour per cluster, in cluster order.
cluster_colors = ["orange", "blue", "purple", "green", "red", "gray"]

# Iterate over the clusters that actually exist rather than indexing
# clusters[0] .. clusters[5] by hand: the number of clusters depends on the
# data, and a missing index would raise a KeyError. zip() stops at the shorter
# sequence, so at most len(cluster_colors) clusters are drawn.
for cluster_points, color in zip(clusters, cluster_colors):
    for latitude, longitude in cluster_points:
        folium.Marker([latitude, longitude], icon=folium.Icon(color=color)).add_to(map_dbscan)

In [None]:
# Render the map as the cell output
map_dbscan