In [1]:
import pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.cluster import KMeans

# Plotly Setup
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()

## Loading Dataset

In [2]:
fires = pd.read_csv("./data/worldwide/fire_nrt_V1_26815.csv")
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3218304 entries, 0 to 3218303
Data columns (total 14 columns):
latitude      float64
longitude     float64
bright_ti4    float64
scan          float64
track         float64
acq_date      object
acq_time      int64
satellite     object
instrument    object
confidence    object
version       object
bright_ti5    float64
frp           float64
daynight      object
dtypes: float64(7), int64(1), object(6)
memory usage: 343.8+ MB


### Quick Viz

In [3]:
FEATURE_NAMES = [
    "latitude",
    "longitude"
]

X = fires[FEATURE_NAMES]

In [7]:
X = X[X.shape[0]-50_000:]

In [8]:
X.head()

Unnamed: 0,latitude,longitude
3208304,49.87434,-121.09469
3208305,47.57359,-103.20007
3208306,47.57173,-103.20183
3208307,49.87304,-121.09866
3208308,49.87806,-121.09816


## Evaluating Clustering Methods

In [9]:
lat = list(map(str, X["latitude"]))
lon = list(map(str, X["longitude"]))

In [10]:
def plot(centroids, labels):
    sns.scatterplot(X["latitude"], X["longitude"], hue=labels)
    plt.scatter(centroids[:, 0], centroids[:, 1], marker="o", s=300, c="yellow")

def plot_map(centroids=None):
    mpis = [
        {'lat': lat,
      'lon': lon,
      'marker': {'color': 'rgb(0,116,217)',
       'line': {'color': 'rgb(40,40,40)', 'width': 0.5},
       'size': 5,
       'sizemode': 'diameter'},
      'text': '0.387',
      'type': 'scattergeo'}
    ]
    
    if not(centroids is None):
        mpis.append(
            {'lat': list(map(str, centroids[:, 0])),
          'lon': list(map(str, centroids[:, 1])),
          'marker': {'color': 'rgb(255,255,0)',
           'line': {'color': 'rgb(40,40,40)', 'width': 0.5},
           'size': 15,
           'sizemode': 'diameter'},
          'text': '0.387',
          'type': 'scattergeo'}
        )

    layout = go.Layout(
        title = 'MAP',
        showlegend = True,
        geo = dict(
                scope='world',
                projection=dict(type = 'natural earth'),
                showland = True,
                landcolor = 'rgb(217, 217, 217)',
                subunitwidth=1,
                countrywidth=1,
                subunitcolor="rgb(255, 255, 255)",
                countrycolor="rgb(255, 255, 255)"
            ),)

    fig = go.Figure(layout=layout, data=mpis)
    iplot(fig, validate=False)

In [11]:
plot_map()

### K-Means

In [12]:
k_means = KMeans(n_clusters=10_000, init="k-means++", n_init=5, max_iter=100, n_jobs=-1)

In [13]:
k_means.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=10000, n_init=5, n_jobs=-1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [14]:
plot_map(k_means.cluster_centers_)

# Saving model

In [16]:
with open("../../models/fire_clustering.b", "wb") as f:
    pickle.dump(k_means, f)