In [1]:
# change the working directory to point to the root directory
import os

# Guard the relative chdir: without it, re-running this cell in the same
# kernel would climb one more directory level each time and break every
# relative path used further down (e.g. "raw_data/weather_regions/").
if "_NOTEBOOK_START_DIR" not in globals():
    # remember where the kernel started; also serves as the "already done" flag
    _NOTEBOOK_START_DIR = os.getcwd()
    os.chdir("../")

In [2]:
# imports
import numpy as np
import pandas as pd

In [3]:
# imports for plots
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

# make "plotly_white" the default template for figures created after this cell
pio.templates.default = "plotly_white"

In [4]:
# weather file of a single location, used to try the procedure on one region
# before it is applied to all regions
region_csv_path = "raw_data/weather_regions/21.79102828115297;-14.47179828879979;Western Sahara;Western Sahara.csv"
data_region = pd.read_csv(region_csv_path)

In [12]:
# general params
# number of rows built per location: the lat / lon / region / name columns
# are repeated this many times before the cluster columns are joined on.
# NOTE(review): presumably has to match the number of years returned by the
# clustering functions — confirm against py_scripts.clustering
nb_crop_years = 39

In [15]:
from py_scripts.clustering import univariate_clustering, multivariate_clustering

# cluster data (nb_cluster = 3 by default)
# both results are consumed further down as mappings
# {cluster name: years belonging to that cluster}
# univariate: clusters on the "cumulative_PRECTOT" column only
dict_univariate_clustering = univariate_clustering(data_region, col_name="cumulative_PRECTOT")
# multivariate: no column is selected here
# NOTE(review): presumably clusters on several weather columns at once — confirm
dict_multivariate_clustering = multivariate_clustering(data_region)

In [17]:
# gather results in a dataframe
lat, lon, region, name = 21.79102828115297, -14.47179828879979, "Western Sahara", "Western Sahara"

# location columns: the same four values broadcast over nb_crop_years rows
df_location = pd.DataFrame(
    {"lat": lat, "lon": lon, "region": region, "name": name},
    index=range(nb_crop_years),
)

# univariate clustering: flatten {cluster: years} into two parallel columns
# (one row per year, labelled with the cluster that contains it)
dict_year_cluster_uni = {
    "year_PRECTOT": [
        year
        for years in dict_univariate_clustering.values()
        for year in years
    ],
    "cluster_PRECTOT": [
        label
        for label, years in dict_univariate_clustering.items()
        for _ in range(len(years))
    ],
}

# multivariate clustering: same flattening
dict_year_cluster_multi = {
    "year_multi": [
        year
        for years in dict_multivariate_clustering.values()
        for year in years
    ],
    "cluster_multi": [
        label
        for label, years in dict_multivariate_clustering.items()
        for _ in range(len(years))
    ],
}

# attach the multivariate columns first, then the univariate ones.
# NOTE: both joins are positional (on the row index), not on the year value,
# so year_multi and year_PRECTOT of a given row are independent of each other.
for cluster_columns in (dict_year_cluster_multi, dict_year_cluster_uni):
    df_location = df_location.join(pd.DataFrame(cluster_columns))

In [18]:
# display the per-year cluster table assembled for this location
df_location

Unnamed: 0,lat,lon,region,name,year_multi,cluster_multi,year_PRECTOT,cluster_PRECTOT
0,21.791028,-14.471798,Western Sahara,Western Sahara,2011,High,2011,High
1,21.791028,-14.471798,Western Sahara,Western Sahara,2004,Medium,2004,Medium
2,21.791028,-14.471798,Western Sahara,Western Sahara,2016,Medium,2016,Medium
3,21.791028,-14.471798,Western Sahara,Western Sahara,2019,Medium,2019,Medium
4,21.791028,-14.471798,Western Sahara,Western Sahara,1982,Low,1982,Low
5,21.791028,-14.471798,Western Sahara,Western Sahara,1983,Low,1983,Low
6,21.791028,-14.471798,Western Sahara,Western Sahara,1984,Low,1984,Low
7,21.791028,-14.471798,Western Sahara,Western Sahara,1985,Low,1985,Low
8,21.791028,-14.471798,Western Sahara,Western Sahara,1986,Low,1986,Low
9,21.791028,-14.471798,Western Sahara,Western Sahara,1987,Low,1987,Low


In [21]:
# perform this procedure to all regions
from py_scripts.clustering import univariate_clustering, multivariate_clustering
from tqdm import tqdm


dir_name = "raw_data/weather_regions/"


def _flatten_clusters(dict_clustering, year_col, cluster_col):
    """Turn a {cluster_name: years} mapping into a two-column DataFrame.

    Every year of every cluster becomes one row holding the year
    (``year_col``) and the name of the cluster it belongs to
    (``cluster_col``). Rows follow the iteration order of the mapping,
    then the order of the years inside each cluster.
    """
    years, clusters = [], []
    for cluster_name, cluster_years in dict_clustering.items():
        cluster_years = list(cluster_years)
        years += cluster_years
        clusters += [cluster_name] * len(cluster_years)
    return pd.DataFrame({year_col: years, cluster_col: clusters})


# Only the weather CSVs, in a deterministic order: os.listdir returns entries
# in arbitrary order and may contain other files (e.g. hidden system files).
csv_files = sorted(f for f in os.listdir(dir_name) if f.lower().endswith(".csv"))

# One frame per location, concatenated once after the loop. DataFrame.append
# was removed in pandas 2.0 and re-copied the whole table on every iteration.
location_frames = []

for file in tqdm(csv_files):
    # file names follow "<lat>;<lon>;<region>;<name>.csv"; maxsplit keeps a
    # name that itself contains ";" in one piece
    lat, lon, region, name = os.path.splitext(file)[0].split(";", 3)

    # prepare data frame: location columns broadcast over nb_crop_years rows
    df_location = pd.DataFrame(
        {"lat": float(lat), "lon": float(lon), "region": region, "name": name},
        index=range(nb_crop_years),
    )

    # load data region
    data_region = pd.read_csv(os.path.join(dir_name, file))

    # perform cluster data (nb_cluster = 3 by default)
    dict_univariate_clustering = univariate_clustering(data_region, col_name="cumulative_PRECTOT")
    dict_multivariate_clustering = multivariate_clustering(data_region)

    # add cols of both multi and uni to df_location.
    # NOTE(review): the joins are positional (row index), not on the year
    # value, so year_multi and year_PRECTOT of one row can differ — confirm
    # this is intended before comparing cluster_multi with cluster_PRECTOT.
    df_location = df_location.join(
        _flatten_clusters(dict_multivariate_clustering, "year_multi", "cluster_multi")
    )
    df_location = df_location.join(
        _flatten_clusters(dict_univariate_clustering, "year_PRECTOT", "cluster_PRECTOT")
    )

    # keep the frame of the current location for the final stacking
    location_frames.append(df_location)

# stack all locations at once; an empty directory yields an empty frame
# (pd.concat raises on an empty list)
big_data = (
    pd.concat(location_frames, ignore_index=True)
    if location_frames
    else pd.DataFrame()
)

100%|██████████| 705/705 [09:33<00:00,  1.23it/s]


In [23]:
# sanity check of the stacked table: row count, dtypes and non-null counts
big_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27495 entries, 0 to 27494
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   lat              27495 non-null  float64
 1   lon              27495 non-null  float64
 2   region           27495 non-null  object 
 3   name             27495 non-null  object 
 4   year_multi       27495 non-null  int64  
 5   cluster_multi    27495 non-null  object 
 6   year_PRECTOT     27495 non-null  int64  
 7   cluster_PRECTOT  27495 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 1.7+ MB


In [27]:
# locations whose multivariate clustering puts the year 2010 in the "High" cluster
query = big_data["year_multi"].eq(2010) & big_data["cluster_multi"].eq("High")

big_data.loc[query]

Unnamed: 0,lat,lon,region,name,year_multi,cluster_multi,year_PRECTOT,cluster_PRECTOT
10700,27.791028,-9.671798,Guelmim - Es-Semara,Assa-Zag,2010,High,2010,High
10739,27.791028,-9.971798,Guelmim - Es-Semara,Assa-Zag,2010,High,2010,High
10771,28.091028,-10.271798,Guelmim - Es-Semara,Assa-Zag,2010,High,2006,High
11239,28.391028,-10.271798,Guelmim - Es-Semara,Assa-Zag,2010,High,2006,High
11593,28.691028,-10.271798,Guelmim - Es-Semara,Guelmim,2010,High,2010,High
...,...,...,...,...,...,...,...,...
27301,35.291028,-5.771798,Tangier-Tetouan,Dhar Ennhal,2010,High,1991,High
27340,35.291028,-6.071798,Tangier-Tetouan,Ghabat Mnazel Lekrota,2010,High,1991,High
27378,35.591028,-5.471798,Tangier-Tetouan,Bounazzal 1,2010,High,1990,High
27425,35.591028,-5.771798,Tangier-Tetouan,Talae Dahnoun,2010,High,2009,Medium


In [30]:
# save big data (disabled: uncomment the line below to write the summary CSV)
# big_data.to_csv("processed_data/summary_clustering_3.csv", index=False)