In [1]:
import sys
sys.path.append('../src/')


In [2]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
from sklearn.cluster import KMeans
from shapely.ops import cascaded_union
from shapely.geometry import MultiPolygon
from sklearn.linear_model import LinearRegression



In [3]:
%matplotlib inline

# Useful imports
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from constants import *
from utils.publication_functions import beautify_data_column_name
from sklearn.cluster import KMeans
from flows.specific.custom_flow import CustomFlow

# Data sources
from data_sources.specific.ibd import IBD
from data_sources.specific.coca import Coca
from data_sources.specific.gold_export import GoldExport
from data_sources.specific.gold_stock_price import GoldStockPrice
from data_sources.specific.malaria_cases import Malaria
from data_sources.specific.mock_malaria_cases import MockMalaria
from data_sources.specific.temperature_average import TemperatureAverage
from data_sources.specific.temperature_max import TemperatureMax
from data_sources.specific.temperature_min import TemperatureMin
from data_sources.specific.temperature_average import TemperatureAverage
from data_sources.specific.precipitation_average import PrecipitationAverage
from data_sources.specific.precipitation_total import PrecipitationTotal
from data_sources.specific.deforestation_average import DeforestationAverage
from data_sources.specific.deforestation_total import DeforestationTotal
from data_sources.specific.fb_mobility_recent import FBMobilityRecent
from data_sources.specific.fb_mobility_all import FBMobilityAll

# Embedders
from embedders.specific.identity_embedder import IdentityEmbbeder
from embedders.specific.linear_regression_coefficient_embedder import LinearRegressionCoefficientEmbedder
from embedders.specific.mobility_to_distance_embedder import MobilityToDistanceEmbeder
from embedders.specific.mobility_to_similarity_embedder import MobilityToSimilarityEmbeder
from embedders.specific.aggregation_embedder import AggregationEmbedder

# Clusteres
from clusterers.specific.identity_clusterer import IdentityClusterer
from clusterers.specific.quantile_clusterer import QuantileClusterer
from clusterers.specific.sklearn_vector_clusterer import SklearnVectorClusterer
from clusterers.specific.two_tier_dbscan_clusterer import TwoTierDBSCANClusterer
from clusterers.specific.similarity_community_clusterer import SimilarityCommunityClusterer

# Geographies
from geography.specific.colombian_municipalities import ColombianMunicipalities
from geography.specific.choco_municipalities import ChocoMunicipalities
from geography.specific.colombia_grid import ColombianGrid
from geography.specific.colombia_rivers import ColombianRivers
from geography.general.geography_from_flow_output import GeographyFromFlowOutput
from geography.specific.colombia_indg_com import ColombianIndgCom
from geography.specific.colombia_indg_terr import ColombianIndgTerr

# Flows
from flows.specific.custom_flow import  CustomFlow

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [4]:
fuentes_incluidas = [IBD]
# target_col=target_col = 'malaria_num_cases_sum'

In [5]:
flow = CustomFlow(ID = 'basic_analysis',
                    name = "Basic Analysis",
                    time_resolution = MONTH,
                    geography = ColombianMunicipalities(),
                    vector_data_sources=[],
                    matrix_data_sources=[FBMobilityRecent],
                    embedder=IdentityEmbbeder(),
                    clusterer=IdentityClusterer()
                    )

In [6]:
flow.run()

Starts Flow: Basic Analysis
   Loads Initial Geography
   Loads Data
      Loads Vector Data
      Found 0 elements
      Done
      
      Loads Matrix Data
      Found 1 elements
         Extracts 1 of 1: Facebook Mobility All (fb_mobility_all) 


ValueError: No objects to concatenate

In [7]:
flow.df_matrix

Unnamed: 0,date,ID_1,ID_2,fb_mobility_movement
0,2021-06-30,5001,5001,4088661.0
1,2021-06-30,50006,50251,0.0
2,2021-06-30,50006,50270,0.0
3,2021-06-30,50006,50287,0.0
4,2021-06-30,50006,50313,265.0
...,...,...,...,...
15776,2021-06-30,19212,76520,0.0
15777,2021-06-30,19212,76563,0.0
15778,2021-06-30,19256,19001,496.0
15779,2021-06-30,19256,19130,0.0


In [111]:
df_ibd = flow.df_matrix
df_ibd.head()

Unnamed: 0,date,ID_1,ID_2,ibd_ibd
0,2020-01-31,76109,19318,0.04654
1,2020-01-31,76109,27001,0.26423
2,2020-02-29,76109,19318,0.11651
3,2020-02-29,76109,76109,0.99849
4,2020-03-31,76109,19318,0.30528
5,2020-03-31,76109,27001,0.18697
6,2020-03-31,76109,76109,0.2195
7,2020-04-30,76109,19318,0.08554
8,2020-04-30,76109,27001,0.07702
9,2020-04-30,76109,76109,0.08857


In [92]:
df_edges= pd.read_csv("/Users/andreaparra/Dropbox/4_Work/DataLamaCovid/data/covid_fb/data/data_stages/colombia/constructed/geometry/daily_graphs/edges.csv")
df_nodes= pd.read_csv("/Users/andreaparra/Dropbox/4_Work/DataLamaCovid/data/covid_fb/data/data_stages/colombia/constructed/geometry/daily_graphs/nodes.csv")



In [93]:
df_nodes["ID_1"] = df_nodes["node_id"]
df_nodes["ID_2"] = df_nodes["node_id"]
df_nodes.rename(columns={"inner_movement": "fb_mobility", "date_time":"date"}, inplace=True)
df_nodes.drop(columns=["day", "node_id", "num_cases", "population"], inplace=True)
df_nodes.head()


Unnamed: 0,date,fb_mobility,ID_1,ID_2
0,2020-04-02,2258616.0,11001,11001
1,2020-04-02,65808.0,13001,13001
2,2020-04-02,65.0,13006,13006
3,2020-04-02,207.0,13030,13030
4,2020-04-02,295.0,13042,13042


In [94]:
df_edges.rename(columns={"movement": "fb_mobility", 
                        "date_time":"date",
                        "start_id":"ID_1",
                        "end_id": "ID_2"}, inplace=True)
df_edges.drop(columns=["day"], inplace=True)
df_edges.head()

Unnamed: 0,date,ID_1,ID_2,fb_mobility
0,2020-04-02,11001,25099,210
1,2020-04-02,11001,25175,638
2,2020-04-02,11001,25178,10
3,2020-04-02,11001,25181,11
4,2020-04-02,11001,25200,213


In [98]:
df_movement = pd.concat([df_edges, df_nodes])
df_movement["date"] = df_movement["date"].apply(lambda x: pd.Timestamp(x))
df_movement.head()

Unnamed: 0,date,ID_1,ID_2,fb_mobility
0,2020-04-02,11001,25099,210.0
1,2020-04-02,11001,25175,638.0
2,2020-04-02,11001,25178,10.0
3,2020-04-02,11001,25181,11.0
4,2020-04-02,11001,25200,213.0


In [108]:
for d in range(0,15):
    # shift movement
    df_movement_tmp = df_movement.copy()
    df_movement_tmp["date"]  = df_movement["date"] + pd.Timedelta(days=d)
    
    df = df_ibd.merge(df_movement_tmp, on=["date", "ID_1", "ID_2"], how="outer").dropna()

    X = df["ibd_ibd"].to_numpy().reshape(-1, 1)
    y = df["fb_mobility"].to_numpy()

    reg = LinearRegression().fit(X, y)
    m = reg.coef_
    b = reg.intercept_
    print(d, reg.score(X, y), f"y = {m}x + {b}")


0 0.004123724794502626 y = [2053.85784025]x + 36865.16512706096
1 0.03674386210642777 y = [3137.25533185]x + 37989.81601129267
2 0.04495676962595441 y = [3412.57393077]x + 37930.88180036574
3 0.08021005671141235 y = [9692.28346309]x + 32766.931531512586
4 0.035897450685744015 y = [6040.67827172]x + 35489.25075508544
5 0.04629585463093411 y = [7522.98850289]x + 32382.87959205008
6 0.004621088272683482 y = [2301.42803138]x + 35234.18422797056
7 0.02216957110393325 y = [4736.88287595]x + 35871.85148831285
8 0.040332088647528797 y = [4954.98496064]x + 34693.90418431521
9 0.03277966286891254 y = [-7822.69343418]x + 34306.24102938543
10 0.680986562376101 y = [-29004.60144194]x + 43331.93833773883
11 0.011673731831466916 y = [3544.93536951]x + 36063.700887311665
12 0.04925568873706343 y = [7994.51492636]x + 32539.554835897452
13 0.02332394085205347 y = [5037.67511071]x + 35608.609772483906
14 0.1785922138983771 y = [-14125.53288564]x + 39320.85834929376


In [112]:
df_movement_tmp = df_movement.copy()
df_movement_tmp["date"]  = df_movement["date"] + pd.Timedelta(days=14)

df = df_ibd.merge(df_movement_tmp, on=["date", "ID_1", "ID_2"], how="outer").dropna()
df

Unnamed: 0,date,ID_1,ID_2,ibd_ibd,fb_mobility
9,2020-04-30,76109,76109,0.08857,41559.0
11,2020-05-31,76109,76109,0.11429,42426.0
14,2020-06-30,76109,76109,0.26787,41272.0
15,2020-07-31,76109,76109,0.2869,39466.0
19,2020-08-31,76109,76109,0.09777,39813.0
22,2020-09-30,76109,76109,0.23233,13287.0
25,2020-10-31,76109,76109,0.10838,39922.0
27,2020-11-30,76109,76109,0.12173,36830.0
30,2020-12-31,76109,76109,1.0,26572.0


In [113]:
df.to_csv("~/Desktop/caucaseco/ibd_mobility_td_14.csv", index=False)

In [59]:
df_ibd.date.min()

Timestamp('2021-01-30 00:00:00')