In [1]:
if 'google.colab' in str(get_ipython()):
    import sys
    from google.colab import drive, output
    drive.mount('/content/drive')
    !pip install torchaudio
    !pip install wandb --upgrade
    output.clear()
    print("Running on colab")
    %load_ext autoreload
    %autoreload 1
    %cd '/content/drive/MyDrive/PhD_Thesis_Experiments/DeepLearning/AutoEncoders/Project'
    root = "/content/drive/MyDrive/PhD_Thesis_Experiments/DeepLearning/AutoEncoders/Project"
else:
    print("Running local")
    root = "/home/mirp_ai/Documents/Daniel_Nieto/PhD/AutoEncoders_Ecoacoustics"
    root_path = "/media/mirp_ai/Seagate Desktop Drive/Jaguas_2018"


Running local


In [2]:
import os
import numpy as np

import datetime
from datetime import timedelta

import torch
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
from torchvision.utils import make_grid
from Jaguas_DataLoader_rainless import SoundscapeData
import matplotlib.pyplot as plt
import pandas as pd
import folium
from folium.plugins import HeatMap
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [3]:
class df_generator():
    def __init__(self, root_clusters, root_audios, root_recorders, n_clusters_index = 0):
        self.n_clusters_index = n_clusters_index
        self.cluster_names = os.listdir(f"{root_clusters}")
        self.torch_clusters = torch.load(f"{root_clusters}/{self.cluster_names[n_clusters_index]}")
        self.dataframe_clusters = pd.DataFrame(self.torch_clusters)
        self.dataframe_clusters = self.dataframe_clusters.transpose()
        self.dataframe_audios = pd.read_csv(f"{root_audios}", index_col=0)
        self.dataframe_recorders = pd.read_csv(f"{root_recorders}",sep = ";", index_col = "Recorder")
        self.df_clusters_len = len(self.dataframe_clusters)
        self.df_recorders_len = len(self.dataframe_recorders)
        self.n_clusters = np.arange(len(self.torch_clusters))
        
        for i in self.n_clusters:
            self.dataframe_audios[f"Cluster {i}"] = 0
        self.dataframe_audios.set_index("Filename", drop=False, inplace=True)
        for i in self.n_clusters:
            counts = self.dataframe_clusters[i].value_counts()
            c_index = counts.index
            c_value = counts.values
            for j in range(len(counts)):
                self.dataframe_audios.loc[c_index[j], f"Cluster {i}"] = c_value[j]

        self.dataframe_audios["Cluster Sum"] = self.dataframe_audios.loc[:,"Cluster 0": f"Cluster {len(self.n_clusters)-1}"].sum(axis=1)
        self.dataframe_audios = self.dataframe_audios[self.dataframe_audios["Cluster Sum"]!=0]
        self.dataframe_audios.set_index(pd.Index(range(0,len(self.dataframe_audios))), inplace=True)


    def show_clusters(self, keyword=None, plot=True):
        cluster_names = []
        index = []
        if keyword != None:
            for i in range(len(self.cluster_names)):
                if keyword in self.cluster_names[i]:
                    cluster_names.append(self.cluster_names[i])
                    index.append(i)
                else:
                    pass
        else:
            for i in range(len(self.cluster_names)):
                    cluster_names.append(self.cluster_names[i])
                    index.append(i)
                    
        if plot==True:
            for i in range(len(cluster_names)):
                print(f"{index[i]}: {cluster_names[i]}")
        else:
            pass
        
        return cluster_names, index

    def init_clusters(self):
        for cluster in self.n_clusters:
            self.dataframe_recorders[f"Cluster {cluster}"] = 0
    
#     def create_clusters(self, count_cluster=False):
#         self.init_clusters()
#         for cluster in self.n_clusters:
#             for i in range(self.df_clusters_len):  
#                 if self.dataframe_clusters.iloc[i][cluster] != None:
#                     self.dataframe_recorders.loc[self.dataframe_clusters.iloc[i][cluster].split("_")[0], f"Cluster {cluster}"] +=1
#                 else:
#                     pass
#         if count_cluster == True:
#             self.count_cluster_data()
#         else:
#             pass
    
    def create_clusters_v2(self, count_cluster=False):
        self.init_clusters()
        for cluster in self.n_clusters:
            for i in range(len(self.dataframe_audios)): 
                self.dataframe_recorders.loc[self.dataframe_audios.loc[i, "Filename"].split("_")[0], f"Cluster {cluster}"] += self.dataframe_audios.loc[i, f"Cluster {cluster}"]
        if count_cluster == True:
            self.count_cluster_data()
        else:
            pass
    
    def count_cluster_data(self):
        clusters = [f"Cluster {i}" for i in range(0, len(self.n_clusters))]
        self.dataframe_recorders["Total_Clustering_Audios"] = self.dataframe_recorders[clusters].sum(axis=1)
    
    def GLM_dataframe(self):
        GLM = self.dataframe_recorders.copy()
        columns = [f"Cluster {i}" for i in range(0, len(self.n_clusters))]
        columns.append("Total_Clustering_Audios")
        GLM = GLM[columns]
        for cluster in range(len(columns[0:-1])):
            for recorder in range(self.df_recorders_len):
                GLM.iloc[recorder, cluster] = GLM.iloc[recorder, cluster]/GLM.iloc[recorder, -1]
        return GLM
    
    def regions(self):
        dataframe_audios_regions = self.dataframe_audios.copy()
        dataframe_audios_regions["Region"] = 0
        for audio in range(len(dataframe_audios_regions)):   
            dataframe_audios_regions.loc[audio,"Region"] = self.dataframe_recorders.loc[dataframe_audios_regions.loc[audio, "Recorder"]].loc["Region"]
        return dataframe_audios_regions
    
    def recorders(self):
        return self.dataframe_recorders
    
    def audios(self):
        return self.dataframe_audios
    
    def clusters(self):
        return self.dataframe_clusters
    
    def save(self, root_save=None):
        if root_save != None:
            os.makedirs(f"{self.root_save}/dataframes", exist_ok=True)
            self.dataframe_recorders.to_csv(f"{self.root_save}/dataframes/dataframe_{self.cluster_names[self.n_clusters_index]}_recorders.csv")
            self.dataframe_audios.to_csv(f"{self.root_save}/dataframes/dataframe_{self.cluster_names[self.n_clusters_index]}_audios.csv") 
            
        else:
            self.dataframe_recorders.to_csv(f"{self.cluster_names[sel.n_clusters_index]}_recorders.csv")
            self.dataframe_audios.to_csv(f"{self.cluster_names[sel.n_clusters_index]}_audios.csv")
            
class df_results:
    def __init__(self, df_base, trials, df_EI = None):
        self.df_base = df_base
#         self.r, self.index = self.df_base.show_clusters(plot=False)
        self.df_EI = df_EI
        self.df_clusters_EI = df_base
        self.n_clusters = 0
        self.trials = trials
        self.colors_list = [["brown", "darkcyan", "purple", "forestgreen", "goldenrod", "dimgray", "firebrick", "lightseagreen", "darkolivegreen", "blueviolet", "darkgreen", "orangered", "deepskyblue", "indigo", "black", "darkorange"],
                            ["crimson", "firebrick", "orangered", "coral", "peachpuff", "darkslategray", "teal", "dodgerblue", "deepskyblue", "skyblue"]]
        self.colors = self.colors_list[0]
    def colors(self, palette=0):
        self.colors = self.colors_list[palette]
        return self.colors
        
    def random_color_generator(self):
        colors = []
        for i in range(n_clusters):
            random_color = random.choice(list(mcolors.CSS4_COLORS.keys()))
            while(random_color in colors):
                random_color = random.choice(list(mcolors.CSS4_COLORS.keys()))
            colors.append(random_color)
        return colors
    
    def create_EI(self):
        self.df_clusters_EI["Mean"] = self.df_EI["Mean"]
        self.df_clusters_EI["Sum"] = self.df_EI["Sum"]
        self.df_clusters_EI["Max"] = self.df_EI["Max"]
        self.df_clusters_EI["Min"] = self.df_EI["Min"]
        self.n_clusters = len(self.df_clusters_EI.iloc[0])-5
        
    def plot_bars(self, root="", title=None):
        columns = [f"Cluster {i}" for i in range(0, self.n_clusters)]
        Bar_GLM = self.df_clusters_EI[columns]
        Bar_GLM["Mean"] = self.df_EI["Mean"]
        # Bar_GLM.index.names = recorders.index
        Bar_GLM = Bar_GLM.sort_values(by=["Mean"])
        Bar_GLM2=Bar_GLM.drop(columns=["Mean"], inplace=False)
        ax=Bar_GLM2.plot(kind='bar', stacked=True, color=self.colors)
        plt.title(f"{title}")
        Bar_GLM["Mean"].plot(ax=ax, color="black")
        plt.savefig(f"{root}.pdf",format="pdf")
        
    def GLM(self,y_data="Mean"):
        GLM = self.df_clusters_EI.copy()
        X= GLM[[f"Cluster {i}" for i in range(len(GLM.columns)-6)]]
        X = X.to_numpy()
        y = GLM[y_data]
        y = y.to_numpy()
        gamma_model = sm.GLM(y, X, family=sm.families.Gamma())
        gamma_results = gamma_model.fit()
        r2 = gamma_results.pseudo_rsquared()
        return gamma_results, r2


## Feature Dataframes Organization for fair classification

In [2]:
import pandas as pd
df_ai = pd.read_csv("df_ai.csv")
df_ae_unflat = pd.read_csv("df_ae_unflat.csv")
df_vgg = pd.read_csv("df_vgg.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'df_ai.csv'

In [None]:
df_ai.set_index("y", inplace=True)
df_vgg.set_index("y", inplace=True)
df_ae_unflat.set_index("y", inplace=True)

In [None]:
indices_comunes = df_ai.index.intersection(df_ae_unflat.index)

In [None]:
df_ai = df_ai.loc[indices_comunes].sort_index()
df_ae_unflat = df_ae_unflat.loc[indices_comunes].sort_index()
df_vgg = df_vgg.loc[indices_comunes].sort_index() # -> se verifico que los índices comunes son iguales para vgg

In [None]:
df_ai.to_csv("New_df_ai.csv", index=True)

In [None]:
df_ae_unflat.to_csv("New_df_ae_unflat.csv", index=True)

In [None]:
df_vgg.to_csv("New_df_vgg.csv", index=True)

# GLMS

In [3]:
root_audios = f"{root}/Jaguas/Complementary_Files/Audios_Jaguas/Audios_Jaguas.csv"
root_recorders = f"{root}/Jaguas/Complementary_Files/df_grabadoras_reg.csv"
root_clusters = f"{root}/Jaguas/temporal/clusters"
ecological_integrity = f"{root}/Jaguas/Complementary_Files/Indice_Integridad_Ecologica.xlsx"
df_EI = pd.read_excel(ecological_integrity)
df_EI.rename(columns={"Sitio":"Recorder"}, inplace=True)
df_EI.set_index("Recorder", inplace=True)

In [5]:
dataframe = df_generator(root_clusters, root_audios, root_recorders, 
                         n_clusters_index=20)
dataframe.dataframe_audios = dataframe.dataframe_audios[dataframe.dataframe_audios.Time>"17:00"][dataframe.dataframe_audios.Time<"24:00"]
dataframe.dataframe_audios.reset_index(inplace=True)
dataframe.create_clusters_v2(count_cluster=True)
GLM = GLM = dataframe.GLM_dataframe()
GLM["Mean"] = df_EI["Mean"]
GLM["Sum"] = df_EI["Sum"]
GLM["Max"] = df_EI["Max"]
GLM["Min"] = df_EI["Min"]

  dataframe.dataframe_audios = dataframe.dataframe_audios[dataframe.dataframe_audios.Time>"17:00"][dataframe.dataframe_audios.Time<"24:00"]


In [None]:
r, index = dataframe.show_clusters("_AE_")
print(r, index)
trials_3 = [19, 26, 18, 11]
trials_5 = [2, 0, 1, 8]
trials_10 = [9, 16, 20, 7]
trials_15 = [14, 4, 22, 25]
trials = [trials_3, trials_5, trials_10, trials_15]

In [None]:
dataframe = df_generator(root_clusters, root_audios, root_recorders, 
                         n_clusters_index=20)
dataframe.dataframe_audios = dataframe.dataframe_audios[dataframe.dataframe_audios.Time>"05:00"][dataframe.dataframe_audios.Time<"08:00"]
#         dataframe.dataframe_audios = dataframe.dataframe_audios.drop(aux.index)
dataframe.dataframe_audios.reset_index(inplace=True)
dataframe.create_clusters_v2(count_cluster=True)
GLM = dataframe.GLM_dataframe()
results = df_results(GLM, trials, df_EI)
results.create_EI()
results.plot_bars()

In [None]:
%matplotlib qt
r, index = dataframe.show_clusters(plot=False)
trials_3 = [19, 26, 18, 11]
trials_5 = [2, 0, 1, 8]
trials_10 = [9, 16, 20, 7]
trials_15 = [14, 4, 22, 25]
trials = [trials_3, trials_5, trials_10, trials_15]
horas = [(5, 8), (8, 17), (17, 5)]
for trial in trials:
    for i, t in enumerate(trial):
        dataframe = df_generator(root_clusters, root_audios, root_recorders, 
                         n_clusters_index=t)
        dataframe.dataframe_audios = dataframe.dataframe_audios[dataframe.dataframe_audios.Time>"05:00"][dataframe.dataframe_audios.Time<"08:00"]
#         dataframe.dataframe_audios = dataframe.dataframe_audios.drop(aux.index)
        dataframe.dataframe_audios.reset_index(inplace=True)
        dataframe.create_clusters_v2(count_cluster=True)
        GLM = dataframe.GLM_dataframe()
        results = df_results(GLM, trials, df_EI)
        results.create_EI()
        results.plot_bars()

In [None]:
mapa = folium.Map(location=[6.3828, -75.0157], zoom_start=12)

puntos = df[['latitude_IG', 'longitud_IG', 'number_files_FI']].values.tolist()
valores = df["number_files_FI"].tolist()
HeatMap(puntos, radius=35, ).add_to(mapa)
mapa

%matplotlib qt
r, index = dataframe.show_clusters(plot=False)
trials_3 = [19, 26, 18, 11]
trials_5 = [2, 0, 1, 8]
trials_10 = [9, 16, 20, 7]
trials_15 = [14, 4, 22, 25]
trials = [trials_3, trials_5, trials_10, trials_15]
horas = [(5, 8), (8, 17), (17, 5)]
for trial in trials:
    for i, t in enumerate(trial):
        dataframe = df_generator(root_clusters, root_audios, root_recorders, 
                         n_clusters_index=t)
        dataframe.dataframe_audios = dataframe.dataframe_audios[dataframe.dataframe_audios.Time>"05:00"][dataframe.dataframe_audios.Time<"08:00"]
#         dataframe.dataframe_audios = dataframe.dataframe_audios.drop(aux.index)
        dataframe.dataframe_audios.reset_index(inplace=True)
        dataframe.create_clusters_v2(count_cluster=True)
        GLM = dataframe.GLM_dataframe()
        GLM["Mean"] = df_EI["Mean"]
        GLM["Sum"] = df_EI["Sum"]
        GLM["Max"] = df_EI["Max"]
        GLM["Min"] = df_EI["Min"]
        n_clusters = len(GLM.iloc[0])-5
        columns = [f"Cluster {i}" for i in range(0, n_clusters)]
        Bar_GLM = GLM[columns]
        Bar_GLM["Mean"] = df_EI["Mean"]
        # Bar_GLM.index.names = recorders.index
        Bar_GLM = Bar_GLM.sort_values(by=["Mean"])
        Bar_GLM2=Bar_GLM.drop(columns=["Mean"], inplace=False)
        ax=Bar_GLM2.plot(kind='bar', stacked=True, color=colors_list)
        plt.title(f"{r[trial[i]]}_dawn")
        Bar_GLM["Mean"].plot(ax=ax, color="black")
        plt.savefig(f"{root}/Jaguas/temporal/ei_results/dawn/{r[trial[i]]}_dawn.pdf",format="pdf")

In [None]:
%matplotlib qt
import matplotlib.pyplot as plt
plt.figure()
ax=Bar_GLM2.plot(kind='bar', stacked=True, color=colors_list)
Bar_GLM["Mean"].plot(ax=ax, color="black")

# Using StatsModels

In [None]:
gamma_results.pseudo_rsquared()

In [None]:
PCA_index = [27, 10, 6, 23]
UMAP_index = [3, 12, 5, 21]
TSNE_index = [29, 15, 17, 13]

In [None]:
import matplotlib.pyplot as plt
plt.plot(r2_PCA)
plt.plot(r2_UMAP)
plt.plot(r2_TSNE)
plt.legend(["PCA", "UMAP", "TSNE"])
plt.xticks((0,1,2,3),("3", "5", "10", "15"))

In [None]:
plt.plot(r2_PCA)

In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
r2_PCA = []
trials = TSNE_index;
for index in trials:
    dataframe = df_generator(root_clusters, root_audios, root_recorders, 
                         n_clusters_index=index)
    dataframe.create_clusters(count_cluster=True)
    GLM = dataframe.GLM_dataframe()
#     GLM = GLM.reset_index().set_index(np.arange(len(GLM)))
    GLM["Mean"] = df_EI["Mean"]
    GLM["Sum"] = df_EI["Sum"]
    GLM["Max"] = df_EI["Max"]
    GLM["Min"] = df_EI["Min"]
    X= GLM[[f"Cluster {i}" for i in range(len(GLM.columns)-6)]]
    X = X.to_numpy()
    y = GLM["Mean"]
    y = y.to_numpy()
    gamma_model = sm.GLM(y, X, family=sm.families.Gamma())
    gamma_results = gamma_model.fit()
    r2_PCA.append(gamma_results.pseudo_rsquared())

In [None]:
import statsmodels.api as sm
X= GLM[[f"Cluster {i}" for i in range(len(GLM.columns)-6)]]
X = X.to_numpy()
y = GLM["Mean"]
y = y.to_numpy()
gamma_model = sm.GLM(y, X, family=sm.families.Gamma())
gamma_results = gamma_model.fit()
print(gamma_results.summary())

In [None]:
import statsmodels.api as sm
data = sm.datasets.scotland.load()
data.exog = sm.add_constant(data.exog)
gamma_model = sm.GLM(data.endog, data.exog, family=sm.families.Gamma())
gamma_results = gamma_model.fit()
print(gamma_results.summary())

# SK-Learn example

In [None]:
from sklearn import linear_model
clf = linear_model.PoissonRegressor()
X = [[1, 2], [2, 3], [3, 4], [4, 3]]
y = [12, 17, 22, 21]
clf.fit(X, y)
clf.score(X, y)
