In [1]:
if 'google.colab' in str(get_ipython()):
    import sys
    from google.colab import drive, output
    drive.mount('/content/drive')
    !pip install torchaudio
    !pip install wandb --upgrade
    output.clear()
    print("Running on colab")
    %load_ext autoreload
    %autoreload 1
    %cd '/content/drive/MyDrive/PhD_Thesis_Experiments/DeepLearning/AutoEncoders/Project'
    root = "/content/drive/MyDrive/PhD_Thesis_Experiments/DeepLearning/AutoEncoders/Project"
else:
    print("Running local")
    root = "/home/mirp_ai/Documents/Daniel_Nieto/PhD/AutoEncoders_Ecoacoustics"
    root_path = "/media/mirp_ai/Seagate Desktop Drive/Jaguas_2018"


Running local


In [2]:
import os
import numpy as np

import datetime
from datetime import timedelta

import torch
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
from torchvision.utils import make_grid
from Jaguas_DataLoader_rainless import SoundscapeData
import matplotlib.pyplot as plt
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [3]:
class df_generator():
    def __init__(self, root_clusters, root_audios, root_recorders, n_clusters_index = 0):
        self.n_clusters_index = n_clusters_index
        self.cluster_names = os.listdir(f"{root_clusters}")
        self.torch_clusters = torch.load(f"{root_clusters}/{self.cluster_names[n_clusters_index]}")
        self.dataframe_clusters = pd.DataFrame(self.torch_clusters)
        self.dataframe_clusters = self.dataframe_clusters.transpose()
        self.dataframe_audios = pd.read_csv(f"{root_audios}", index_col=0)
        self.dataframe_recorders = pd.read_csv(f"{root_recorders}",sep = ";", index_col = "Recorder")
        self.df_clusters_len = len(self.dataframe_clusters)
        self.df_recorders_len = len(self.dataframe_recorders)
        self.n_clusters = np.arange(len(self.torch_clusters))
        
        for i in self.n_clusters:
            self.dataframe_audios[f"Cluster {i}"] = 0
        self.dataframe_audios.set_index("Filename", drop=False, inplace=True)
        for i in self.n_clusters:
            counts = self.dataframe_clusters[i].value_counts()
            c_index = counts.index
            c_value = counts.values
            for j in range(len(counts)):
                self.dataframe_audios.loc[c_index[j], f"Cluster {i}"] = c_value[j]

        self.dataframe_audios["Cluster Sum"] = self.dataframe_audios.loc[:,"Cluster 0": f"Cluster {len(self.n_clusters)-1}"].sum(axis=1)
        self.dataframe_audios = self.dataframe_audios[self.dataframe_audios["Cluster Sum"]!=0]
        self.dataframe_audios.set_index(pd.Index(range(0,len(self.dataframe_audios))), inplace=True)


    def show_clusters(self, keyword=None, plot=True):
        cluster_names = []
        index = []
        if keyword != None:
            for i in range(len(self.cluster_names)):
                if keyword in self.cluster_names[i]:
                    cluster_names.append(self.cluster_names[i])
                    index.append(i)
                else:
                    pass
        else:
            for i in range(len(self.cluster_names)):
                    cluster_names.append(self.cluster_names[i])
                    index.append(i)
                    
        if plot==True:
            for i in range(len(cluster_names)):
                print(f"{index[i]}: {cluster_names[i]}")
        else:
            pass
        
        return cluster_names, index

    def init_clusters(self):
        for cluster in self.n_clusters:
            self.dataframe_recorders[f"Cluster {cluster}"] = 0
    
    def create_clusters(self, count_cluster=False):
        self.init_clusters()
        for cluster in self.n_clusters:
            for i in range(self.df_clusters_len):  
                if self.dataframe_clusters.iloc[i][cluster] != None:
                    self.dataframe_recorders.loc[self.dataframe_clusters.iloc[i][cluster].split("_")[0], f"Cluster {cluster}"] +=1
                else:
                    pass
        if count_cluster == True:
            self.count_cluster_data()
        else:
            pass
    
    def create_clusters_v2(self, count_cluster=False):
        self.init_clusters()
        for cluster in self.n_clusters:
            for i in range(len(self.dataframe_audios)): 
                self.dataframe_recorders.loc[self.dataframe_audios.loc[i, "Filename"].split("_")[0], f"Cluster {cluster}"] += self.dataframe_audios.loc[i, f"Cluster {cluster}"]
        if count_cluster == True:
            self.count_cluster_data()
        else:
            pass
    
    def count_cluster_data(self):
        self.dataframe_recorders["Total_Clustering_Audios"] = 0
        for recorder in range(self.df_recorders_len):
            clusters = [f"Cluster {i}" for i in range(0, len(self.n_clusters))]
            clusters.append("Total_Clustering_Audios")
            self.dataframe_recorders.iloc[recorder, -1] = self.dataframe_recorders[clusters].iloc[recorder].sum()
    
    def GLM_dataframe(self):
        GLM = self.dataframe_recorders.copy()
        columns = [f"Cluster {i}" for i in range(0, len(self.n_clusters))]
        columns.append("Total_Clustering_Audios")
        GLM = GLM[columns]
        for cluster in range(len(columns[0:-1])):
            for recorder in range(self.df_recorders_len):
                GLM.iloc[recorder, cluster] = GLM.iloc[recorder, cluster]/GLM.iloc[recorder, -1]
        return GLM
    
    def regions(self):
        dataframe_audios_regions = self.dataframe_audios.copy()
        dataframe_audios_regions["Region"] = 0
        for audio in range(len(dataframe_audios_regions)):   
            dataframe_audios_regions.loc[audio,"Region"] = self.dataframe_recorders.loc[dataframe_audios_regions.loc[audio, "Recorder"]].loc["Region"]
        return dataframe_audios_regions
    
    def recorders(self):
        return self.dataframe_recorders
    
    def audios(self):
        return self.dataframe_audios
    
    def clusters(self):
        return self.dataframe_clusters
    
    def save(self, root_save=None):
        if root_save != None:
            os.makedirs(f"{self.root_save}/dataframes", exist_ok=True)
            self.dataframe_recorders.to_csv(f"{self.root_save}/dataframes/dataframe_{self.cluster_names[self.n_clusters_index]}_recorders.csv")
            self.dataframe_audios.to_csv(f"{self.root_save}/dataframes/dataframe_{self.cluster_names[self.n_clusters_index]}_audios.csv") 
            
        else:
            self.dataframe_recorders.to_csv(f"{self.cluster_names[sel.n_clusters_index]}_recorders.csv")
            self.dataframe_audios.to_csv(f"{self.cluster_names[sel.n_clusters_index]}_audios.csv")

In [4]:
root_audios = f"{root}/Jaguas/Complementary_Files/Audios_Jaguas/Audios_Jaguas.csv"
root_recorders = f"{root}/Jaguas/Complementary_Files/df_grabadoras_reg.csv"
root_clusters = f"{root}/Jaguas/temporal/clusters"
ecological_integrity = f"{root}/Jaguas/Complementary_Files/Indice_Integridad_Ecologica.xlsx"
df_EI = pd.read_excel(ecological_integrity)
df_EI.rename(columns={"Sitio":"Recorder"}, inplace=True)
df_EI.set_index("Recorder", inplace=True)

In [5]:
dataframe = df_generator(root_clusters, root_audios, root_recorders, 
                         n_clusters_index=20)
dataframe.create_clusters_v2(count_cluster=True)
GLM = dataframe.GLM_dataframe()
# GLM = GLM.reset_index().set_index(np.arange(len(GLM)))
GLM["Mean"] = df_EI["Mean"]
GLM["Sum"] = df_EI["Sum"]
GLM["Max"] = df_EI["Max"]
GLM["Min"] = df_EI["Min"]
regiones = dataframe.regions()

In [8]:
dataframe = df_generator(root_clusters, root_audios, root_recorders, 
                         n_clusters_index=20)
dataframe.dataframe_audios = dataframe.dataframe_audios[dataframe.dataframe_audios.Time>"06:00"][dataframe.dataframe_audios.Time<"14:00"]
dataframe.dataframe_audios.reset_index(inplace=True)
dataframe.create_clusters_v2(count_cluster=True)
GLM = GLM = dataframe.GLM_dataframe()
GLM["Mean"] = df_EI["Mean"]
GLM["Sum"] = df_EI["Sum"]
GLM["Max"] = df_EI["Max"]
GLM["Min"] = df_EI["Min"]
GLM

  dataframe.dataframe_audios = dataframe.dataframe_audios[dataframe.dataframe_audios.Time>"06:00"][dataframe.dataframe_audios.Time<"14:00"]


Unnamed: 0_level_0,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,Cluster 8,Cluster 9,Total_Clustering_Audios
Recorder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G03,0.0,0.202198,0.046154,0.016484,0.238462,0.006593,0.112088,0.0,0.0,0.378022,910
G04,0.083333,0.065278,0.088889,0.151389,0.483333,0.0,0.054167,0.0,0.070833,0.002778,720
G06,0.0,0.002344,0.175,0.042188,0.610938,0.000781,0.030469,0.0,0.023438,0.114844,1280
G07,0.0,0.028866,0.408247,0.020619,0.339175,0.001031,0.014433,0.0,0.07732,0.110309,970
G08,0.0,0.144828,0.031527,0.032512,0.289655,0.0,0.190148,0.0,0.004926,0.306404,1015
G09,0.0,0.022472,0.092135,0.146816,0.525843,0.004494,0.012734,0.002996,0.192509,0.0,1335
G13,0.0,0.135238,0.110476,0.192381,0.104762,0.000952,0.08,0.000952,0.11619,0.259048,1050
G15,0.0,0.07027,0.069498,0.013127,0.094981,0.0,0.000772,0.0,0.017761,0.733591,1295
G17,0.0,0.158491,0.028679,0.073208,0.18566,0.0,0.256604,0.0,0.002264,0.295094,1325
G19,0.0,0.15249,0.031418,0.02069,0.255939,0.0,0.085824,0.0,0.0,0.45364,1305


In [None]:
aud = dataframe.dataframe_audios
aud

In [None]:
r, index = dataframe.show_clusters("_AE_")
print(r, index)
trials_3 = [19, 26, 18, 11]
trials_5 = [2, 0, 1, 8]
trials_10 = [9, 16, 20, 7]
trials_15 = [14, 4, 22, 25]
trials = [trials_3, trials_5, trials_10, trials_15]

In [None]:
import matplotlib.colors as mcolors
import random

colors_list = ["brown", "darkcyan", "purple", "forestgreen", "goldenrod", "dimgray", "firebrick", "lightseagreen", "darkolivegreen", "blueviolet", "darkgreen", "orangered", "deepskyblue", "indigo", "black", "darkorange"]
colors_list2 = ["crimson", "firebrick", "orangered", "coral", "peachpuff", "darkslategray", "teal", "dodgerblue", "deepskyblue", "skyblue"]
def random_color_generator():
    colors = []
    for i in range(n_clusters):
        random_color = random.choice(list(mcolors.CSS4_COLORS.keys()))
        while(random_color in colors):
            random_color = random.choice(list(mcolors.CSS4_COLORS.keys()))
        colors.append(random_color)
    return colors


    

In [None]:
%matplotlib qt
r, index = dataframe.show_clusters(plot=False)
trials_3 = [19, 26, 18, 11]
trials_5 = [2, 0, 1, 8]
trials_10 = [9, 16, 20, 7]
trials_15 = [14, 4, 22, 25]
trials = [trials_3, trials_5, trials_10, trials_15]

for trial in trials:
    for i, t in enumerate(trial):
        dataframe = df_generator(root_clusters, root_audios, root_recorders, 
                                 n_clusters_index=t)
        dataframe.create_clusters(count_cluster=True)
        GLM = dataframe.GLM_dataframe()
        # GLM = GLM.reset_index().set_index(np.arange(len(GLM)))
        GLM["Mean"] = df_EI["Mean"]
        GLM["Sum"] = df_EI["Sum"]
        GLM["Max"] = df_EI["Max"]
        GLM["Min"] = df_EI["Min"]
        n_clusters = len(GLM.iloc[0])-5
        columns = [f"Cluster {i}" for i in range(0, n_clusters)]
        Bar_GLM = GLM[columns]
        Bar_GLM["Mean"] = df_EI["Mean"]
        # Bar_GLM.index.names = recorders.index
        Bar_GLM = Bar_GLM.sort_values(by=["Mean"])
        Bar_GLM2=Bar_GLM.drop(columns=["Mean"], inplace=False)
        ax=Bar_GLM2.plot(kind='bar', stacked=True, color=colors_list)
        plt.title(r[trial[i]])
        Bar_GLM["Mean"].plot(ax=ax, color="black")
        plt.savefig(f"{root}/Jaguas/temporal/ei_results/{r[trial[i]]}.pdf",format="pdf")

In [None]:
%matplotlib qt
import matplotlib.pyplot as plt
plt.figure()
ax=Bar_GLM2.plot(kind='bar', stacked=True, color=colors_list)
Bar_GLM["Mean"].plot(ax=ax, color="black")

# Using SK-Learn

In [None]:
from sklearn import linear_model
clf = linear_model.PoissonRegressor()
clf.fit(X,y)

In [None]:
clf.score(X,y)

# Using StatsModels

In [None]:
gamma_results.pseudo_rsquared()

In [None]:
PCA_index = [27, 10, 6, 23]
UMAP_index = [3, 12, 5, 21]
TSNE_index = [29, 15, 17, 13]

In [None]:
import matplotlib.pyplot as plt
plt.plot(r2_PCA)
plt.plot(r2_UMAP)
plt.plot(r2_TSNE)
plt.legend(["PCA", "UMAP", "TSNE"])
plt.xticks((0,1,2,3),("3", "5", "10", "15"))

In [None]:
plt.plot(r2_PCA)

In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
r2_PCA = []
trials = TSNE_index;
for index in trials:
    dataframe = df_generator(root_clusters, root_audios, root_recorders, 
                         n_clusters_index=index)
    dataframe.create_clusters(count_cluster=True)
    GLM = dataframe.GLM_dataframe()
#     GLM = GLM.reset_index().set_index(np.arange(len(GLM)))
    GLM["Mean"] = df_EI["Mean"]
    GLM["Sum"] = df_EI["Sum"]
    GLM["Max"] = df_EI["Max"]
    GLM["Min"] = df_EI["Min"]
    X= GLM[[f"Cluster {i}" for i in range(len(GLM.columns)-6)]]
    X = X.to_numpy()
    y = GLM["Mean"]
    y = y.to_numpy()
    gamma_model = sm.GLM(y, X, family=sm.families.Gamma())
    gamma_results = gamma_model.fit()
    r2_PCA.append(gamma_results.pseudo_rsquared())

In [None]:
print(r2_TSNE)

In [None]:
GLM.columns

In [None]:
import statsmodels.api as sm
X= GLM[[f"Cluster {i}" for i in range(len(GLM.columns)-6)]]
X = X.to_numpy()
y = GLM["Mean"]
y = y.to_numpy()
gamma_model = sm.GLM(y, X, family=sm.families.Gamma())
gamma_results = gamma_model.fit()
print(gamma_results.summary())

In [None]:
import statsmodels.api as sm
data = sm.datasets.scotland.load()
data.exog = sm.add_constant(data.exog)
gamma_model = sm.GLM(data.endog, data.exog, family=sm.families.Gamma())
gamma_results = gamma_model.fit()
print(gamma_results.summary())

# SK-Learn example

In [None]:
from sklearn import linear_model
clf = linear_model.PoissonRegressor()
X = [[1, 2], [2, 3], [3, 4], [4, 3]]
y = [12, 17, 22, 21]
clf.fit(X, y)
clf.score(X, y)
