In [220]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

In [221]:
frame = pd.read_csv('for_data_science_newline_fixed.csv', usecols = ["IN FILE","CLIP LENGTH","OFFSET","DURATION"])
frame.head()

Unnamed: 0,IN FILE,CLIP LENGTH,OFFSET,DURATION
0,Aburria-aburri-257423.wav,24.6335,2.3614,3.3205
1,Aburria-aburri-257423.wav,24.6335,6.732,2.0103
2,Aburria-aburri-257423.wav,24.6335,19.2438,3.4505
3,Aburria-aburri-387212.wav,84.48,0.6011,3.08
4,Aburria-aburri-387212.wav,84.48,77.6811,3.14


In [222]:
def species(row):
    species = row['IN FILE']
    spec = species.split('-')
    # print(spec)
    # print(spec[:-1])
    spec_to = '-'.join(spec[:-2])
    return spec_to

frame['Genus'] = frame.apply (lambda row: species(row), axis=1)
# frame = frame[frame['Genus']== 'Piranga-leucoptera']
frame

Unnamed: 0,IN FILE,CLIP LENGTH,OFFSET,DURATION,Genus
0,Aburria-aburri-257423.wav,24.6335,2.3614,3.3205,Aburria
1,Aburria-aburri-257423.wav,24.6335,6.7320,2.0103,Aburria
2,Aburria-aburri-257423.wav,24.6335,19.2438,3.4505,Aburria
3,Aburria-aburri-387212.wav,84.4800,0.6011,3.0800,Aburria
4,Aburria-aburri-387212.wav,84.4800,77.6811,3.1400,Aburria
...,...,...,...,...,...
31360,Zonotrichia-capensis-232609.wav,98.8560,63.2772,1.8301,Zonotrichia
31361,Zonotrichia-capensis-232609.wav,98.8560,34.1554,2.4702,Zonotrichia
31362,Zonotrichia-capensis-377483.wav,18.2596,0.5762,3.2199,Zonotrichia
31363,Zonotrichia-capensis-377483.wav,18.2596,6.8160,2.1500,Zonotrichia


In [223]:
# Function copy-pasted directly from PyHa
def annotation_duration_statistics(df):
    """
    Function that calculates basic statistics related to the duration of
    annotations of a Pandas Dataframe compatible with PyHa.
    Args:
        df (Pandas Dataframe)
            - Automated labels or manual labels.
    Returns:
        Pandas Dataframe containing count, mean, mode, standard deviation, and
        IQR values based on annotation duration.
    """
    # Reading in the Duration column of the passed in dataframe as a Python
    # list
    annotation_lengths = df["DURATION"].to_list()
    # converting to numpy array which has more readily available statistics
    # functions
    annotation_lengths = np.asarray(annotation_lengths)
    # Converting the Python list to a numpy array
    entry = {'COUNT': np.shape(annotation_lengths)[0],
             'MODE': stats.mode(np.round(annotation_lengths, 2))[0][0],
             'MEAN': np.mean(annotation_lengths),
             'STANDARD DEVIATION': np.std(annotation_lengths),
             'MIN': np.amin(annotation_lengths),
             'Q1': np.percentile(annotation_lengths, 25),
             'MEDIAN': np.median(annotation_lengths),
             'Q3': np.percentile(annotation_lengths, 75),
             'MAX': np.amax(annotation_lengths)}
    # returning the dictionary as a pandas dataframe
    return pd.DataFrame.from_dict([entry])

In [224]:
def annotation_duration_histogram(
    annotation_df,
    n_bins = 6,
    min_length = None,
    max_length = None,
    save_fig = False,
    title = "Annotation Length Histogram",
    filename = "annotation_histogram.png"):
    """
    Function to build a histogram so a user can visually see the length of 
    the annotations they are working with. 
    Args:
        annotation_df (Dataframe)
            - Dataframe of automated or human labels
        n_bins (int)
            - number of histogram bins in the final histogram
            - default: 6
            
        min_length (int)
            - minimum length of the audio clip
            - default: 0s
        max_length (int)
            - maximum length of the audio clip
            - default: 60s
        save_fig (boolean)
            - Whether or not the histogram should be saved as a file.
            - default: False
        filename (string)
            - Name of the file to save the histogram to.
            - default: "annotation_histogram.png"
    Returns:
        Histogram of the length of the annotations.
    """
    # Create the initial histogram
    duration = annotation_df["DURATION"].to_list()
    sns_hist = sns.histplot(
        data=duration,
        bins=n_bins,
        line_kws=dict(edgecolor="k", linewidth=2),
        stat="count")

    # Modify the length of the x-axis as specified
    if max_length is not None and min_length is not None:
        if max_length < min_length:
            raise ValueError("max_length cannot be less than `min_length")
        plt.xlim(min_length, max_length)
    elif max_length is not None:
        plt.xlim(right=max_length)
    elif min_length is not None:
        plt.xlim(left=min_length)

    # Set title and the labels
    sns_hist.set_title(title)
    sns_hist.set(xlabel="Annotation Length (s)", ylabel = "Count")

    # Save the histogram if specified
    if save_fig: 
        sns_hist.get_figure().savefig(filename)

In [225]:
# frame = frame[frame['IN FILE'] == 'Aburria-aburri-257423.wav']
# frame

In [226]:
# annotation_duration_statistics(frame)

#Q1
#Median
#Q3

In [227]:
# annotation_duration_histogram(frame,n_bins = 20,min_length=0.0,max_length=5.0)

In [228]:
IQR_DURATION = frame.groupby('IN FILE')[['DURATION']].quantile([.25,.5,.75])
IQR_DURATION = IQR_DURATION.unstack(1)
IQR_DURATION = IQR_DURATION['DURATION'].rename_axis([None], axis=1).reset_index()
IQR_DURATION

Unnamed: 0,IN FILE,0.25,0.5,0.75
0,Aburria-aburri-257423.wav,2.665400,3.32050,3.38550
1,Aburria-aburri-387212.wav,3.110000,3.14000,3.17000
2,Accipiter-bicolor-451839.wav,0.202975,0.21550,0.22150
3,Accipiter-collaris-260335.wav,0.895100,1.03010,1.08760
4,Accipiter-collaris-260336.wav,0.515075,0.57010,0.67000
...,...,...,...,...
2438,Zimmerius-gracilipes-258668.wav,0.795775,0.81325,0.84185
2439,Zimmerius-gracilipes-258669.wav,0.470000,0.52000,0.55000
2440,Zonotrichia-capensis-232609.wav,1.480000,1.70010,1.92010
2441,Zonotrichia-capensis-377483.wav,2.254950,2.35990,2.78990


In [229]:
IQR_OFFSET = frame.groupby('IN FILE')[['OFFSET']].quantile([.25,.5,.75])
IQR_OFFSET = IQR_OFFSET.unstack(1)
IQR_OFFSET = IQR_OFFSET['OFFSET'].rename_axis([None], axis=1).reset_index()
IQR_OFFSET

Unnamed: 0,IN FILE,0.25,0.5,0.75
0,Aburria-aburri-257423.wav,4.546700,6.73200,12.987900
1,Aburria-aburri-387212.wav,15.521100,30.44110,54.061100
2,Accipiter-bicolor-451839.wav,1.754000,2.75345,3.760200
3,Accipiter-collaris-260335.wav,6.441750,17.87295,29.574150
4,Accipiter-collaris-260336.wav,5.994075,8.05175,18.697625
...,...,...,...,...
2438,Zimmerius-gracilipes-258668.wav,2.885275,4.33075,6.495800
2439,Zimmerius-gracilipes-258669.wav,7.280900,13.83070,19.740500
2440,Zonotrichia-capensis-232609.wav,23.454800,45.46610,63.277200
2441,Zonotrichia-capensis-377483.wav,3.696100,6.81600,9.885950


In [230]:
# from mpl_toolkits import mplot3d

In [231]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [232]:
# ax = plt.axes(projection='3d')

# zdata = IQR_DURATION[0.75].to_list()
# xdata = IQR_DURATION[0.5].to_list()
# ydata = IQR_DURATION[0.25].to_list()

# ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='viridis')
# plt.show()

In [233]:
# ax = plt.axes(projection='3d')

# zdata_OFFSET = IQR_OFFSET[0.75].to_list()
# xdata_OFFSET = IQR_OFFSET[0.5].to_list()
# ydata_OFFSET = IQR_OFFSET[0.25].to_list()

# ax.scatter3D(xdata_OFFSET, ydata_OFFSET, zdata_OFFSET, c=zdata_OFFSET, cmap='viridis')
# plt.show()

In [234]:
import plotly.express as px
df = IQR_DURATION
zdata = IQR_DURATION[0.75].to_list()
xdata = IQR_DURATION[0.5].to_list()
ydata = IQR_DURATION[0.25].to_list()

# fig = px.scatter_3d(df, x= .5, y= .25, z= .75,
#                     color= 'IN FILE')
# fig.show()

In [235]:
df = IQR_OFFSET
zdata_OFFSET = IQR_OFFSET[0.75].to_list()
xdata_OFFSET = IQR_OFFSET[0.5].to_list()
ydata_OFFSET = IQR_OFFSET[0.25].to_list()

# fig = px.scatter_3d(df, x= .5, y= .25, z= .75,
#                     color= 'IN FILE')
# fig.show()

In [236]:
def species(row):
    species = row['IN FILE']
    spec = species.split('-')
    # print(spec)
    # print(spec[:-1])
    spec_to = '-'.join(spec[:-2])
    return spec_to

# change dataframe to IQR_DURATION for getting plots for IQR_DURATION

IQR_OFFSET['Genus'] = IQR_OFFSET.apply (lambda row: species(row), axis=1)
len(IQR_OFFSET['Genus'].unique())

515

In [237]:
pca_data = IQR_OFFSET
X = pca_data.drop(columns = ['IN FILE','Genus'])
X.head()

Unnamed: 0,0.25,0.5,0.75
0,4.5467,6.732,12.9879
1,15.5211,30.4411,54.0611
2,1.754,2.75345,3.7602
3,6.44175,17.87295,29.57415
4,5.994075,8.05175,18.697625


In [238]:
Y = np.array(pca_data['Genus'])
Y[:10]

array(['Aburria', 'Aburria', 'Accipiter', 'Accipiter', 'Accipiter',
       'Accipiter', 'Accipiter', 'Accipiter', 'Accipiter', 'Accipiter'],
      dtype=object)

In [239]:
from sklearn.preprocessing import StandardScaler

In [240]:
scaler = StandardScaler(with_mean = False)
scaler.fit(X)
X_standardized = scaler.transform(X)
X_standard = pd.DataFrame(X_standardized, columns = X.columns)
X_standard


Feature names only support names that are all strings. Got feature names with dtypes: ['float']. An error will be raised in 1.2.


Feature names only support names that are all strings. Got feature names with dtypes: ['float']. An error will be raised in 1.2.



Unnamed: 0,0.25,0.5,0.75
0,0.429792,0.350780,0.479345
1,1.467184,1.586176,1.995235
2,0.165803,0.143472,0.138778
3,0.608928,0.931295,1.091494
4,0.566610,0.419548,0.690074
...,...,...,...
2438,0.272740,0.225660,0.239741
2439,0.688252,0.720668,0.728563
2440,2.217144,2.369074,2.335374
2441,0.349386,0.355157,0.364861


In [241]:
from sklearn.decomposition import PCA

In [242]:
pca = PCA(n_components = 3, random_state = 0)
pca.fit(X_standardized)
X_dim_reducted = pca.transform(X_standardized)
X_dim_reducted
result = pd.DataFrame(X_dim_reducted, columns = ['PCA1','PCA2','PCA3']) #

In [243]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [244]:
import plotly.express as px

In [245]:
fig = px.scatter(result, x='PCA1', y='PCA2',color=Y)
fig.show()

# fig = px.scatter_3d(result, x='PCA1', y='PCA2',z = 'PCA3',color=Y)
# fig.show()

In [246]:
from sklearn.manifold import TSNE

In [247]:
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(X_standardized) 


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 2443 samples in 0.004s...
[t-SNE] Computed neighbors for 2443 samples in 0.116s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2443
[t-SNE] Computed conditional probabilities for sample 2000 / 2443
[t-SNE] Computed conditional probabilities for sample 2443 / 2443
[t-SNE] Mean sigma: 0.043289
[t-SNE] KL divergence after 250 iterations with early exaggeration: 58.977364
[t-SNE] KL divergence after 300 iterations: 0.723199


In [248]:
tsneed = pd.DataFrame(tsne_results, columns = ['TSNE1','TSNE2','TSNE3']) # 
tsneed

Unnamed: 0,TSNE1,TSNE2,TSNE3
0,-3.805253,3.285165,-3.716061
1,5.786346,-4.224435,2.445219
2,2.599672,-5.072584,-4.209442
3,-2.233670,-1.605731,2.702690
4,-1.887503,4.958175,0.402364
...,...,...,...
2438,-3.534248,-3.987530,-5.202440
2439,-5.738752,3.048675,3.332662
2440,5.932861,1.155178,0.186091
2441,-1.824432,2.522704,-7.765113


In [249]:
Y

array(['Aburria', 'Aburria', 'Accipiter', ..., 'Zonotrichia',
       'Zonotrichia', 'Zonotrichia'], dtype=object)

In [250]:
# plt.figure(figsize=(16,10))

# sns.scatterplot(x="TSNE1", y="TSNE2",hue = Y,palette=sns.color_palette("hls", 515),data=tsneed,legend = False,alpha=0.3) #legend = "Full"
# plt.show()

In [251]:
import plotly.express as px

fig = px.scatter_3d(tsneed, x="TSNE1", y="TSNE2", z='TSNE3',color= Y)
fig.show()