In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import folium
from folium.plugins import HeatMap, HeatMapWithTime
%matplotlib inline

In [None]:
# Read the recordings table from "birds_india.csv" (path is relative to the
# notebook's working directory) into the frame every later cell works on.
birds_df = pd.read_csv("birds_india.csv")

In [None]:
# Preview the first rows to see what each column holds.
birds_df.head()

Unnamed: 0,id,gen,sp,ssp,en,rec,cnt,loc,lat,lng,...,lic,q,length,time,date,uploaded,also,rmk,bird-seen,playback-used
0,429767,Dendrocygna,bicolor,,Fulvous Whistling Duck,Sreekumar Chirukandoth,India,"OMR - Medavakkam Toll Gate (near Chennai), Ka...",12.9011,80.2199,...,//creativecommons.org/licenses/by-nc-sa/4.0/,A,0:08,07:00,2017-08-27,2018-08-08,"['Corvus splendens', 'Tachybaptus ruficollis']",Call of a single bird flying in to join a smal...,yes,no
1,369151,Dendrocygna,javanica,,Lesser Whistling Duck,Peter Boesman,India,"Near Chouldari, South Andaman County, Andaman ...",11.63,92.67,...,//creativecommons.org/licenses/by-nc-nd/4.0/,A,0:18,15:00,2017-04-23,2017-05-12,[''],,unknown,unknown
2,369150,Dendrocygna,javanica,,Lesser Whistling Duck,Peter Boesman,India,"Near Chouldari, South Andaman County, Andaman ...",11.63,92.67,...,//creativecommons.org/licenses/by-nc-nd/4.0/,A,0:30,15:00,2017-04-23,2017-05-12,[''],,unknown,unknown
3,178554,Dendrocygna,javanica,,Lesser Whistling Duck,Eveny Luis,India,"Goncoi, Aldona,Bardez, Goa",15.5966,73.8729,...,//creativecommons.org/licenses/by-nc-nd/4.0/,A,0:06,19:17,2014-05-18,2014-05-18,[''],,yes,no
4,472687,Dendrocygna,javanica,,Lesser Whistling Duck,Peter Boesman,India,"Keoladeo National Park, Bharatpur, Rajasthan",27.1593,77.5232,...,//creativecommons.org/licenses/by-nc-nd/4.0/,B,0:27,10:30,2019-04-06,2019-05-10,['Mycteria leucocephala'],Calls of a small group,unknown,unknown


In [None]:
# Column names, non-null counts and dtypes for the whole frame.
birds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13654 entries, 0 to 13653
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             13654 non-null  int64  
 1   gen            13654 non-null  object 
 2   sp             13654 non-null  object 
 3   ssp            1735 non-null   object 
 4   en             13654 non-null  object 
 5   rec            13654 non-null  object 
 6   cnt            13654 non-null  object 
 7   loc            13119 non-null  object 
 8   lat            12830 non-null  float64
 9   lng            12830 non-null  float64
 10  alt            11351 non-null  object 
 11  type           13654 non-null  object 
 12  url            13654 non-null  object 
 13  file           13120 non-null  object 
 14  file-name      13120 non-null  object 
 15  sono           13654 non-null  object 
 16  lic            13654 non-null  object 
 17  q              13654 non-null  object 
 18  length

In [None]:
# Summary statistics; with default arguments only the numeric columns
# (id, lat, lng in the output below) are described.
birds_df.describe()

Unnamed: 0,id,lat,lng
count,13654.0,12830.0,12830.0
mean,310076.60539,22.0433,81.500936
std,164131.360427,7.182187,7.907365
min,18750.0,6.8224,-15.8203
25%,158066.25,15.4346,75.2479
50%,345160.5,25.422133,78.2678
75%,454333.75,27.35115,91.027
max,579836.0,86.2878,97.028


In [None]:
def missing_data(data):
    """Summarise missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and three rows: ``Total`` (number
        of null entries), ``Percent`` (null entries as a percentage of all
        rows) and ``Types`` (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask is the number of rows, since the mask
    # itself never contains nulls.
    null_share = null_counts / null_mask.count() * 100
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    # Transpose so the original columns run across the top.
    return summary.T

# Per-column null count, null percentage and dtype for the recordings frame.
missing_data(birds_df)

Unnamed: 0,id,gen,sp,ssp,en,rec,cnt,loc,lat,lng,...,lic,q,length,time,date,uploaded,also,rmk,bird-seen,playback-used
Total,0,0,0,11919,0,0,0,535,824,824,...,0,0,0,1,0,0,0,8136,0,0
Percent,0.0,0.0,0.0,87.293101,0.0,0.0,0.0,3.918266,6.034862,6.034862,...,0.0,0.0,0.0,0.007324,0.0,0.0,0.0,59.586934,0.0,0.0
Types,int64,object,object,object,object,object,object,object,float64,float64,...,object,object,object,object,object,object,object,object,object,object


In [None]:
def unique_values(data):
    """Summarise how many values and how many distinct values each column has.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and two rows: ``Total`` (non-null
        entries) and ``Uniques`` (distinct non-null values).
    """
    summary = data.count().to_frame(name='Total')
    summary['Uniques'] = [data[name].nunique() for name in data.columns]
    # Transpose so the original columns run across the top.
    return summary.T

In [None]:
# Non-null and distinct-value counts per column of the recordings frame.
unique_values(birds_df)

Unnamed: 0,id,gen,sp,ssp,en,rec,cnt,loc,lat,lng,...,lic,q,length,time,date,uploaded,also,rmk,bird-seen,playback-used
Total,13654,13654,13654,1735,13654,13654,13654,13119,12830,12830,...,13654,13654,13654,13653,13654,13654,13654,5518,13654,13654
Uniques,13654,405,791,493,984,476,1,1995,2527,2504,...,7,6,255,1329,2685,1799,959,4646,3,3


In [None]:
def plot_count(feature, title, df, size=1):
    """Draw a bar chart of the 20 most frequent values of one column.

    Every bar is labelled with the share of *all* rows of ``df`` that carry
    that value, so the labels need not add up to 100% when the column has
    more than 20 distinct values or contains nulls.

    Parameters
    ----------
    feature : str
        Name of the column of ``df`` to count.
    title : str
        Text inserted into the plot title.
    df : pandas.DataFrame
        Frame that holds ``feature``.
    size : int or float, default 1
        Width multiplier: the figure is ``4 * size`` inches wide and 4 inches
        tall. For values above 2 the x tick labels are rotated by 90 degrees
        and drawn at font size 8.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(1, 1, figsize=(4 * size, 4))
    total = float(len(df))
    top_values = df[feature].value_counts().index[:20]
    # Name the column via x=/data= instead of passing the Series positionally:
    # recent seaborn releases treat the first positional argument as `data`,
    # which would no longer line up with `order`. Binding ax= makes the plot
    # land on the axes created above rather than on whatever is "current".
    sns.countplot(x=feature, data=df, order=top_values, palette='Set3', ax=ax)
    ax.set_title("Number and percentage of {}".format(title))
    if size > 2:
        ax.tick_params(axis='x', labelrotation=90, labelsize=8)
    for bar in ax.patches:
        height = bar.get_height()
        # Place the percentage label just above the centre of the bar.
        ax.text(bar.get_x() + bar.get_width() / 2.,
                height + 3,
                '{:1.2f}%'.format(100 * height / total),
                ha="center")
    plt.show()

In [None]:
# Number of recordings (non-null ids) at each distinct (lat, lng) pair.
aggregated_df = (
    birds_df.groupby(["lat", "lng"])["id"]
    .count()
    .reset_index(name="count")
)

In [None]:
# Heatmap of recording counts per coordinate pair for the whole dataset.
# max_val is kept from the original cell; the HeatMap call does not read it.
max_val = max(aggregated_df['count'])
# Base map centred at lat 20, lng 78 with zoom level 5.
m = folium.Map(location=[20, 78], zoom_start=5)
HeatMap(
    data=aggregated_df[['lat', 'lng', 'count']],
    radius=15,
    max_zoom=12,
).add_to(m)
m

In [None]:
# Heatmap of recording counts per coordinate pair, restricted to rows whose
# "gen" value is Phylloscopus.
subset = birds_df[birds_df["gen"] == "Phylloscopus"]
aggregated_df = (
    subset.groupby(["lat", "lng"])["id"]
    .count()
    .reset_index(name="count")
)
# max_val is kept from the original cell; the HeatMap call does not read it.
max_val = max(aggregated_df['count'])
m = folium.Map(location=[20, 78], zoom_start=5)
HeatMap(
    data=aggregated_df[['lat', 'lng', 'count']],
    radius=15,
    max_zoom=12,
).add_to(m)
m

In [None]:
# Heatmap of recording counts per coordinate pair, restricted to rows whose
# "gen" value is Mystery.
subset = birds_df[birds_df["gen"] == "Mystery"]
aggregated_df = (
    subset.groupby(["lat", "lng"])["id"]
    .count()
    .reset_index(name="count")
)
# max_val is kept from the original cell; the HeatMap call does not read it.
max_val = max(aggregated_df['count'])
m = folium.Map(location=[20, 78], zoom_start=5)
HeatMap(
    data=aggregated_df[['lat', 'lng', 'count']],
    radius=15,
    max_zoom=12,
).add_to(m)
m