# 1. Load data

In [35]:
import pandas as pd
import plotly.express as px
import geopandas as gpd

# Bird records used throughout the notebook, read from a CSV on local disk.
# NOTE(review): this is an absolute path on one machine; consider a configurable
# data directory so the notebook can run elsewhere.
all_birds_csv = 'F:\\Python_projects\\state_of_birds_india\\birds_vis\\all_birds.csv'
all_birds = pd.read_csv(all_birds_csv)

#### Load India state boundaries (GeoPackage file)

In [22]:
# Boundary layer read with geopandas; its 'NAME_1' column is compared with
# all_birds['source'] in the cells below.
# NOTE(review): absolute local path; consider a configurable data directory.
ind_adm1_gpkg = 'F:\\Python_projects\\state_of_birds_india\\birds_vis\\IND_adm1.gpkg'
ind_adm1 = gpd.read_file(ind_adm1_gpkg)

In [32]:
# Print the distinct region names from the boundary layer, then from the bird
# data, so their spellings can be compared by eye.
for region_names in (ind_adm1['NAME_1'], all_birds['source']):
    print(set(region_names))

{'Uttaranchal', 'Haryana', 'Orissa', 'Tamil Nadu', 'Gujarat', 'Meghalaya', 'Chandigarh', 'Punjab', 'Uttar Pradesh', 'Delhi', 'Maharashtra', 'Chhattisgarh', 'Mizoram', 'Rajasthan', 'Dadra and Nagar Haveli', 'Lakshadweep', 'Himachal Pradesh', 'Jammu and Kashmir', 'West Bengal', 'Goa', 'Nagaland', 'Assam', 'Tripura', 'Sikkim', 'Andaman and Nicobar', 'Puducherry', 'Jharkhand', 'Telangana', 'Kerala', 'Madhya Pradesh', 'Karnataka', 'Arunachal Pradesh', 'Daman and Diu', 'Andhra Pradesh', 'Bihar', 'Manipur'}
{'Haryana', 'Dadra_and_Nagar_Haveli', 'Bihar', 'Uttar_Pradesh', 'Gujarat', 'Jammu_and_Kashmir', 'Meghalaya', 'Chandigarh', 'Punjab', 'Himachal_Pradesh', 'Uttarakhand', 'Delhi', 'Maharashtra', 'Tamil_Nadu', 'Chhattisgarh', 'Arunachal_Pradesh', 'Mizoram', 'Ladakh', 'Rajasthan', 'Lakshadweep', 'Odisha', 'Goa', 'Nagaland', 'Assam', 'Tripura', 'Sikkim', 'Puducherry', 'Manipur', 'Jharkhand', 'Telangana', 'Kerala', 'Daman_and_Diu', 'West_Bengal', 'Andhra_Pradesh', 'Karnataka', 'Andaman_and_Nicoba

In [34]:
# Compare the region names of the two sources: reduce each column to its set of
# unique values, then take the set difference in both directions.
# NOTE(review): the bird data separates words with '_' while the boundary layer
# uses spaces, so every multi-word name is reported on both sides; normalise the
# separator before joining the two tables.
shapefile_names = set(ind_adm1['NAME_1'])
bird_source_names = set(all_birds['source'])

elements_only_in_ind_adm1 = sorted(shapefile_names - bird_source_names)
elements_only_in_all_birds = sorted(bird_source_names - shapefile_names)

print("Elements only in ind_adm1 (in alphabetical order):", elements_only_in_ind_adm1)
print("Elements only in all_birds (in alphabetical order):", elements_only_in_all_birds)



Elements only in ind_adm1 (in alphabetical order): ['Andaman and Nicobar', 'Andhra Pradesh', 'Arunachal Pradesh', 'Dadra and Nagar Haveli', 'Daman and Diu', 'Himachal Pradesh', 'Jammu and Kashmir', 'Madhya Pradesh', 'Orissa', 'Tamil Nadu', 'Uttar Pradesh', 'Uttaranchal', 'West Bengal']
Elements only in all_birds (in alphabetical order): ['Andaman_and_Nicobar_Islands', 'Andhra_Pradesh', 'Arunachal_Pradesh', 'Dadra_and_Nagar_Haveli', 'Daman_and_Diu', 'Himachal_Pradesh', 'Jammu_and_Kashmir', 'Ladakh', 'Madhya_Pradesh', 'Odisha', 'Tamil_Nadu', 'Uttar_Pradesh', 'Uttarakhand', 'West_Bengal']


In [26]:
# Number of distinct region names on each side (duplicates collapsed via set()).
ind_adm1_state_count = len(set(ind_adm1['NAME_1']))
all_birds_state_count = len(set(all_birds['source']))

print('no of states in ind_adm1= ', ind_adm1_state_count)
print('no of states in all_birds = ', all_birds_state_count)

no of states in ind_adm1=  36
no of states in all_birds =  37


# 2. Plot the number of species found in each state or union territory

In [36]:
# Rows per state/UT, smallest first so the longest bar ends up at the top of the chart.
# size() counts rows directly, so the chart no longer depends on the 'Unnamed: 0'
# column (presumably an index column left in the CSV) and the hover labels read
# 'species_count' instead of 'Unnamed: 0'.
# Assumes one row per species per state -- TODO confirm against the CSV.
all_birds_vis = (
    all_birds.groupby(by='source')
    .size()
    .reset_index(name='species_count')
    .sort_values(by='species_count', ascending=True)
)

fig_type = px.bar(
    all_birds_vis,
    x='species_count',
    y='source',
    title='State-wise bird count',
    color='species_count',
    color_continuous_scale=['red', 'black'],
    text='species_count',
)
# Axis titles, plus a fixed height tall enough to give every state its own bar
fig_type.update_layout(xaxis_title="Species count", yaxis_title='State', height=1000)
# Remove colorbar (bar length and text label already show the value)
fig_type.update_coloraxes(showscale=False)
fig_type.show()

Previous plot function

In [None]:
def top_n(my_df, col, type, n):
    """Show a bar chart of the ``n`` names with the largest total ``duration``.

    Rows of ``my_df`` are filtered to those where ``my_df[col] == type``, the
    ``duration`` values are summed per ``name``, and the ``n`` largest totals
    are plotted with plotly express (duration on x, name on y).

    Parameters
    ----------
    my_df : pandas.DataFrame
        Must contain the columns ``col``, ``'name'`` and ``'duration'``.
    col : str
        Name of the column that holds the category to filter on.
    type : str
        Category value to keep (the original notes mention 'movie' or
        'tv show'). Also used, capitalised, in the chart title and y-axis
        label. The name shadows the builtin ``type`` inside this function;
        it is kept so existing keyword calls keep working.
    n : int
        Number of names to show.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Keep only the rows of the requested category
    of_type = my_df[my_df[col] == type]

    # Sum only 'duration' per name. Summing the whole frame (the previous
    # behaviour) also tries to add up unrelated columns, which raises
    # TypeError on pandas >= 2.0 for non-summable dtypes such as datetimes.
    totals = (
        of_type.groupby('name')['duration']
        .sum()
        .reset_index()
        .sort_values(by='duration', ascending=False)
    )

    # Take the n largest totals, then order ascending for plotting.
    # Rounding is applied after summing so the bar text labels stay short.
    top_n_type = (
        totals.iloc[0:n]
        .sort_values(by='duration', ascending=True)
        .assign(duration=lambda frame: frame['duration'].round(2))
    )

    label = type.capitalize() + 's'
    fig_type = px.bar(
        top_n_type,
        x='duration',
        y='name',
        title='My top ' + str(n) + ' ' + label,
        color='duration',
        color_continuous_scale=['red', 'black'],
        text='duration',
    )
    # Axis titles
    fig_type.update_layout(xaxis_title="No. of hours watched", yaxis_title=label)
    # Remove colorbar
    fig_type.update_coloraxes(showscale=False)
    # Scale the figure height with the number of bars once there are many of them
    if n >= 10:
        fig_type.update_layout(height=40 * n)

    fig_type.show()