# 

# India Vs. Rest of the World

### Installing Libraries

In [1]:
!pip install pycountry_convert --quiet
!pip install folium -U --quiet



In [2]:
import pandas as pd

import matplotlib.pyplot as plt
import folium
import json

import requests
import requests
import urllib.parse
import pycountry_convert as pc
import altair as alt
from collections import OrderedDict
from folium import plugins
from folium import plugins
from tqdm import tqdm
from folium.plugins import HeatMap, MarkerCluster


In [3]:
# low_memory=False makes pandas infer every column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-dtype warnings on load.
df = pd.read_csv(
    "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv",
    low_memory=False,
)
# Discard the first data row.
# NOTE(review): presumably this row holds the question text rather than a
# response -- confirm against the CSV.
df = df.iloc[1:]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
"""
    Args: country_name 
    Return: Name of the continent the country belongs to
"""

def country_to_continent(country_name):
    """Resolve the continent a country belongs to.

    The lookup goes through ``pycountry_convert`` in three hops:
    country name -> ISO alpha-2 code -> continent code -> continent name.

    Args:
        country_name: Name of the country to look up.

    Returns:
        Name of the continent the country belongs to.
    """
    alpha2_code = pc.country_name_to_country_alpha2(country_name)
    continent_code = pc.country_alpha2_to_continent_code(alpha2_code)
    return pc.convert_continent_code_to_continent_name(continent_code)

In [5]:
"""
    Args : Country Name
    Return : Tuple of Latitude and Longitude of the country
"""

def get_coordinates(country_name):
    """Geocode a country name with the OpenStreetMap Nominatim search API.

    Args:
        country_name: Free-text country name to search for.

    Returns:
        ``(latitude, longitude)`` of the first search hit as floats, or
        ``None`` when the search returns no match.

    Raises:
        requests.RequestException: on a network failure, a timeout or an
            HTTP error status.
    """
    response = requests.get(
        # Query-string form of the endpoint; ``requests`` URL-encodes the
        # parameters, so no manual quoting of the country name is needed.
        'https://nominatim.openstreetmap.org/search',
        params={'q': country_name, 'format': 'json', 'limit': 1},
        # Nominatim's usage policy asks clients to identify themselves.
        headers={'User-Agent': 'india-vs-rest-of-the-world-notebook'},
        # Never let a stalled connection hang the notebook indefinitely.
        timeout=10,
    )
    response.raise_for_status()
    matches = response.json()
    if not matches:
        return None
    return float(matches[0]["lat"]), float(matches[0]["lon"])

In [6]:
"""
    Args: df :- A dataframe object of our original data
    Returns: Dataframe with 3 more columns containing 
            Latitude, loongitude and Continent of the 
            Country.
"""


def geotag_countries(df):
    """Add continent, latitude and longitude columns to the survey data.

    Args:
        df: Survey dataframe holding the respondent's country in column ``Q3``.

    Returns:
        A new dataframe with three extra columns -- ``continent``,
        ``latitude`` and ``longitude`` -- looked up per country. Rows whose
        country cannot be mapped to a continent, or cannot be geocoded, are
        dropped. The dataframe passed in is not modified.
    """
    # Survey label -> name used for the continent / coordinate lookups.
    aliases = {'Hong Kong (S.A.R.)': 'Hong Kong'}

    continent_dict = {}
    lat_dict = {}
    long_dict = {}
    skipped = []  # survey labels that could not be geotagged

    for label in df['Q3'].dropna().unique():
        country = aliases.get(label, label)
        try:
            continent = country_to_continent(country)
        except KeyError:
            # pycountry_convert raises KeyError for names it does not know
            # (e.g. free-text answers that are not a country).
            skipped.append(label)
            continue
        coordinates = get_coordinates(country)
        if coordinates is None:
            # No geocoding hit: treat like any other untaggable country
            # instead of failing on tuple unpacking.
            skipped.append(label)
            continue
        continent_dict[label] = continent
        lat_dict[label], long_dict[label] = coordinates

    # Filter once, by the original survey label, and copy so the new columns
    # are not written onto a slice of (or directly onto) the caller's frame.
    tagged = df[~df['Q3'].isin(skipped)].copy()
    tagged['continent'] = tagged['Q3'].map(continent_dict)
    tagged['latitude'] = tagged['Q3'].map(lat_dict)
    tagged['longitude'] = tagged['Q3'].map(long_dict)
    return tagged


df = geotag_countries(df)
#df.head()

In [7]:
"""
    Args:
        x: Value to plot on x-axis
        y: Value to plot on y-axis
        x_name: X-axis label
        title : Chart title
    Returns:
        An interative Bar Chart
"""
def get_bar_plot(x, y, x_name, title, hue= None):
    """Build an interactive Altair bar chart of counts per category.

    Args:
        x: Category values for the x-axis.
        y: Count for each category, plotted on the y-axis.
        x_name: Column / axis name used for the categories.
        title: Chart title.
        hue: Accepted but not used by this function.

    Returns:
        An interactive red bar chart, 250x200, with bars ordered by
        descending count.
    """
    counts_df = pd.DataFrame({x_name: x, 'count': y}).sort_values(
        'count', ascending=False
    )
    bars = alt.Chart(counts_df, title=title).mark_bar(opacity=0.6, color='red')
    encoded = bars.encode(
        x=alt.X(f'{x_name}:N', sort='-y'),
        y='count:Q',
    )
    return encoded.properties(width=250, height=200).interactive()


## Gender Stats of Data Science Community for each Country

In [8]:
"""
    Args:
        df: Geotagged Dataset
        interested_column: Column of Dataframe of which we to plot
        plot_func: Plotting Function
        Title: Title of the chart
        icon_name: Name of the icon to be used in Folium Popup
        icon_color: Color of the Popup Icon
    Returns:
        It Return  a MarkerCluster Map with several label on that map. 
        On clicking those labels one can find some Interactive Plot. One
        Can zoom in and zoom out in that map.
        
"""

def plot_interative_map(df, interested_column, plot_func, title, icon_name='info-sign', icon_color='red', x_name='gender'):
    """Draw a clustered world map with one chart popup per country.

    Args:
        df: Geotagged dataframe; must provide the ``Q3`` (country),
            ``latitude`` and ``longitude`` columns.
        interested_column: Column whose value counts are plotted per country.
        plot_func: Chart factory, called as
            ``plot_func(categories, counts, x_name, chart_title)``.
        title: Suffix appended to the country name to form each chart title.
        icon_name: Name of the marker icon (Font Awesome prefix ``fa``).
        icon_color: Colour of the marker icon.
        x_name: Field name handed to ``plot_func`` for the categories.
            Defaults to ``'gender'``, the value that used to be hard-coded.

    Returns:
        The ``folium.Map`` holding a ``MarkerCluster`` with one marker per
        country; clicking a marker opens that country's chart.
    """
    world_map = folium.Map([20, 78], zoom_start=3, tiles='Cartodb dark_matter')
    cluster = MarkerCluster()

    # dropna(): a missing country matches no rows, so it has no coordinates.
    for country in df.Q3.dropna().unique():
        country_df = df[df['Q3'] == country]
        counts = country_df[interested_column].value_counts()
        # Named ``chart`` (not ``plt``) to avoid shadowing the matplotlib alias.
        chart = plot_func(list(counts.index), list(counts.values), x_name, f'{country}{title}')

        popup = folium.Popup(max_width=850)
        folium.features.VegaLite(chart, height=250, width=380).add_to(popup)
        icon = folium.Icon(icon=icon_name, color=icon_color, prefix='fa')
        location = [country_df.latitude.unique()[0], country_df.longitude.unique()[0]]
        cluster.add_child(folium.Marker(location, popup=popup, tooltip="Click Me!", icon=icon))

    world_map.add_child(cluster)
    return world_map


plot_interative_map(df, interested_column = 'Q2',plot_func = get_bar_plot,title ='\'s Data Sc. Gender Stats', icon_name ="venus-mars", icon_color = 'red' )

## Stacked Bar plot and scatter Plot

In this part we have two plots 
- a scatter Plot between country and the Number of the people of each gender
- Stacked Histogram of Age group and the Number of people in each group.

Both plots are interactive. You can drag a selector over the countries of interest on the scatter plot, and the histogram will show the age distribution of those countries, broken down by gender.

In [9]:
"""
    This Part Plot the 
"""

# Linked views: a stacked age histogram placed above a country-vs-count
# scatter plot. Both charts carry the same two selections.
alt.data_transformers.disable_max_rows()

# Keep the three listed gender answers and only the age/gender/country columns.
temp = df.loc[df['Q2'].isin(['Man', 'Woman', 'Nonbinary']), ['Q1', 'Q2', 'Q3']]
age_df = temp.rename(columns={'Q1': 'Age', 'Q2': 'Gender', 'Q3': 'Country'})

selector = alt.selection_single(empty='all', fields=['Age'])
brush = alt.selection(type='interval')
color_scale = alt.Scale(domain=age_df.Gender.unique())

# Shared base so both charts register the single-value and interval selections.
base = (
    alt.Chart(age_df)
    .properties(width=650, height=150)
    .add_selection(selector)
    .add_selection(brush)
)

points = (
    base.mark_point(filled=True)
    .encode(
        x=alt.X('Country'),
        y=alt.Y('count()'),
        color=alt.condition(
            selector,
            'Gender:N',
            alt.value('lightgray'),
            scale=color_scale,
        ),
        tooltip=['Country', 'Gender', 'count(Gender)'],
    )
    .interactive()
)

hists = (
    base.mark_bar(opacity=0.5, thickness=100)
    .encode(
        x=alt.X('Age'),
        y=alt.Y('count(Age)', stack=True),
        color=alt.Color('Gender:N', scale=color_scale),
    )
    .transform_filter(selector)
    .transform_filter(brush)
    .interactive()
)

hists & points

## Scatter plot and Stacked Bar plot
Here one will find the following two plots:
- Scatter Plot between country and the Number of the people of each gender
- Stacked Histogram Showing the distribution of the gender in countries
Here you can drag a selector over the scatter plot to see the corresponding gender distribution for the selected countries.

In [10]:
# Interval brush drawn on the scatter plot; the bar chart is filtered by it.
brush = alt.selection(type='interval')

points = (
    alt.Chart(age_df)
    .properties(width=550, height=250)
    .mark_point()
    .encode(
        x='Country',
        y='count()',
        # Points outside the brushed region are greyed out.
        color=alt.condition(brush, 'Gender:N', alt.value('lightgray')),
    )
    .add_selection(brush)
    .interactive()
)

bars = (
    alt.Chart(age_df)
    .mark_bar()
    .encode(
        x='count(Country):Q',
        y='Country:N',
        color='Gender:N',
    )
    .transform_filter(brush)
)

points | bars

## Compare India's Data Science Community with rest of the world
In this section I have plotted 4 different folium maps. On each map you will find several labels. Clicking on a label shows two plots comparing the following aspect of India and that country:
 - The first map shows Gender Comparison
 - The second map shows Education Comparison
 - The third map shows Profession Comparison
 - The final map shows Years of Experience Comparison
The complete map, including the plots, is interactive, which means one can zoom in and zoom out in those plots and maps.

In [11]:
def _value_count_bar(grouped_df, country, column, xtitle, ytitle, color):
    """Bar chart of how often each value of ``column`` occurs for ``country``."""
    # Name both output columns explicitly: the default names produced by
    # ``value_counts().reset_index()`` differ between pandas 1.x and 2.x.
    counts = (
        grouped_df.get_group(country)[column]
        .value_counts()
        .rename_axis('index')
        .reset_index(name='count')
    )
    return (
        alt.Chart(counts, title=country)
        .mark_bar(opacity=0.6, color=color)
        .encode(
            x=alt.X('index:O', sort='-y', axis=alt.Axis(title=xtitle)),
            y=alt.Y('count:Q', axis=alt.Axis(title=ytitle)),
        )
        .properties(width=150, height=150)
        .interactive()
    )


def plot_degree_compare(grouped_df, con1, con2, xtitle, ytitle, column):
    """Plot the value counts of one column for two countries side by side.

    Args:
        grouped_df: Dataframe grouped by country; must contain the groups
            ``con1`` and ``con2``.
        con1: First country, drawn in red on the left.
        con2: Second country, drawn in blue on the right.
        xtitle: X-axis title of both charts.
        ytitle: Y-axis title of both charts.
        column: Column whose value counts are plotted.

    Returns:
        The two interactive bar charts concatenated horizontally.

    Raises:
        KeyError: if ``con1`` or ``con2`` is not a group of ``grouped_df``.
    """
    alt.data_transformers.disable_max_rows()
    first_chart = _value_count_bar(grouped_df, con1, column, xtitle, ytitle, 'red')
    second_chart = _value_count_bar(grouped_df, con2, column, xtitle, ytitle, 'blue')
    return first_chart | second_chart
#plot_degree_compare(grouped_df, 'India', 'China', 'Degree', 'Counts')

In [12]:

def compare_india_with_other(df, interested_column, plot_func, tooltip, filename, icon_name='info-sign', icon_color='red'):
    """Build and save a world map comparing India with every other country.

    Args:
        df: Geotagged dataframe; must provide the ``Q3`` (country),
            ``latitude`` and ``longitude`` columns.
        interested_column: Column to compare between the two countries.
        plot_func: Chart factory, called as ``plot_func(grouped_df, 'India',
            country, tooltip, 'Counts', interested_column)`` where
            ``grouped_df`` holds only the rows of India and ``country``,
            grouped by ``Q3``.
        tooltip: Name of the compared aspect; used as the x-axis title and in
            each marker's hover text.
        filename: Path the rendered HTML map is saved to.
        icon_name: Name of the marker icon (Font Awesome prefix ``fa``).
        icon_color: Colour of the marker icon.

    Returns:
        The ``folium.Map`` that was saved; it has one marker per compared
        country.
    """
    world_map = folium.Map([20, 78], zoom_start=3, tiles='Cartodb dark_matter')
    cluster = MarkerCluster()
    is_india = df['Q3'] == "India"  # loop-invariant, so computed once

    # dropna(): a missing country has no rows and therefore no group to plot.
    for country in tqdm(df.Q3.dropna().unique()):
        # NOTE(review): "Czech Republic" is skipped without a stated reason --
        # confirm whether it should still be excluded.
        if country == 'India' or country == "Czech Republic":
            continue

        is_country = df['Q3'] == country
        country_df = df[is_country]
        grouped_df = df[is_india | is_country].groupby('Q3')

        # Use the injected plotting function instead of a hard-wired one.
        chart = plot_func(grouped_df, 'India', country, tooltip, 'Counts', interested_column)

        popup = folium.Popup(max_width=850)
        folium.features.VegaLite(chart, height=150, width=450).add_to(popup)
        icon = folium.Icon(icon=icon_name, color=icon_color, prefix='fa')
        location = [country_df.latitude.unique()[0], country_df.longitude.unique()[0]]
        cluster.add_child(
            folium.Marker(
                location,
                popup=popup,
                tooltip=f"Compare {tooltip} of {country} with India",
                icon=icon,
            )
        )

    world_map.add_child(cluster)
    world_map.save(filename)
    return world_map
#compare_india_with_other(df, interested_column = 'Q6',plot_func = plot_degree_compare,tooltip = "Years of Experience" ,filename = "Experience_Comparision.html", icon_name ="graduation-cap", icon_color = 'red' )

## Gender Comparison in Data Science Community (India Vs. Rest of the World)

In [13]:
# Q2 (gender): India versus every other country.
compare_india_with_other(
    df,
    interested_column='Q2',
    plot_func=plot_degree_compare,
    tooltip="Gender Stats",
    filename="Gender_Comparision.html",
    icon_name="venus-mars",
    icon_color='red',
)

100%|██████████| 63/63 [00:06<00:00,  9.53it/s]


## Education Comparison in Data Science Community (India Vs. Rest of the World)

In [14]:
# Q4 (degree): India versus every other country.
compare_india_with_other(
    df,
    interested_column='Q4',
    plot_func=plot_degree_compare,
    tooltip="Degree",
    filename="Degree_Comparision.html",
    icon_name="graduation-cap",
    icon_color='red',
)

100%|██████████| 63/63 [00:06<00:00,  9.58it/s]


## Profession Comparison in Data Science Community (India Vs. Rest of the World)

In [15]:
# Q5 (profession): India versus every other country.
compare_india_with_other(
    df,
    interested_column='Q5',
    plot_func=plot_degree_compare,
    tooltip="Profession",
    filename="Profession_Comparision.html",
    icon_name="user-tie",
    icon_color='red',
)

100%|██████████| 63/63 [00:06<00:00,  9.57it/s]


## Years of Experience Comparison in Data Science Community (India Vs. Rest of the World)

In [16]:
# Q6 (years of experience): India versus every other country.
compare_india_with_other(
    df,
    interested_column='Q6',
    plot_func=plot_degree_compare,
    tooltip="Years of Experience",
    filename="Experience_Comparision.html",
    icon_name="clock",
    icon_color='red',
)

100%|██████████| 63/63 [00:06<00:00,  9.57it/s]
