# Net Migration in India vs. Pakistan

In [1]:
# libraries import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
# Read the India/Pakistan net-migration CSV into a frame and preview its first rows
df = pd.read_csv(filepath_or_buffer='./dataset/data_net_migration_indopak.csv')
df.head(n=5)

Unnamed: 0,Year,Pakistan,India
0,1960,0,52264
1,1961,-133982,114181
2,1962,-131607,110398
3,1963,-129202,48986
4,1964,-126712,5837


In [4]:
# Print the frame's index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Year      64 non-null     int64
 1   Pakistan  64 non-null     int64
 2   India     64 non-null     int64
dtypes: int64(3)
memory usage: 1.6 KB


## Data Span

- We have data from 1960 to 2023, downloaded from the World Bank.
- The data is the net migration (number of people) for India and Pakistan; the rate is derived from it further below.
- We will compare the net migration rate in India and Pakistan.

> **Net migration:** 
> Net Migration is the difference between the number of people immigrating to a location and the number of people emigrating from it over a specific period. It is a measure used to understand population change due to migration.

The formula for Net Migration (NM) is:

$$
\text{Net Migration} = \text{Number of Immigrants} - \text{Number of Emigrants}
$$

Where:

- `Immigrants` are the people entering a region (moving into the area).
- `Emigrants` are the people leaving a region (moving out of the area).

The formula for Net Migration Rate (NMR) is:

$$
\text{Net Migration Rate (per 1,000)} = \frac{\text{Net Migration}}{\text{Total Population}} \times 1,000
$$

We do not have the total population of India and Pakistan in this dataset, so we will use the World Bank API (via the `wbdata` package) to fetch the total population of both countries.

In [None]:
# One-time setup (left disabled): installs the `wbdata` package that the next cells import.
# !pip install wbdata --quiet

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
moviepy 1.0.3 requires decorator<5.0,>=4.0.2, but you have decorator 5.1.1 which is incompatible.


In [12]:
import wbdata
import pandas as pd

# World Bank indicator code -> column name used in this notebook
indicator = {'SP.POP.TOTL': 'total_population'}

# ISO-2 country codes: India and Pakistan
countries = ['IN', 'PK']

# Download the indicator, turn the index levels into ordinary columns,
# and give those columns title-cased names
data = (
    wbdata.get_dataframe(indicator, country=countries)
    .reset_index()
    .rename(columns={'country': 'Country', 'date': 'Year'})
)

# Coerce Year to a numeric dtype so the range filter below compares numbers
data['Year'] = pd.to_numeric(data['Year'])

# Keep the years 1960 through 2023 (both ends included)
data = data[data['Year'].between(1960, 2023)]

# Persist the result next to the other datasets (optional)
data.to_csv("./dataset/population_india_pakistan.csv", index=False)

# Text preview of the first rows
print(data.head())


  Country  Year  total_population
0   India  2023      1.428628e+09
1   India  2022      1.417173e+09
2   India  2021      1.407564e+09
3   India  2020      1.396387e+09
4   India  2019      1.383112e+09


### Now we will combine the two requirements above and build a single dataset using only the World Bank API.

In [13]:
import wbdata
import pandas as pd

# World Bank indicator codes mapped to the column names used in this notebook
indicator = {
    'SP.POP.TOTL': 'total_population',  # total population
    'SM.POP.NETM': 'net_migration',     # net migration
}

# ISO-2 country codes: India and Pakistan
countries = ['IN', 'PK']

# Download both indicators in one call, turn the index levels into ordinary
# columns, and give those columns title-cased names
data = (
    wbdata.get_dataframe(indicator, country=countries)
    .reset_index()
    .rename(columns={'country': 'Country', 'date': 'Year'})
)

# Coerce Year to a numeric dtype so the range filter below compares numbers
data['Year'] = pd.to_numeric(data['Year'])

# Keep the years 1960 through 2023 (both ends included)
data = data[data['Year'].between(1960, 2023)]

# Persist the combined dataset (optional)
data.to_csv("./dataset/pop_net_migration_india_pakistan.csv", index=False)

# Text preview of the first rows
print(data.head())


  Country  Year  total_population  net_migration
0   India  2023      1.428628e+09      -486136.0
1   India  2022      1.417173e+09      -487303.0
2   India  2021      1.407564e+09      -301970.0
3   India  2020      1.396387e+09       -34772.0
4   India  2019      1.383112e+09      -593495.0


In [14]:
# Rebind `df` to a copy of the freshly fetched frame; this replaces the CSV-based `df` from earlier
df = data.copy()

In [15]:
# Print index range, columns, non-null counts and dtypes of the API-based frame
df.info()

<class 'wbdata.client.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country           128 non-null    object 
 1   Year              128 non-null    int64  
 2   total_population  128 non-null    float64
 3   net_migration     128 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 4.1+ KB


In [16]:
import wbdata
import pandas as pd

# World Bank indicator codes mapped to the column names used in this notebook
indicator = {
    'SP.POP.TOTL': 'total_population',  # total population
    'SM.POP.NETM': 'net_migration',     # net migration
}

# ISO-2 country codes: India, Pakistan, Bangladesh, Sri Lanka, Afghanistan
countries = ['IN', 'PK', 'BD', 'LK', 'AF']

# One pipeline: download, flatten the index into columns, rename them,
# make Year numeric, then keep 1960 through 2023 (both ends included)
data = (
    wbdata.get_dataframe(indicator, country=countries)
    .reset_index()
    .rename(columns={'country': 'Country', 'date': 'Year'})
    .assign(Year=lambda frame: pd.to_numeric(frame['Year']))
    .loc[lambda frame: frame['Year'].between(1960, 2023)]
)

# Persist the five-country dataset (optional)
data.to_csv("./dataset/pop_net_migration.csv", index=False)

# Text preview of the first rows
print(data.head())


       Country  Year  total_population  net_migration
0  Afghanistan  2023        42239854.0       -65846.0
1  Afghanistan  2022        41128771.0       -65846.0
2  Afghanistan  2021        40099462.0      -183672.0
3  Afghanistan  2020        38972230.0       166821.0
4  Afghanistan  2019        37769499.0        -8082.0


In [17]:
# Rebind `df` to a copy of the five-country frame fetched above
df = data.copy()
# Print index range, columns, non-null counts and dtypes
df.info()

<class 'wbdata.client.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country           320 non-null    object 
 1   Year              320 non-null    int64  
 2   total_population  320 non-null    float64
 3   net_migration     320 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 10.1+ KB


In [20]:
# Distinct country names present in the frame, in order of first appearance
df['Country'].unique()

array(['Afghanistan', 'Bangladesh', 'India', 'Sri Lanka', 'Pakistan'],
      dtype=object)

In [21]:
# Number of rows per country (a quick check that every country has the same number of years)
df['Country'].value_counts()

Country
Afghanistan    64
Bangladesh     64
India          64
Sri Lanka      64
Pakistan       64
Name: count, dtype: int64

In [22]:
# Preview the first five rows of the working frame
df.head()

Unnamed: 0,Country,Year,total_population,net_migration
0,Afghanistan,2023,42239854.0,-65846.0
1,Afghanistan,2022,41128771.0,-65846.0
2,Afghanistan,2021,40099462.0,-183672.0
3,Afghanistan,2020,38972230.0,166821.0
4,Afghanistan,2019,37769499.0,-8082.0


In [23]:
# Distinct years present in the frame, in order of first appearance
df['Year'].unique()

array([2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013,
       2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002,
       2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991,
       1990, 1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982, 1981, 1980,
       1979, 1978, 1977, 1976, 1975, 1974, 1973, 1972, 1971, 1970, 1969,
       1968, 1967, 1966, 1965, 1964, 1963, 1962, 1961, 1960])

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for the two numeric indicators
df[['total_population', 'net_migration']].describe()

Unnamed: 0,total_population,net_migration
count,320.0,320.0
mean,238142200.0,-174649.8
std,370873500.0,437256.2
min,8622466.0,-2290411.0
25%,18884300.0,-287381.2
50%,72699460.0,-84559.5
75%,182926400.0,-840.5
max,1428628000.0,1834556.0


In [29]:
# Histogram of total_population over all rows of df, using 50 bins
fig = px.histogram(
    data_frame=df,
    x='total_population',
    nbins=50,
    title='Population Distribution',
)
fig.show()

In [28]:
# Histogram of net_migration over all rows of df, using 50 bins.
# The title names the plotted column; it previously read
# 'Population Distribution', copied from the population histogram.
fig = px.histogram(df, x='net_migration',
                   nbins=50,
                   title='Net Migration Distribution')
fig.show()

In [32]:
# Total population per year, one coloured line per country, on an 800x400 canvas
fig = px.line(
    data_frame=df,
    x='Year',
    y='total_population',
    color='Country',
    title='Total Population Over Time',
).update_layout(width=800, height=400)
fig.show()

In [33]:
# Same population-over-time chart, but with a logarithmic y-axis so the
# smaller countries are not flattened by the largest one
fig = px.line(
    data_frame=df,
    x='Year',
    y='total_population',
    color='Country',
    log_y=True,
    title='Total Population Over Time',
).update_layout(width=800, height=400)
fig.show()

In [36]:
# Population over time with one stacked panel (facet row) per country;
# the canvas is made taller (800x1200) to leave room for the panels
fig = px.line(
    data_frame=df,
    x='Year',
    y='total_population',
    color='Country',
    facet_row='Country',
    title='Total Population Over Time',
).update_layout(width=800, height=1200)
fig.show()

In [37]:
from plotly.subplots import make_subplots

# One panel per country on a two-column grid. The number of rows (and the
# figure height) is derived from the number of countries instead of being
# hard-coded, so the cell keeps working if the country list grows.
countries = df['Country'].unique()
n_cols = 2
n_rows = max(1, -(-len(countries) // n_cols))  # ceiling division, at least one row

# Create subplots; axes are shared so every panel uses the same scales
fig = make_subplots(rows=n_rows, cols=n_cols,
                    subplot_titles=list(countries),
                    shared_xaxes=True,
                    shared_yaxes=True)

# Add one trace per country, filling the grid left-to-right, top-to-bottom
for i, country in enumerate(countries):
    row = i // n_cols + 1
    col = i % n_cols + 1
    country_data = df[df['Country'] == country]
    # px.line builds a single-trace figure here; reuse that trace in the grid
    fig.add_trace(px.line(country_data, x='Year', y='total_population').data[0], row=row, col=col)

# Update layout: 400 px per row (1200 px for the original three rows)
fig.update_layout(height=400 * n_rows, width=800, title_text="Total Population Over Time")

# Show plot
fig.show()

In [43]:
# Net migration against total population, one point per row of df, coloured by country
fig = px.scatter(
    data_frame=df,
    x='total_population',
    y='net_migration',
    color='Country',
    title='Population vs. Net Migration',
)
fig.show()

In [None]:
# sort data by year so the animation frames are created in chronological order
df.sort_values('Year', inplace=True)

# Fix both axis ranges to the full extent of the data. The Plotly Express
# animation docs recommend this: without it the axes are sized for the first
# frame and points in later years can end up outside the visible area.
pop_max = df['total_population'].max()
mig_min, mig_max = df['net_migration'].min(), df['net_migration'].max()
mig_pad = 0.05 * (mig_max - mig_min)  # small margin so extreme points are not clipped

# animated scatter plot: one frame per year, marker size scaled by population
fig = px.scatter(df, x='total_population',
                 y='net_migration',
                 color='Country',
                 animation_frame='Year',
                 size='total_population',
                 range_x=[0, 1.05 * pop_max],
                 range_y=[mig_min - mig_pad, mig_max + mig_pad],
                 title='Population vs. Net Migration')
fig.show()

In [48]:
# Order the rows chronologically (in place) before drawing the lines
df.sort_values(by='Year', inplace=True)

# Net migration per year, one coloured line per country
fig = px.line(
    data_frame=df,
    x='Year',
    y='net_migration',
    color='Country',
    title='Net Migration Over Time',
)

# Deliberately the last expression of the cell: update_layout returns the
# figure, so the notebook renders it without an explicit fig.show()
fig.update_layout(width=800, height=400)

In [53]:
# Rank the countries by cumulative net migration: sum the yearly values
# per country over all years, largest total first.
# (A bare `top_countries.head()` used to sit here; as a mid-cell expression
# its result was discarded, so it has been removed.)
top_countries = df.groupby('Country')['net_migration'].sum().sort_values(ascending=False)

# Plot from an explicit two-column frame (Country, net_migration) rather than
# passing the Series together with its own index
fig = px.bar(top_countries.reset_index(),
             x='Country',
             y='net_migration',
             title='Total Net Migration by Country')
fig.show()

In [58]:
# Bar chart of each country's total population in a single snapshot year.
# The year lives in one variable that drives both the row filter and the
# title; previously the filter used 1971 while the comment and title said 2023.
snapshot_year = 2023
fig = px.bar(df[df['Year'] == snapshot_year],
             x='Country',
             y='total_population',
             color='Country',
             title=f'Population of Countries in {snapshot_year}')
fig.show()

In [4]:
import wbdata
import pandas as pd
import plotly.graph_objects as go
from pathlib import Path

# World Bank indicator codes mapped to the column names used below
indicator = {
    'SP.POP.TOTL': 'total_population',  # Total population
    'SM.POP.NETM': 'net_migration'     # Net Migration
}

# Fetch data for all countries (no `country` argument is passed)
data = wbdata.get_dataframe(indicator)
data.reset_index(inplace=True)

# Rename columns for clarity
data.rename(columns={'country': 'Country', 'date': 'Year'}, inplace=True)

# Ensure the 'Year' column is numeric
data['Year'] = pd.to_numeric(data['Year'])

# Filter data between 1960 and 2023
data = data[(data['Year'] >= 1960) & (data['Year'] <= 2023)]

# List of countries and years
countries = data['Country'].unique()
years = sorted(data['Year'].unique())
last_year = years[-1] if years else None  # right edge of every slider range

# Initialize figure
fig = go.Figure()

# Add two traces per country, always in the order (population, migration).
# The dropdown below relies on this layout: traces 2*i and 2*i + 1 belong
# to countries[i].
for country in countries:
    country_data = data[data['Country'] == country]

    # Line for Total Population
    fig.add_trace(go.Scatter(
        x=country_data['Year'],
        y=country_data['total_population'],
        mode='lines',
        name=f'{country} - Total Population',
        visible=False  # Initially hidden
    ))

    # Line for Net Migration
    fig.add_trace(go.Scatter(
        x=country_data['Year'],
        y=country_data['net_migration'],
        mode='lines',
        name=f'{country} - Net Migration',
        visible=False  # Initially hidden
    ))

# Create dropdown options for countries. With method="update" the first args
# entry is applied to the traces and the second to the layout.
n_traces = len(countries) * 2
dropdown_buttons = [
    dict(
        method="update",
        label=country,
        args=[
            # Show only the two traces that belong to this country
            {'visible': [j // 2 == i for j in range(n_traces)]},
            # Nested 'title.text' key rather than a bare string title
            {'title.text': f"Population and Migration Trends for {country}"}
        ]
    )
    for i, country in enumerate(countries)
]

# Add dropdown for selecting countries
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=dropdown_buttons,
            x=0.1,
            y=1.15,
            xanchor='left',
            yanchor='top',
            showactive=True,
            direction='down',
        )
    ]
)

# Add slider that moves the left edge of the x-axis to the chosen year.
# These are layout-only changes, so the steps use method="relayout".
# Previously they used method="update" with the layout keys in the first
# (trace) argument, so moving the slider changed neither range nor title.
fig.update_layout(
    sliders=[
        dict(
            active=0,
            currentvalue={"prefix": "Year: "},
            pad={"t": 50},
            steps=[
                dict(
                    label=str(year),
                    method="relayout",
                    args=[
                        {
                            'xaxis.range': [year, last_year],
                            'title.text': f"Population and Migration Trends (From {year})"
                        }
                    ],
                )
                for year in years
            ],
        )
    ]
)

# Make the first country's data visible by default (its two traces come first)
for j in range(2):
    fig.data[j].visible = True

fig.update_layout(
    title="Population and Migration Trends",
    xaxis_title="Year",
    yaxis_title="Values",
    template="plotly_white"
)

fig.show()

# save into html file; create the output folder first so the write does not
# fail on a fresh checkout where it does not exist yet
output_path = Path('./html_output/population_migration_trends.html')
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(output_path))
