In [4]:
# Use the %pip magic rather than !pip: %pip installs into the environment of the
# running kernel, while !pip runs whichever pip executable is first on PATH.
%pip install pandas
import pandas as pd



# Loading the Data

In [5]:
# Column headers, in the order the values appear in each row below.
column_names = [
    "City",
    "Country",
    "Continent",
    "Population (2024)",
    "Population (2023)",
    "Growth Rate",
]

# One tuple per city. The growth-rate figures look like the year-over-year
# population change expressed in percent -- TODO confirm against the data source.
city_rows = [
    ("Tokyo", "Japan", "Asia", 37_393_000, 37_400_000, -0.018),
    ("Delhi", "India", "Asia", 31_870_000, 31_780_000, 0.283),
    ("Shanghai", "China", "Asia", 26_752_000, 26_800_000, -0.179),
    ("São Paulo", "Brazil", "South America", 22_490_000, 22_380_000, 0.491),
    ("New York", "USA", "North America", 18_710_000, 18_620_000, 0.483),
]

# Transpose the rows into a column-oriented mapping of header -> list of values.
data = {
    header: list(values)
    for header, values in zip(column_names, zip(*city_rows))
}

# Build the DataFrame from the column mapping and show it in full.
df = pd.DataFrame(data)
print(df)


        City Country      Continent  Population (2024)  Population (2023)  \
0      Tokyo   Japan           Asia           37393000           37400000   
1      Delhi   India           Asia           31870000           31780000   
2   Shanghai   China           Asia           26752000           26800000   
3  São Paulo  Brazil  South America           22490000           22380000   
4   New York     USA  North America           18710000           18620000   

   Growth Rate  
0       -0.018  
1        0.283  
2       -0.179  
3        0.491  
4        0.483  


# Data Exploration

In [6]:
# Preview the first rows; head() returns at most five, which is the whole frame here.
first_rows = df.head()
print(first_rows)

        City Country      Continent  Population (2024)  Population (2023)  \
0      Tokyo   Japan           Asia           37393000           37400000   
1      Delhi   India           Asia           31870000           31780000   
2   Shanghai   China           Asia           26752000           26800000   
3  São Paulo  Brazil  South America           22490000           22380000   
4   New York     USA  North America           18710000           18620000   

   Growth Rate  
0       -0.018  
1        0.283  
2       -0.179  
3        0.491  
4        0.483  


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = df.describe()
print(summary_stats)

       Population (2024)  Population (2023)  Growth Rate
count       5.000000e+00       5.000000e+00     5.000000
mean        2.744300e+07       2.739600e+07     0.212000
std         7.414105e+06       7.447032e+06     0.300875
min         1.871000e+07       1.862000e+07    -0.179000
25%         2.249000e+07       2.238000e+07    -0.018000
50%         2.675200e+07       2.680000e+07     0.283000
75%         3.187000e+07       3.178000e+07     0.483000
max         3.739300e+07       3.740000e+07     0.491000


In [8]:
# Show the dtype pandas inferred for each column.
column_types = df.dtypes
print(column_types)

City                  object
Country               object
Continent             object
Population (2024)      int64
Population (2023)      int64
Growth Rate          float64
dtype: object


# Cleaning the Data

In [9]:
# Replace the two population headers with short names that contain no spaces
# or parentheses; every later cell refers to these short names.
short_names = {
    "Population (2024)": "Pop_2024",
    "Population (2023)": "Pop_2023",
}
df.rename(columns=short_names, inplace=True)
print(df.columns)

Index(['City', 'Country', 'Continent', 'Pop_2024', 'Pop_2023', 'Growth Rate'], dtype='object')


## Handling Missing Values

In [10]:
# Order matters when combining the two strategies: once every gap has been
# filled, dropna() has nothing left to remove. Filling the whole frame with 0
# would also put the number 0 into text columns such as City or Country.

# Drop rows that are missing a value in any non-numeric column, since 0 is not
# a meaningful placeholder there.
non_numeric_columns = df.select_dtypes(exclude="number").columns.tolist()
df = df.dropna(subset=non_numeric_columns)

# Fill the remaining gaps, which can only be in numeric columns, with 0.
numeric_columns = df.select_dtypes(include="number").columns.tolist()
df = df.fillna({column: 0 for column in numeric_columns})


# Transforming the Data

In [11]:
# Absolute change in population per city: the 2024 figure minus the 2023 figure.
df["Population Difference"] = df["Pop_2024"].sub(df["Pop_2023"])
print(df)

        City Country      Continent  Pop_2024  Pop_2023  Growth Rate  \
0      Tokyo   Japan           Asia  37393000  37400000       -0.018   
1      Delhi   India           Asia  31870000  31780000        0.283   
2   Shanghai   China           Asia  26752000  26800000       -0.179   
3  São Paulo  Brazil  South America  22490000  22380000        0.491   
4   New York     USA  North America  18710000  18620000        0.483   

   Population Difference  
0                  -7000  
1                  90000  
2                 -48000  
3                 110000  
4                  90000  


In [13]:
# Keep only the cities whose growth rate is strictly greater than zero.
is_growing = df["Growth Rate"].gt(0)
positive_growth = df[is_growing]
print(positive_growth)

        City Country      Continent  Pop_2024  Pop_2023  Growth Rate  \
1      Delhi   India           Asia  31870000  31780000        0.283   
3  São Paulo  Brazil  South America  22490000  22380000        0.491   
4   New York     USA  North America  18710000  18620000        0.483   

   Population Difference  
1                  90000  
3                 110000  
4                  90000  


In [14]:
# Order the cities from largest to smallest 2024 population.
# sort_values returns a new frame, so df itself is not reordered.
df_sorted = df.sort_values(by=["Pop_2024"], ascending=[False])
print(df_sorted)

        City Country      Continent  Pop_2024  Pop_2023  Growth Rate  \
0      Tokyo   Japan           Asia  37393000  37400000       -0.018   
1      Delhi   India           Asia  31870000  31780000        0.283   
2   Shanghai   China           Asia  26752000  26800000       -0.179   
3  São Paulo  Brazil  South America  22490000  22380000        0.491   
4   New York     USA  North America  18710000  18620000        0.483   

   Population Difference  
0                  -7000  
1                  90000  
2                 -48000  
3                 110000  
4                  90000  


# Aggregating Data

In [15]:
# Total 2024 population of the listed cities, grouped by continent.
by_continent = df.groupby("Continent")
continent_population = by_continent["Pop_2024"].sum()
print(continent_population)


Continent
Asia             96015000
North America    18710000
South America    22490000
Name: Pop_2024, dtype: int64


## Calculating Multiple Aggregations

In [16]:
# Per-continent summary: total and average 2024 population, plus the average
# and highest growth rate. The growth-rate mean is a plain average over the
# cities in each continent; it is not weighted by population.
summary_spec = {
    "Pop_2024": ["sum", "mean"],
    "Growth Rate": ["mean", "max"],
}
aggregations = df.groupby("Continent").agg(summary_spec)
print(aggregations)


               Pop_2024             Growth Rate       
                    sum        mean        mean    max
Continent                                             
Asia           96015000  32005000.0    0.028667  0.283
North America  18710000  18710000.0    0.483000  0.483
South America  22490000  22490000.0    0.491000  0.491


# Merging and Joining Data

In [17]:
# GDP figure per country for 2024 (the unit is not stated in this notebook).
gdp_data = {
    "Country": ["Japan", "India", "China", "Brazil", "USA"],
    "GDP (2024)": [5000, 3200, 14200, 2400, 23000],
}

gdp_df = pd.DataFrame(gdp_data)

# Attach the GDP figure to each city row by matching on the "Country" column.
# how="left" keeps every city even if its country has no GDP entry (the GDP
# value is then NaN); the default inner join would silently drop such rows.
# validate="many_to_one" raises a MergeError if a country appears more than
# once in gdp_df, which would otherwise silently duplicate city rows.
merged_df = pd.merge(
    df,
    gdp_df,
    on="Country",
    how="left",
    validate="many_to_one",
)
print(merged_df)


        City Country      Continent  Pop_2024  Pop_2023  Growth Rate  \
0      Tokyo   Japan           Asia  37393000  37400000       -0.018   
1      Delhi   India           Asia  31870000  31780000        0.283   
2   Shanghai   China           Asia  26752000  26800000       -0.179   
3  São Paulo  Brazil  South America  22490000  22380000        0.491   
4   New York     USA  North America  18710000  18620000        0.483   

   Population Difference  GDP (2024)  
0                  -7000        5000  
1                  90000        3200  
2                 -48000       14200  
3                 110000        2400  
4                  90000       23000  


# Exporting the Data

In [18]:
# Write the processed city table to a CSV file, leaving out the row index.
# NOTE(review): this exports df, so the GDP column that exists only on
# merged_df is not part of the file -- confirm that is intended.
output_path = "processed_data.csv"
df.to_csv(output_path, index=False)