# Data analysis and preparation

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [64]:
source = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
tables = pd.read_html(source, thousands=".", decimal=",")
for i, table in enumerate(tables):
    print(f"\nTable {i}:")
    print(table.head())


Table 0:
                                                   0
0  Largest economies in the world by GDP (nominal...

Table 1:
                                                   0  \
0  > $20 trillion $10–20 trillion $5–10 trillion ...   

                                                   1  \
0  $750 billion – $1 trillion $500–750 billion $2...   

                                                   2  
0  $50–100 billion $25–50 billion $5–25 billion <...  

Table 2:
  Country/Territory   IMF[1][13]            World Bank[14]             \
  Country/Territory     Forecast       Year       Estimate       Year   
0             World  109,529,216       2024    105,435,540       2023   
1     United States   28,781,083       2024     27,360,935       2023   
2             China   18,532,633  [n 1]2024     17,794,782  [n 3]2023   
3           Germany    4,591,100       2024      4,456,081       2023   
4             Japan    4,110,452       2024      4,212,945       2023   

  United Nations

In [65]:
df = tables[2]
df.head()

Unnamed: 0_level_0,Country/Territory,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,109529216,2024,105435540,2023,100834796,2022
1,United States,28781083,2024,27360935,2023,25744100,2022
2,China,18532633,[n 1]2024,17794782,[n 3]2023,17963170,[n 1]2022
3,Germany,4591100,2024,4456081,2023,4076923,2022
4,Japan,4110452,2024,4212945,2023,4232173,2022


In [66]:
print(df.shape)
print(df.dtypes)

(210, 7)
Country/Territory   Country/Territory    object
IMF[1][13]          Forecast             object
                    Year                 object
World Bank[14]      Estimate             object
                    Year                 object
United Nations[15]  Estimate             object
                    Year                 object
dtype: object


In [67]:
df = df.replace(r'\[.*?\]', '', regex=True)
df.head(10)

Unnamed: 0_level_0,Country/Territory,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,109529216,2024,105435540,2023,100834796,2022
1,United States,28781083,2024,27360935,2023,25744100,2022
2,China,18532633,2024,17794782,2023,17963170,2022
3,Germany,4591100,2024,4456081,2023,4076923,2022
4,Japan,4110452,2024,4212945,2023,4232173,2022
5,India,3937011,2024,3549919,2023,3465541,2022
6,United Kingdom,3495261,2024,3340032,2023,3089072,2022
7,France,3130014,2024,3030904,2023,2775316,2022
8,Brazil,2331391,2024,2173666,2023,1920095,2022
9,Italy,2328028,2024,2254851,2023,2046952,2022


In [68]:
df = df.drop(df.columns[[2,4,6]], axis=1)
df.head()

Unnamed: 0_level_0,Country/Territory,IMF[1][13],World Bank[14],United Nations[15]
Unnamed: 0_level_1,Country/Territory,Forecast,Estimate,Estimate
0,World,109529216,105435540,100834796
1,United States,28781083,27360935,25744100
2,China,18532633,17794782,17963170
3,Germany,4591100,4456081,4076923
4,Japan,4110452,4212945,4232173


In [69]:
columns = ['country', 'imf_2024', 'world_bank_2023', 'united_nations_2022']
df.columns = columns
df.head()

Unnamed: 0,country,imf_2024,world_bank_2023,united_nations_2022
0,World,109529216,105435540,100834796
1,United States,28781083,27360935,25744100
2,China,18532633,17794782,17963170
3,Germany,4591100,4456081,4076923
4,Japan,4110452,4212945,4232173


In [70]:
# df = df.apply(lambda x: x.str.replace(',', '').astype(float) if x.dtype == 'object' and x.str.contains(r'^\d{1,3}(,\d{3})*$').any() else x)
# df['imf_2024'] = df['imf_2024'].str.replace(',', '').astype(float)
# df['world_bank_2023'] = df['world_bank_2023'].str.replace(',', '').astype(float)
# df['united_nations_2022'] = df['united_nations_2022'].str.replace(',', '').astype(float)
def clean_and_convert(value):
    if isinstance(value, str):
        try:
            return float(value.replace(',', ''))
        except ValueError:
            return value  # если не удалось преобразовать, возвращаем исходное значение
    return value

# Применение функции к каждому элементу DataFrame
df = df.applymap(clean_and_convert)

print(df.dtypes)
df.head()

country                object
imf_2024               object
world_bank_2023        object
united_nations_2022    object
dtype: object


  df = df.applymap(clean_and_convert)


Unnamed: 0,country,imf_2024,world_bank_2023,united_nations_2022
0,World,109529216.0,105435540.0,100834796.0
1,United States,28781083.0,27360935.0,25744100.0
2,China,18532633.0,17794782.0,17963170.0
3,Germany,4591100.0,4456081.0,4076923.0
4,Japan,4110452.0,4212945.0,4232173.0


In [75]:
df = df.replace('—', np.nan)

  df = df.replace('—', np.nan)


In [76]:
print(df.isnull().sum())
print(df.dtypes)

country                 0
imf_2024               15
world_bank_2023         8
united_nations_2022     1
dtype: int64
country                 object
imf_2024               float64
world_bank_2023        float64
united_nations_2022    float64
dtype: object


In [77]:
df.loc[df['country'] == 'Taiwan']

Unnamed: 0,country,imf_2024,world_bank_2023,united_nations_2022
22,Taiwan,802.958,,


In [53]:
missing_values = df[df['united_nations_2022'].isnull()]

print(missing_values)

Empty DataFrame
Columns: [country, imf_2024, world_bank_2023, united_nations_2022]
Index: []


In [78]:
df.imf_2024 = df.imf_2024.fillna(df.imf_2024.mean())
df.world_bank_2023 = df.world_bank_2023.fillna(df.world_bank_2023.mean())
df.united_nations_2022 = df.united_nations_2022.fillna(df.united_nations_2022.mean())
print(df.isnull().sum())

country                0
imf_2024               0
world_bank_2023        0
united_nations_2022    0
dtype: int64


# Building a model and choosing the best one