# Анализ данных мирового отчёта о счастье с применением методов машинного обучения

### Импорт нужных библиотек и модулей

Поскольку проект выполняется в Google Colab, необходимо установить CatBoost

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Импортируем необходимые модули и библиотеки

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot

from sklearn.model_selection import train_test_split 

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from xgboost import XGBRegressor
from catboost import CatBoostRegressor



В следующей ячейке уберём предупреждения

In [None]:
import warnings
warnings.filterwarnings('ignore')

### Загрузка и изучение данных

Данные взяты с сайта [Kaggle](https://www.kaggle.com/datasets/mathurinache/world-happiness-report-2022).

---

Для начала загрузим их и сохраним в соответсвующие переменные



In [None]:
df22 = pd.read_csv('2022.csv')
df21 = pd.read_csv('2021.csv')
df20 = pd.read_csv('2020.csv')
df19 = pd.read_csv('2019.csv')
df18 = pd.read_csv('2018.csv')
df17 = pd.read_csv('2017.csv')
df16 = pd.read_csv('2016.csv')
df15 = pd.read_csv('2015.csv')

Составим словарь, в котором значениями будут датасеты, а ключами соотвествующие им года

In [None]:
dict_df = {
    '2015': df15,
    '2016': df16,
    '2017': df17,
    '2018': df18,
    '2019': df19,
    '2020': df20,
    '2021': df21,
    '2022': df22
}

In [None]:
df15.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [None]:
df15['Country'].unique()

array(['Switzerland', 'Iceland', 'Denmark', 'Norway', 'Canada', 'Finland',
       'Netherlands', 'Sweden', 'New Zealand', 'Australia', 'Israel',
       'Costa Rica', 'Austria', 'Mexico', 'United States', 'Brazil',
       'Luxembourg', 'Ireland', 'Belgium', 'United Arab Emirates',
       'United Kingdom', 'Oman', 'Venezuela', 'Singapore', 'Panama',
       'Germany', 'Chile', 'Qatar', 'France', 'Argentina',
       'Czech Republic', 'Uruguay', 'Colombia', 'Thailand',
       'Saudi Arabia', 'Spain', 'Malta', 'Taiwan', 'Kuwait', 'Suriname',
       'Trinidad and Tobago', 'El Salvador', 'Guatemala', 'Uzbekistan',
       'Slovakia', 'Japan', 'South Korea', 'Ecuador', 'Bahrain', 'Italy',
       'Bolivia', 'Moldova', 'Paraguay', 'Kazakhstan', 'Slovenia',
       'Lithuania', 'Nicaragua', 'Peru', 'Belarus', 'Poland', 'Malaysia',
       'Croatia', 'Libya', 'Russia', 'Jamaica', 'North Cyprus', 'Cyprus',
       'Algeria', 'Kosovo', 'Turkmenistan', 'Mauritius', 'Hong Kong',
       'Estonia', 'Indonesi

In [None]:
df15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

In [None]:
df16.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


In [None]:
df16.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      157 non-null    float64
 5   Upper Confidence Interval      157 non-null    float64
 6   Economy (GDP per Capita)       157 non-null    float64
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       157 non-null    float64
 9   Freedom                        157 non-null    float64
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    float64
 12  Dystopia Residual              157 non-null    flo

In [None]:
df16['Country'].unique()

array(['Denmark', 'Switzerland', 'Iceland', 'Norway', 'Finland', 'Canada',
       'Netherlands', 'New Zealand', 'Australia', 'Sweden', 'Israel',
       'Austria', 'United States', 'Costa Rica', 'Puerto Rico', 'Germany',
       'Brazil', 'Belgium', 'Ireland', 'Luxembourg', 'Mexico',
       'Singapore', 'United Kingdom', 'Chile', 'Panama', 'Argentina',
       'Czech Republic', 'United Arab Emirates', 'Uruguay', 'Malta',
       'Colombia', 'France', 'Thailand', 'Saudi Arabia', 'Taiwan',
       'Qatar', 'Spain', 'Algeria', 'Guatemala', 'Suriname', 'Kuwait',
       'Bahrain', 'Trinidad and Tobago', 'Venezuela', 'Slovakia',
       'El Salvador', 'Malaysia', 'Nicaragua', 'Uzbekistan', 'Italy',
       'Ecuador', 'Belize', 'Japan', 'Kazakhstan', 'Moldova', 'Russia',
       'Poland', 'South Korea', 'Bolivia', 'Lithuania', 'Belarus',
       'North Cyprus', 'Slovenia', 'Peru', 'Turkmenistan', 'Mauritius',
       'Libya', 'Latvia', 'Cyprus', 'Paraguay', 'Romania', 'Estonia',
       'Jamaica', 'Croa

In [None]:
df17.head()

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [None]:
df17.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 

In [None]:
df17['Country'].unique()

array(['Norway', 'Denmark', 'Iceland', 'Switzerland', 'Finland',
       'Netherlands', 'Canada', 'New Zealand', 'Sweden', 'Australia',
       'Israel', 'Costa Rica', 'Austria', 'United States', 'Ireland',
       'Germany', 'Belgium', 'Luxembourg', 'United Kingdom', 'Chile',
       'United Arab Emirates', 'Brazil', 'Czech Republic', 'Argentina',
       'Mexico', 'Singapore', 'Malta', 'Uruguay', 'Guatemala', 'Panama',
       'France', 'Thailand', 'Taiwan Province of China', 'Spain', 'Qatar',
       'Colombia', 'Saudi Arabia', 'Trinidad and Tobago', 'Kuwait',
       'Slovakia', 'Bahrain', 'Malaysia', 'Nicaragua', 'Ecuador',
       'El Salvador', 'Poland', 'Uzbekistan', 'Italy', 'Russia', 'Belize',
       'Japan', 'Lithuania', 'Algeria', 'Latvia', 'South Korea',
       'Moldova', 'Romania', 'Bolivia', 'Turkmenistan', 'Kazakhstan',
       'North Cyprus', 'Slovenia', 'Peru', 'Mauritius', 'Cyprus',
       'Estonia', 'Belarus', 'Libya', 'Turkey', 'Paraguay',
       'Hong Kong S.A.R., China', '

In [None]:
df18.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


In [None]:
df18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     155 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [None]:
df18['Country or region'].unique()

array(['Finland', 'Norway', 'Denmark', 'Iceland', 'Switzerland',
       'Netherlands', 'Canada', 'New Zealand', 'Sweden', 'Australia',
       'United Kingdom', 'Austria', 'Costa Rica', 'Ireland', 'Germany',
       'Belgium', 'Luxembourg', 'United States', 'Israel',
       'United Arab Emirates', 'Czech Republic', 'Malta', 'France',
       'Mexico', 'Chile', 'Taiwan', 'Panama', 'Brazil', 'Argentina',
       'Guatemala', 'Uruguay', 'Qatar', 'Saudi Arabia', 'Singapore',
       'Malaysia', 'Spain', 'Colombia', 'Trinidad & Tobago', 'Slovakia',
       'El Salvador', 'Nicaragua', 'Poland', 'Bahrain', 'Uzbekistan',
       'Kuwait', 'Thailand', 'Italy', 'Ecuador', 'Belize', 'Lithuania',
       'Slovenia', 'Romania', 'Latvia', 'Japan', 'Mauritius', 'Jamaica',
       'South Korea', 'Northern Cyprus', 'Russia', 'Kazakhstan', 'Cyprus',
       'Bolivia', 'Estonia', 'Paraguay', 'Peru', 'Kosovo', 'Moldova',
       'Turkmenistan', 'Hungary', 'Libya', 'Philippines', 'Honduras',
       'Belarus', 'Turkey

In [None]:
df19.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [None]:
df19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [None]:
df19['Country or region'].unique()

array(['Finland', 'Denmark', 'Norway', 'Iceland', 'Netherlands',
       'Switzerland', 'Sweden', 'New Zealand', 'Canada', 'Austria',
       'Australia', 'Costa Rica', 'Israel', 'Luxembourg',
       'United Kingdom', 'Ireland', 'Germany', 'Belgium', 'United States',
       'Czech Republic', 'United Arab Emirates', 'Malta', 'Mexico',
       'France', 'Taiwan', 'Chile', 'Guatemala', 'Saudi Arabia', 'Qatar',
       'Spain', 'Panama', 'Brazil', 'Uruguay', 'Singapore', 'El Salvador',
       'Italy', 'Bahrain', 'Slovakia', 'Trinidad & Tobago', 'Poland',
       'Uzbekistan', 'Lithuania', 'Colombia', 'Slovenia', 'Nicaragua',
       'Kosovo', 'Argentina', 'Romania', 'Cyprus', 'Ecuador', 'Kuwait',
       'Thailand', 'Latvia', 'South Korea', 'Estonia', 'Jamaica',
       'Mauritius', 'Japan', 'Honduras', 'Kazakhstan', 'Bolivia',
       'Hungary', 'Paraguay', 'Northern Cyprus', 'Peru', 'Portugal',
       'Pakistan', 'Russia', 'Philippines', 'Serbia', 'Moldova', 'Libya',
       'Montenegro', 'Tajikis

In [None]:
df20.head()

Unnamed: 0,Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,Western Europe,7.8087,0.031156,7.869766,7.747634,10.639267,0.95433,71.900825,0.949172,-0.059482,0.195445,1.972317,1.28519,1.499526,0.961271,0.662317,0.15967,0.477857,2.762835
1,Denmark,Western Europe,7.6456,0.033492,7.711245,7.579955,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489,1.972317,1.326949,1.503449,0.979333,0.66504,0.242793,0.49526,2.432741
2,Switzerland,Western Europe,7.5599,0.035014,7.628528,7.491272,10.979933,0.942847,74.102448,0.921337,0.105911,0.303728,1.972317,1.390774,1.472403,1.040533,0.628954,0.269056,0.407946,2.350267
3,Iceland,Western Europe,7.5045,0.059616,7.621347,7.387653,10.772559,0.97467,73.0,0.948892,0.246944,0.71171,1.972317,1.326502,1.547567,1.000843,0.661981,0.36233,0.144541,2.460688
4,Norway,Western Europe,7.488,0.034837,7.556281,7.419719,11.087804,0.952487,73.200783,0.95575,0.134533,0.263218,1.972317,1.424207,1.495173,1.008072,0.670201,0.287985,0.434101,2.168266


In [None]:
df20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                153 non-null    object 
 1   Regional indicator                          153 non-null    object 
 2   Ladder score                                153 non-null    float64
 3   Standard error of ladder score              153 non-null    float64
 4   upperwhisker                                153 non-null    float64
 5   lowerwhisker                                153 non-null    float64
 6   Logged GDP per capita                       153 non-null    float64
 7   Social support                              153 non-null    float64
 8   Healthy life expectancy                     153 non-null    float64
 9   Freedom to make life choices                153 non-null    float64
 10  Generosity    

In [None]:
df20['Country name'].unique()

array(['Finland', 'Denmark', 'Switzerland', 'Iceland', 'Norway',
       'Netherlands', 'Sweden', 'New Zealand', 'Austria', 'Luxembourg',
       'Canada', 'Australia', 'United Kingdom', 'Israel', 'Costa Rica',
       'Ireland', 'Germany', 'United States', 'Czech Republic', 'Belgium',
       'United Arab Emirates', 'Malta', 'France', 'Mexico',
       'Taiwan Province of China', 'Uruguay', 'Saudi Arabia', 'Spain',
       'Guatemala', 'Italy', 'Singapore', 'Brazil', 'Slovenia',
       'El Salvador', 'Kosovo', 'Panama', 'Slovakia', 'Uzbekistan',
       'Chile', 'Bahrain', 'Lithuania', 'Trinidad and Tobago', 'Poland',
       'Colombia', 'Cyprus', 'Nicaragua', 'Romania', 'Kuwait',
       'Mauritius', 'Kazakhstan', 'Estonia', 'Philippines', 'Hungary',
       'Thailand', 'Argentina', 'Honduras', 'Latvia', 'Ecuador',
       'Portugal', 'Jamaica', 'South Korea', 'Japan', 'Peru', 'Serbia',
       'Bolivia', 'Pakistan', 'Paraguay', 'Dominican Republic',
       'Bosnia and Herzegovina', 'Moldova', '

In [None]:
df21.head()

Unnamed: 0,Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,Western Europe,7.842,0.032,7.904,7.78,10.775,0.954,72.0,0.949,-0.098,0.186,2.43,1.446,1.106,0.741,0.691,0.124,0.481,3.253
1,Denmark,Western Europe,7.62,0.035,7.687,7.552,10.933,0.954,72.7,0.946,0.03,0.179,2.43,1.502,1.108,0.763,0.686,0.208,0.485,2.868
2,Switzerland,Western Europe,7.571,0.036,7.643,7.5,11.117,0.942,74.4,0.919,0.025,0.292,2.43,1.566,1.079,0.816,0.653,0.204,0.413,2.839
3,Iceland,Western Europe,7.554,0.059,7.67,7.438,10.878,0.983,73.0,0.955,0.16,0.673,2.43,1.482,1.172,0.772,0.698,0.293,0.17,2.967
4,Netherlands,Western Europe,7.464,0.027,7.518,7.41,10.932,0.942,72.4,0.913,0.175,0.338,2.43,1.501,1.079,0.753,0.647,0.302,0.384,2.798


In [None]:
df21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                149 non-null    object 
 1   Regional indicator                          149 non-null    object 
 2   Ladder score                                149 non-null    float64
 3   Standard error of ladder score              149 non-null    float64
 4   upperwhisker                                149 non-null    float64
 5   lowerwhisker                                149 non-null    float64
 6   Logged GDP per capita                       149 non-null    float64
 7   Social support                              149 non-null    float64
 8   Healthy life expectancy                     149 non-null    float64
 9   Freedom to make life choices                149 non-null    float64
 10  Generosity    

In [None]:
df21['Country name'].unique()

array(['Finland', 'Denmark', 'Switzerland', 'Iceland', 'Netherlands',
       'Norway', 'Sweden', 'Luxembourg', 'New Zealand', 'Austria',
       'Australia', 'Israel', 'Germany', 'Canada', 'Ireland',
       'Costa Rica', 'United Kingdom', 'Czech Republic', 'United States',
       'Belgium', 'France', 'Bahrain', 'Malta',
       'Taiwan Province of China', 'United Arab Emirates', 'Saudi Arabia',
       'Spain', 'Italy', 'Slovenia', 'Guatemala', 'Uruguay', 'Singapore',
       'Kosovo', 'Slovakia', 'Brazil', 'Mexico', 'Jamaica', 'Lithuania',
       'Cyprus', 'Estonia', 'Panama', 'Uzbekistan', 'Chile', 'Poland',
       'Kazakhstan', 'Romania', 'Kuwait', 'Serbia', 'El Salvador',
       'Mauritius', 'Latvia', 'Colombia', 'Hungary', 'Thailand',
       'Nicaragua', 'Japan', 'Argentina', 'Portugal', 'Honduras',
       'Croatia', 'Philippines', 'South Korea', 'Peru',
       'Bosnia and Herzegovina', 'Moldova', 'Ecuador', 'Kyrgyzstan',
       'Greece', 'Bolivia', 'Mongolia', 'Paraguay', 'Montenegro

In [None]:
df22.head()

Unnamed: 0,RANK,Country,Happiness score,Whisker-high,Whisker-low,Dystopia (1.83) + residual,Explained by: GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption
0,1,Finland,7821,7886,7756,2518,1892,1258,775,736,109,534
1,2,Denmark,7636,7710,7563,2226,1953,1243,777,719,188,532
2,3,Iceland,7557,7651,7464,2320,1936,1320,803,718,270,191
3,4,Switzerland,7512,7586,7437,2153,2026,1226,822,677,147,461
4,5,Netherlands,7415,7471,7359,2137,1945,1206,787,651,271,419


In [None]:
df22.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   RANK                                        147 non-null    int64 
 1   Country                                     147 non-null    object
 2   Happiness score                             146 non-null    object
 3   Whisker-high                                146 non-null    object
 4   Whisker-low                                 146 non-null    object
 5   Dystopia (1.83) + residual                  146 non-null    object
 6   Explained by: GDP per capita                146 non-null    object
 7   Explained by: Social support                146 non-null    object
 8   Explained by: Healthy life expectancy       146 non-null    object
 9   Explained by: Freedom to make life choices  146 non-null    object
 10  Explained by: Generosity  

In [None]:
df22['Country'].unique()

array(['Finland', 'Denmark', 'Iceland', 'Switzerland', 'Netherlands',
       'Luxembourg*', 'Sweden', 'Norway', 'Israel', 'New Zealand',
       'Austria', 'Australia', 'Ireland', 'Germany', 'Canada',
       'United States', 'United Kingdom', 'Czechia', 'Belgium', 'France',
       'Bahrain', 'Slovenia', 'Costa Rica', 'United Arab Emirates',
       'Saudi Arabia', 'Taiwan Province of China', 'Singapore', 'Romania',
       'Spain', 'Uruguay', 'Italy', 'Kosovo', 'Malta', 'Lithuania',
       'Slovakia', 'Estonia', 'Panama', 'Brazil', 'Guatemala*',
       'Kazakhstan', 'Cyprus', 'Latvia', 'Serbia', 'Chile', 'Nicaragua',
       'Mexico', 'Croatia', 'Poland', 'El Salvador', 'Kuwait*', 'Hungary',
       'Mauritius', 'Uzbekistan', 'Japan', 'Honduras', 'Portugal',
       'Argentina', 'Greece', 'South Korea', 'Philippines', 'Thailand',
       'Moldova', 'Jamaica', 'Kyrgyzstan', 'Belarus*', 'Colombia',
       'Bosnia and Herzegovina', 'Mongolia', 'Dominican Republic',
       'Malaysia', 'Bolivia', 

Выведем на экран размеры таблиц.

item — это кортеж (ключ, значение)

In [None]:
print("Размеры таблиц:\n")
for item in dict_df.items():
  print(item[0], ":", item[1].shape)

Размеры таблиц:

2015 : (158, 12)
2016 : (157, 13)
2017 : (155, 12)
2018 : (156, 9)
2019 : (156, 9)
2020 : (153, 20)
2021 : (149, 20)
2022 : (147, 12)


Видно, что датасеты за разные года разных размеров - в них разное количество столбцов (признаков). Посмотрим на все колонки и отберём только те, которые встречаются везде

In [None]:
def desplay_all_columns(dict_df, n):
  data = {}
  for year, df in dict_df.items():
    value = [0] * n
          
    for i, x in enumerate(df.columns):
      value[i] = x
    data[year] = value
          
  display(pd.DataFrame(data))

In [None]:
desplay_all_columns(dict_df, 20)

Unnamed: 0,2015,2016,2017,2018,2019,2020,2021,2022
0,Country,Country,Country,Overall rank,Overall rank,Country name,Country name,RANK
1,Region,Region,Happiness.Rank,Country or region,Country or region,Regional indicator,Regional indicator,Country
2,Happiness Rank,Happiness Rank,Happiness.Score,Score,Score,Ladder score,Ladder score,Happiness score
3,Happiness Score,Happiness Score,Whisker.high,GDP per capita,GDP per capita,Standard error of ladder score,Standard error of ladder score,Whisker-high
4,Standard Error,Lower Confidence Interval,Whisker.low,Social support,Social support,upperwhisker,upperwhisker,Whisker-low
5,Economy (GDP per Capita),Upper Confidence Interval,Economy..GDP.per.Capita.,Healthy life expectancy,Healthy life expectancy,lowerwhisker,lowerwhisker,Dystopia (1.83) + residual
6,Family,Economy (GDP per Capita),Family,Freedom to make life choices,Freedom to make life choices,Logged GDP per capita,Logged GDP per capita,Explained by: GDP per capita
7,Health (Life Expectancy),Family,Health..Life.Expectancy.,Generosity,Generosity,Social support,Social support,Explained by: Social support
8,Freedom,Health (Life Expectancy),Freedom,Perceptions of corruption,Perceptions of corruption,Healthy life expectancy,Healthy life expectancy,Explained by: Healthy life expectancy
9,Trust (Government Corruption),Freedom,Generosity,0,0,Freedom to make life choices,Freedom to make life choices,Explained by: Freedom to make life choices


Оставим в датафреймах только те столбцы, которые встречаются в опросах каждого года:
*   Рейтинг (Score)
*   Страна (Country)
*   ВВП (GDP)
*   Социальная поддержка (Support)
*   Здоровье (Health)
*   Свобода (Freedom)
*   Щедрость (Generosity)
*   Уровень коррупции (Corruption)



## EDA

Для начала во всех датафреймах переименуем важные для нас столбцы. А затем удалим все оставшиеся

### Переименование столбцов

In [None]:
df15 = df15.rename(
    columns = {
        'Happiness Score' : 'score',
        'Country': 'country',
        'Economy (GDP per Capita)': 'GDP',
        'Family': 'support',
        'Health (Life Expectancy)': 'health',
        'Freedom': 'freedom',
        'Generosity': 'generosity',
        'Trust (Government Corruption)': 'corruption'
    }
)

In [None]:
df16 = df16.rename(
    columns = {
        'Happiness Score' : 'score',
        'Country': 'country',
        'Economy (GDP per Capita)': 'GDP',
        'Family': 'support',
        'Health (Life Expectancy)': 'health',
        'Freedom': 'freedom',
        'Generosity': 'generosity',
        'Trust (Government Corruption)': 'corruption'
    }
)

In [None]:
df17 = df17.rename(
    columns = {
        'Happiness.Score' : 'score',
        'Country': 'country',
        'Economy..GDP.per.Capita.': 'GDP',
        'Family': 'support',
        'Health..Life.Expectancy.': 'health',
        'Freedom': 'freedom',
        'Generosity': 'generosity',
        'Trust..Government.Corruption.': 'corruption'
    }
)

In [None]:
df18 = df18.rename(
    columns = {
        'Score' : 'score',
        'Country or region': 'country',
        'GDP per capita': 'GDP',
        'Social support': 'support',
        'Healthy life expectancy': 'health',
        'Freedom to make life choices': 'freedom',
        'Generosity': 'generosity',
        'Perceptions of corruption': 'corruption'
    }
)

In [None]:
df19 = df19.rename(
    columns = {
        'Score' : 'score',
        'Country or region': 'country',
        'GDP per capita': 'GDP',
        'Social support': 'support',
        'Healthy life expectancy': 'health',
        'Freedom to make life choices': 'freedom',
        'Generosity': 'generosity',
        'Perceptions of corruption': 'corruption'
    }
)

In [None]:
df20 = df20.rename(
    columns = {
        'Ladder score' : 'score',
        'Country name': 'country',
        'Explained by: Log GDP per capita': 'GDP',
        'Explained by: Social support': 'support',
        'Explained by: Healthy life expectancy': 'health',
        'Explained by: Freedom to make life choices': 'freedom',
        'Explained by: Generosity': 'generosity',
        'Explained by: Perceptions of corruption': 'corruption'
    }
)

In [None]:
df21 = df21.rename(
    columns = {
        'Ladder score' : 'score',
        'Country name': 'country',
        'Explained by: Log GDP per capita': 'GDP',
        'Explained by: Social support': 'support',
        'Explained by: Healthy life expectancy': 'health',
        'Explained by: Freedom to make life choices': 'freedom',
        'Explained by: Generosity': 'generosity',
        'Explained by: Perceptions of corruption': 'corruption'
    }
)

In [None]:
df22 = df22.rename(
    columns = {
        'Happiness score' : 'score',
        'Country': 'country',
        'Explained by: GDP per capita': 'GDP',
        'Explained by: Social support': 'support',
        'Explained by: Healthy life expectancy': 'health',
        'Explained by: Freedom to make life choices': 'freedom',
        'Explained by: Generosity': 'generosity',
        'Explained by: Perceptions of corruption': 'corruption'
    }
)

### Удаление лишних столбцов

Составим список нужных нам столбцов и реализуем функцию удаления лишних.

In [None]:
columns_needed = ['score', 'country', 'GDP', 'support', 'health', 'freedom', 'generosity', 'corruption']
def drop_columns(df):
  for column in list(df.columns):
    if column not in columns_needed:
      df.drop(column, inplace=True, axis=1)
  return df

In [None]:
df15 = drop_columns(df15)
df15

Unnamed: 0,country,score,GDP,support,health,freedom,corruption,generosity
0,Switzerland,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,Iceland,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630
2,Denmark,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,Norway,7.522,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699
4,Canada,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811
...,...,...,...,...,...,...,...,...
153,Rwanda,3.465,0.22208,0.77370,0.42864,0.59201,0.55191,0.22628
154,Benin,3.340,0.28665,0.35386,0.31910,0.48450,0.08010,0.18260
155,Syria,3.006,0.66320,0.47489,0.72193,0.15684,0.18906,0.47179
156,Burundi,2.905,0.01530,0.41587,0.22396,0.11850,0.10062,0.19727


In [None]:
df16 = drop_columns(df16)
df16

Unnamed: 0,country,score,GDP,support,health,freedom,corruption,generosity
0,Denmark,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171
1,Switzerland,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083
2,Iceland,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678
3,Norway,7.498,1.57744,1.12690,0.79579,0.59609,0.35776,0.37895
4,Finland,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492
...,...,...,...,...,...,...,...,...
152,Benin,3.484,0.39499,0.10419,0.21028,0.39747,0.06681,0.20180
153,Afghanistan,3.360,0.38227,0.11037,0.17344,0.16430,0.07112,0.31268
154,Togo,3.303,0.28123,0.00000,0.24811,0.34678,0.11587,0.17517
155,Syria,3.069,0.74719,0.14866,0.62994,0.06912,0.17233,0.48397


In [None]:
df17 = drop_columns(df17)
df17

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption
0,Norway,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964
1,Denmark,7.522,1.482383,1.551122,0.792566,0.626007,0.355280,0.400770
2,Iceland,7.504,1.480633,1.610574,0.833552,0.627163,0.475540,0.153527
3,Switzerland,7.494,1.564980,1.516912,0.858131,0.620071,0.290549,0.367007
4,Finland,7.469,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612
...,...,...,...,...,...,...,...,...
150,Rwanda,3.471,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220
151,Syria,3.462,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347
152,Tanzania,3.349,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035
153,Burundi,2.905,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148


In [None]:
df18 = drop_columns(df18)
df18

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption
0,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.340
2,Denmark,7.555,1.351,1.590,0.868,0.683,0.284,0.408
3,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,Switzerland,7.487,1.420,1.549,0.927,0.660,0.256,0.357
...,...,...,...,...,...,...,...,...
151,Yemen,3.355,0.442,1.073,0.343,0.244,0.083,0.064
152,Tanzania,3.303,0.455,0.991,0.381,0.481,0.270,0.097
153,South Sudan,3.254,0.337,0.608,0.177,0.112,0.224,0.106
154,Central African Republic,3.083,0.024,0.000,0.010,0.305,0.218,0.038


In [None]:
df19 = drop_columns(df19)
df19

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption
0,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393
1,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410
2,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118
4,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298
...,...,...,...,...,...,...,...,...
151,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411
152,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147
153,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025
154,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035


In [None]:
df20 = drop_columns(df20)
df20

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption
0,Finland,7.8087,1.285190,1.499526,0.961271,0.662317,0.159670,0.477857
1,Denmark,7.6456,1.326949,1.503449,0.979333,0.665040,0.242793,0.495260
2,Switzerland,7.5599,1.390774,1.472403,1.040533,0.628954,0.269056,0.407946
3,Iceland,7.5045,1.326502,1.547567,1.000843,0.661981,0.362330,0.144541
4,Norway,7.4880,1.424207,1.495173,1.008072,0.670201,0.287985,0.434101
...,...,...,...,...,...,...,...,...
148,Central African Republic,3.4759,0.041072,0.000000,0.000000,0.292814,0.253513,0.028265
149,Rwanda,3.3123,0.343243,0.522876,0.572383,0.604088,0.235705,0.485542
150,Zimbabwe,3.2992,0.425564,1.047835,0.375038,0.377405,0.151349,0.080929
151,South Sudan,2.8166,0.289083,0.553279,0.208809,0.065609,0.209935,0.111157


In [None]:
df21 = drop_columns(df21)
df21

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption
0,Finland,7.842,1.446,1.106,0.741,0.691,0.124,0.481
1,Denmark,7.620,1.502,1.108,0.763,0.686,0.208,0.485
2,Switzerland,7.571,1.566,1.079,0.816,0.653,0.204,0.413
3,Iceland,7.554,1.482,1.172,0.772,0.698,0.293,0.170
4,Netherlands,7.464,1.501,1.079,0.753,0.647,0.302,0.384
...,...,...,...,...,...,...,...,...
144,Lesotho,3.512,0.451,0.731,0.007,0.405,0.103,0.015
145,Botswana,3.467,1.099,0.724,0.340,0.539,0.027,0.088
146,Rwanda,3.415,0.364,0.202,0.407,0.627,0.227,0.493
147,Zimbabwe,3.145,0.457,0.649,0.243,0.359,0.157,0.075


In [None]:
df22 = drop_columns(df22)
df22

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption
0,Finland,7821,1892,1258,0775,0736,0109,0534
1,Denmark,7636,1953,1243,0777,0719,0188,0532
2,Iceland,7557,1936,1320,0803,0718,0270,0191
3,Switzerland,7512,2026,1226,0822,0677,0147,0461
4,Netherlands,7415,1945,1206,0787,0651,0271,0419
...,...,...,...,...,...,...,...,...
142,Rwanda*,3268,0785,0133,0462,0621,0187,0544
143,Zimbabwe,2995,0947,0690,0270,0329,0106,0105
144,Lebanon,2955,1392,0498,0631,0103,0082,0034
145,Afghanistan,2404,0758,0000,0289,0000,0089,0005


Изменим наш словарь

In [None]:
dict_df = {
    '2015': df15,
    '2016': df16,
    '2017': df17,
    '2018': df18,
    '2019': df19,
    '2020': df20,
    '2021': df21,
    '2022': df22
}

Вновь выведем на экран все колонки и убедимся, что лишнего не осталось, а всё необходимое есть

In [None]:
desplay_all_columns(dict_df, 8)

Unnamed: 0,2015,2016,2017,2018,2019,2020,2021,2022
0,country,country,country,country,country,country,country,country
1,score,score,score,score,score,score,score,score
2,GDP,GDP,GDP,GDP,GDP,GDP,GDP,GDP
3,support,support,support,support,support,support,support,support
4,health,health,health,health,health,health,health,health
5,freedom,freedom,freedom,freedom,freedom,freedom,freedom,freedom
6,corruption,corruption,generosity,generosity,generosity,generosity,generosity,generosity
7,generosity,generosity,corruption,corruption,corruption,corruption,corruption,corruption


### Удаление пропусков

Посмотрим, сколько в данных пропущенных значений

In [None]:
for year, df in dict_df.items():
  print(year)
  print(df.isna().sum())
  print()

2015
country       0
score         0
GDP           0
support       0
health        0
freedom       0
corruption    0
generosity    0
dtype: int64

2016
country       0
score         0
GDP           0
support       0
health        0
freedom       0
corruption    0
generosity    0
dtype: int64

2017
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
corruption    0
dtype: int64

2018
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
corruption    1
dtype: int64

2019
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
corruption    0
dtype: int64

2020
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
corruption    0
dtype: int64

2021
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
c

Совсем мало. Поэтому пропуски можно удалить

In [None]:
for year, df in dict_df.items():
  df = df.dropna()
  print(year)
  print(df.isna().sum())
  print()

2015
country       0
score         0
GDP           0
support       0
health        0
freedom       0
corruption    0
generosity    0
dtype: int64

2016
country       0
score         0
GDP           0
support       0
health        0
freedom       0
corruption    0
generosity    0
dtype: int64

2017
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
corruption    0
dtype: int64

2018
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
corruption    0
dtype: int64

2019
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
corruption    0
dtype: int64

2020
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
corruption    0
dtype: int64

2021
country       0
score         0
GDP           0
support       0
health        0
freedom       0
generosity    0
c

### Проверка на дубликаты

Проверим, встречаются ли в данных дубликаты

In [None]:
for year, df in dict_df.items():
  print(year)
  print(df[df.duplicated()])
  print()


2015
Empty DataFrame
Columns: [country, score, GDP, support, health, freedom, corruption, generosity]
Index: []

2016
Empty DataFrame
Columns: [country, score, GDP, support, health, freedom, corruption, generosity]
Index: []

2017
Empty DataFrame
Columns: [country, score, GDP, support, health, freedom, generosity, corruption]
Index: []

2018
Empty DataFrame
Columns: [country, score, GDP, support, health, freedom, generosity, corruption]
Index: []

2019
Empty DataFrame
Columns: [country, score, GDP, support, health, freedom, generosity, corruption]
Index: []

2020
Empty DataFrame
Columns: [country, score, GDP, support, health, freedom, generosity, corruption]
Index: []

2021
Empty DataFrame
Columns: [country, score, GDP, support, health, freedom, generosity, corruption]
Index: []

2022
Empty DataFrame
Columns: [country, score, GDP, support, health, freedom, generosity, corruption]
Index: []



Дубликатов нет

### Проверка на выбросы

Посмотрим, как распределны значения, встречаются ли аномалии. Для этого построим бокс-плоты

In [None]:
fig = px.box(df15, y=["GDP", "support", "health", "freedom", "generosity", "corruption"])
fig.show()

Встречаются выбивающиеся значения, но в целом данные выглядят правдоподобно. Не будем избавляться от них

### Подготовка к визуализации данных

#### Добавление континента

Для начала добавим в датасет информацию о континенте. 

Данные взяты с [Гитхаба](https://gist.github.com/stevewithington/20a69c0b6d2ff846ea5d35e5fc47f26c#file-country-and-continent-codes-list-csv-csv)

In [None]:
cont = pd.read_csv("/content/codes-list-csv.csv")

In [None]:
cont

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0
4,Oceania,OC,American Samoa,AS,ASM,16.0
...,...,...,...,...,...,...
257,Africa,AF,"Zambia, Republic of",ZM,ZMB,894.0
258,Oceania,OC,Disputed Territory,XX,,
259,Asia,AS,Iraq-Saudi Arabia Neutral Zone,XE,,
260,Asia,AS,United Nations Neutral Zone,XD,,


Посмотрим, какие страны встречаются в таблице

In [None]:
cont['Country_Name'].unique()

array(['Afghanistan, Islamic Republic of', 'Albania, Republic of',
       'Antarctica (the territory South of 60 deg S)',
       "Algeria, People's Democratic Republic of", 'American Samoa',
       'Andorra, Principality of', 'Angola, Republic of',
       'Antigua and Barbuda', 'Azerbaijan, Republic of',
       'Argentina, Argentine Republic', 'Australia, Commonwealth of',
       'Austria, Republic of', 'Bahamas, Commonwealth of the',
       'Bahrain, Kingdom of', "Bangladesh, People's Republic of",
       'Armenia, Republic of', 'Barbados', 'Belgium, Kingdom of',
       'Bermuda', 'Bhutan, Kingdom of', 'Bolivia, Republic of',
       'Bosnia and Herzegovina', 'Botswana, Republic of',
       'Bouvet Island (Bouvetoya)', 'Brazil, Federative Republic of',
       'Belize', 'British Indian Ocean Territory (Chagos Archipelago)',
       'Solomon Islands', 'British Virgin Islands', 'Brunei Darussalam',
       'Bulgaria, Republic of', 'Myanmar, Union of',
       'Burundi, Republic of', 'Belarus

Посмотрим, какие страны встречаются у нас - например, в данных за 2015 год.

In [None]:
list(df15['country'].unique())

['Switzerland',
 'Iceland',
 'Denmark',
 'Norway',
 'Canada',
 'Finland',
 'Netherlands',
 'Sweden',
 'New Zealand',
 'Australia',
 'Israel',
 'Costa Rica',
 'Austria',
 'Mexico',
 'United States',
 'Brazil',
 'Luxembourg',
 'Ireland',
 'Belgium',
 'United Arab Emirates',
 'United Kingdom',
 'Oman',
 'Venezuela',
 'Singapore',
 'Panama',
 'Germany',
 'Chile',
 'Qatar',
 'France',
 'Argentina',
 'Czech Republic',
 'Uruguay',
 'Colombia',
 'Thailand',
 'Saudi Arabia',
 'Spain',
 'Malta',
 'Taiwan',
 'Kuwait',
 'Suriname',
 'Trinidad and Tobago',
 'El Salvador',
 'Guatemala',
 'Uzbekistan',
 'Slovakia',
 'Japan',
 'South Korea',
 'Ecuador',
 'Bahrain',
 'Italy',
 'Bolivia',
 'Moldova',
 'Paraguay',
 'Kazakhstan',
 'Slovenia',
 'Lithuania',
 'Nicaragua',
 'Peru',
 'Belarus',
 'Poland',
 'Malaysia',
 'Croatia',
 'Libya',
 'Russia',
 'Jamaica',
 'North Cyprus',
 'Cyprus',
 'Algeria',
 'Kosovo',
 'Turkmenistan',
 'Mauritius',
 'Hong Kong',
 'Estonia',
 'Indonesia',
 'Vietnam',
 'Turkey',
 'Ky

Реализуем функцию добавления континента

In [None]:
def add_continent(df):
  df['continent'] = [0] * len(df['country'])
  for index, row in df.iterrows():
    for index2, row2 in cont.iterrows():
      if row['country'] in row2['Country_Name']:
        df['continent'][index] = cont['Continent_Name'][index2]
  return df

##### 2015 год

In [None]:
df15 = add_continent(df15)
df15

Unnamed: 0,country,score,GDP,support,health,freedom,corruption,generosity,continent
0,Switzerland,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,Europe
1,Iceland,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,Europe
2,Denmark,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,Europe
3,Norway,7.522,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,Europe
4,Canada,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,North America
...,...,...,...,...,...,...,...,...,...
153,Rwanda,3.465,0.22208,0.77370,0.42864,0.59201,0.55191,0.22628,Africa
154,Benin,3.340,0.28665,0.35386,0.31910,0.48450,0.08010,0.18260,Africa
155,Syria,3.006,0.66320,0.47489,0.72193,0.15684,0.18906,0.47179,Asia
156,Burundi,2.905,0.01530,0.41587,0.22396,0.11850,0.10062,0.19727,Africa


Посмотрим, какие значения стран остались.

In [None]:
df15['country'][df15['continent'] == 0]

46                 South Korea
65                North Cyprus
68                      Kosovo
76                  Kyrgyzstan
90           Somaliland region
98                        Laos
107    Palestinian Territories
119           Congo (Kinshasa)
138        Congo (Brazzaville)
150                Ivory Coast
Name: country, dtype: object

Незаполненных данных осталолсь не так много. Заполним их вручную. Но для начала посмотрим, какие континеты вообще используются в таблице

In [None]:
df15['continent'].unique()

array(['Europe', 'North America', 'Oceania', 'Asia', 'South America', 0,
       'Africa'], dtype=object)

In [None]:
cont['Continent_Name'].unique()

array(['Asia', 'Europe', 'Antarctica', 'Africa', 'Oceania',
       'North America', 'South America'], dtype=object)

Остальные значения заполним самостоятельно, исходя из общедоступной информации.
Скорее всего, пропуски в этих же местах будут и в остальных датафреймах, поэтому реализуем функцию для их заполнения

In [None]:
def fill_continent(df):
  df['continent'][df['country'] == 'South Korea'] = 'Asia'
  df['continent'][df['country'] == 'North Cyprus'] = 'Asia'
  df['continent'][df['country'] == 'Kosovo'] = 'Europe'
  df['continent'][df['country'] == 'Kyrgyzstan'] = 'Asia'
  df['continent'][df['country'] == 'Somaliland region'] = 'Africa'
  df['continent'][df['country'] == 'Laos'] = 'Asia'
  df['continent'][df['country'] == 'Palestinian Territories'] = 'Asia'
  df['continent'][df['country'] == 'Congo (Kinshasa)'] = 'Africa'
  df['continent'][df['country'] == 'Congo (Brazzaville)'] = 'Africa'
  df['continent'][df['country'] == 'Ivory Coast'] = 'Africa'
  return df

In [None]:
df15 = fill_continent(df15)

Проверим, что все данные заполнены и пропусков не осталось

In [None]:
df15['country'][df15['continent'] == 0]

Series([], Name: country, dtype: object)

##### 2016 год

In [None]:
df16 = add_continent(df16)
df16

Unnamed: 0,country,score,GDP,support,health,freedom,corruption,generosity,continent
0,Denmark,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,Europe
1,Switzerland,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,Europe
2,Iceland,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,Europe
3,Norway,7.498,1.57744,1.12690,0.79579,0.59609,0.35776,0.37895,Europe
4,Finland,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,Europe
...,...,...,...,...,...,...,...,...,...
152,Benin,3.484,0.39499,0.10419,0.21028,0.39747,0.06681,0.20180,Africa
153,Afghanistan,3.360,0.38227,0.11037,0.17344,0.16430,0.07112,0.31268,Asia
154,Togo,3.303,0.28123,0.00000,0.24811,0.34678,0.11587,0.17517,Africa
155,Syria,3.069,0.74719,0.14866,0.62994,0.06912,0.17233,0.48397,Asia


Посмотрим на пропуски

In [None]:
df16['country'][df16['continent'] == 0]

57                 South Korea
61                North Cyprus
76                      Kosovo
84                  Kyrgyzstan
96           Somaliland Region
101                       Laos
107    Palestinian Territories
124           Congo (Kinshasa)
126        Congo (Brazzaville)
138                Ivory Coast
Name: country, dtype: object

Для начала применим функцию, которую мы реализовали ранее для заполнения остатков

In [None]:
df16 = fill_continent(df16)

Посмотрим, что осталось дозаполнить

In [None]:
df16['country'][df16['continent'] == 0]

96    Somaliland Region
Name: country, dtype: object

Заполним вручную

In [None]:
df16['continent'][df16['country'] == 'Somaliland Region'] = 'Africa'

Проверим, что пропусков не осталось

In [None]:
df16['country'][df16['continent'] == 0]

Series([], Name: country, dtype: object)

##### 2017 год

In [None]:
df17 = add_continent(df17)
df17

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption,continent
0,Norway,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,Europe
1,Denmark,7.522,1.482383,1.551122,0.792566,0.626007,0.355280,0.400770,Europe
2,Iceland,7.504,1.480633,1.610574,0.833552,0.627163,0.475540,0.153527,Europe
3,Switzerland,7.494,1.564980,1.516912,0.858131,0.620071,0.290549,0.367007,Europe
4,Finland,7.469,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,Europe
...,...,...,...,...,...,...,...,...,...
150,Rwanda,3.471,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220,Africa
151,Syria,3.462,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,Asia
152,Tanzania,3.349,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035,Africa
153,Burundi,2.905,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148,Africa


Посмотрим, какие пропуски остались

In [None]:
df17['country'][df17['continent'] == 0]

32     Taiwan Province of China
54                  South Korea
60                 North Cyprus
70      Hong Kong S.A.R., China
77                       Kosovo
97                   Kyrgyzstan
102     Palestinian Territories
123         Congo (Brazzaville)
125            Congo (Kinshasa)
127                 Ivory Coast
Name: country, dtype: object

Попробуем для начала применить функцию для заполнения остатков

In [None]:
df17 = fill_continent(df17)

Проверим, что осталось

In [None]:
df17['country'][df17['continent'] == 0]

32    Taiwan Province of China
70     Hong Kong S.A.R., China
Name: country, dtype: object

Дозаполним данные вручную

In [None]:
df17['continent'][df17['country'] == 'Taiwan Province of China'] = 'Asia'

In [None]:
df17['continent'][df17['country'] == 'Hong Kong S.A.R., China'] = 'Asia'

Проверим, остались ли пропуски

In [None]:
df17['country'][df17['continent'] == 0]

Series([], Name: country, dtype: object)

##### 2018 год

In [None]:
df18 = add_continent(df18)
df18

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption,continent
0,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393,Europe
1,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.340,Europe
2,Denmark,7.555,1.351,1.590,0.868,0.683,0.284,0.408,Europe
3,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138,Europe
4,Switzerland,7.487,1.420,1.549,0.927,0.660,0.256,0.357,Europe
...,...,...,...,...,...,...,...,...,...
151,Yemen,3.355,0.442,1.073,0.343,0.244,0.083,0.064,Asia
152,Tanzania,3.303,0.455,0.991,0.381,0.481,0.270,0.097,Africa
153,South Sudan,3.254,0.337,0.608,0.177,0.112,0.224,0.106,Africa
154,Central African Republic,3.083,0.024,0.000,0.010,0.305,0.218,0.038,Africa


Посмотрим, что ещё надо дозаполнить

In [None]:
df18['country'][df18['continent'] == 0]

37           Trinidad & Tobago
56                 South Korea
57             Northern Cyprus
65                      Kosovo
91                  Kyrgyzstan
103    Palestinian Territories
106                Ivory Coast
109                       Laos
113        Congo (Brazzaville)
131           Congo (Kinshasa)
Name: country, dtype: object

Применим нашу функцию заполнения остатков

In [None]:
df18 = fill_continent(df18)

Проверим, что ещё осталось дозаполнить вручную

In [None]:
df18['country'][df18['continent'] == 0]

37    Trinidad & Tobago
57      Northern Cyprus
Name: country, dtype: object

Дозаполним

In [None]:
df18['continent'][df18['country'] == 'Trinidad & Tobago'] = 'North America'

In [None]:
df18['continent'][df18['country'] == 'Northern Cyprus'] = 'Asia'

Убедимся, что пропусков не осталось

In [None]:
df18['country'][df18['continent'] == 0]

Series([], Name: country, dtype: object)

##### 2019 год

In [None]:
df19 = add_continent(df19)
df19

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption,continent
0,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393,Europe
1,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410,Europe
2,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,Europe
3,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118,Europe
4,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,North America
...,...,...,...,...,...,...,...,...,...
151,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411,Africa
152,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147,Africa
153,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025,Asia
154,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035,Africa


Посмотрим, какие строчки остались без континента

In [None]:
df19['country'][df19['continent'] == 0]

38           Trinidad & Tobago
45                      Kosovo
53                 South Korea
63             Northern Cyprus
83             North Macedonia
85                  Kyrgyzstan
98                 Ivory Coast
102        Congo (Brazzaville)
104                       Laos
109    Palestinian Territories
126           Congo (Kinshasa)
Name: country, dtype: object

Применим нашу функцию для дозаполнения остатков

In [None]:
df19 = fill_continent(df19)

Посмотрим, что осталось ещё дозаполнить вручную

In [None]:
df19['country'][df19['continent'] == 0]

38    Trinidad & Tobago
63      Northern Cyprus
83      North Macedonia
Name: country, dtype: object

Дозаполним

In [None]:
df19['continent'][df19['country'] == 'Trinidad & Tobago'] = 'North America'

In [None]:
df19['continent'][df19['country'] == 'Northern Cyprus'] = 'Asia'

In [None]:
df19['continent'][df19['country'] == 'North Macedonia'] = 'Europe'

Убедимся, что пропусков не осталось

In [None]:
df19['country'][df19['continent'] == 0]

Series([], Name: country, dtype: object)

##### 2020 год

In [None]:
df20 = add_continent(df20)
df20

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption,continent
0,Finland,7.8087,1.285190,1.499526,0.961271,0.662317,0.159670,0.477857,Europe
1,Denmark,7.6456,1.326949,1.503449,0.979333,0.665040,0.242793,0.495260,Europe
2,Switzerland,7.5599,1.390774,1.472403,1.040533,0.628954,0.269056,0.407946,Europe
3,Iceland,7.5045,1.326502,1.547567,1.000843,0.661981,0.362330,0.144541,Europe
4,Norway,7.4880,1.424207,1.495173,1.008072,0.670201,0.287985,0.434101,Europe
...,...,...,...,...,...,...,...,...,...
148,Central African Republic,3.4759,0.041072,0.000000,0.000000,0.292814,0.253513,0.028265,Africa
149,Rwanda,3.3123,0.343243,0.522876,0.572383,0.604088,0.235705,0.485542,Africa
150,Zimbabwe,3.2992,0.425564,1.047835,0.375038,0.377405,0.151349,0.080929,Africa
151,South Sudan,2.8166,0.289083,0.553279,0.208809,0.065609,0.209935,0.111157,Africa


Посмотрим, где остались недозаполненные значения

In [None]:
df20['country'][df20['continent'] == 0]

24      Taiwan Province of China
34                        Kosovo
60                   South Korea
73                    Kyrgyzstan
75                  North Cyprus
77     Hong Kong S.A.R. of China
84                   Ivory Coast
87           Congo (Brazzaville)
103                         Laos
124      Palestinian Territories
130             Congo (Kinshasa)
Name: country, dtype: object

Применим нашу функцию заполнения остатков

In [None]:
df20 = fill_continent(df20)

Посмотрим, что осталось дозаполнить вручную

In [None]:
df20['country'][df20['continent'] == 0]

24     Taiwan Province of China
77    Hong Kong S.A.R. of China
Name: country, dtype: object

Заполним, исходя из общедоступной информации

In [None]:
df20['continent'][df20['country'] == 'Taiwan Province of China'] = 'Asia'

In [None]:
df20['continent'][df20['country'] == 'Hong Kong S.A.R. of China'] = 'Asia'

Убедимся, что пропусков не осталось

In [None]:
df20['country'][df20['continent'] == 0]

Series([], Name: country, dtype: object)

##### 2021 год

In [None]:
df21 = add_continent(df21)
df21

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption,continent
0,Finland,7.842,1.446,1.106,0.741,0.691,0.124,0.481,Europe
1,Denmark,7.620,1.502,1.108,0.763,0.686,0.208,0.485,Europe
2,Switzerland,7.571,1.566,1.079,0.816,0.653,0.204,0.413,Europe
3,Iceland,7.554,1.482,1.172,0.772,0.698,0.293,0.170,Europe
4,Netherlands,7.464,1.501,1.079,0.753,0.647,0.302,0.384,North America
...,...,...,...,...,...,...,...,...,...
144,Lesotho,3.512,0.451,0.731,0.007,0.405,0.103,0.015,Africa
145,Botswana,3.467,1.099,0.724,0.340,0.539,0.027,0.088,Africa
146,Rwanda,3.415,0.364,0.202,0.407,0.627,0.227,0.493,Africa
147,Zimbabwe,3.145,0.457,0.649,0.243,0.359,0.157,0.075,Africa


Посмотрим, в каких строках остались незаполненные данные

In [None]:
df21['country'][df21['continent'] == 0]

23      Taiwan Province of China
32                        Kosovo
61                   South Korea
66                    Kyrgyzstan
73                  North Cyprus
76     Hong Kong S.A.R. of China
82           Congo (Brazzaville)
84                   Ivory Coast
93               North Macedonia
99                          Laos
124      Palestinian Territories
Name: country, dtype: object

Применим нашу функцию, которая заполняет остатки

In [None]:
df21 = fill_continent(df21)

Проверим, какие данные остались недозаполненными

In [None]:
df21['country'][df21['continent'] == 0]

23     Taiwan Province of China
76    Hong Kong S.A.R. of China
93              North Macedonia
Name: country, dtype: object

Заполним вручную

In [None]:
df21['continent'][df21['country'] == 'Taiwan Province of China'] = 'Asia'

In [None]:
df21['continent'][df21['country'] == 'Hong Kong S.A.R. of China'] = 'Asia'

In [None]:
df21['continent'][df21['country'] == 'North Macedonia'] = 'Europe'

Убедимся, что теперь у всех стран есть континенты

In [None]:
df21['country'][df21['continent'] == 0]

Series([], Name: country, dtype: object)

##### 2022 год

In [None]:
df22 = add_continent(df22)
df22

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption,continent
0,Finland,7821,1892,1258,0775,0736,0109,0534,Europe
1,Denmark,7636,1953,1243,0777,0719,0188,0532,Europe
2,Iceland,7557,1936,1320,0803,0718,0270,0191,Europe
3,Switzerland,7512,2026,1226,0822,0677,0147,0461,Europe
4,Netherlands,7415,1945,1206,0787,0651,0271,0419,North America
...,...,...,...,...,...,...,...,...,...
142,Rwanda*,3268,0785,0133,0462,0621,0187,0544,0
143,Zimbabwe,2995,0947,0690,0270,0329,0106,0105,Africa
144,Lebanon,2955,1392,0498,0631,0103,0082,0034,Asia
145,Afghanistan,2404,0758,0000,0289,0000,0089,0005,Asia


Посмотрим на пропуски

In [None]:
df22['country'][df22['continent'] == 0]

5                    Luxembourg*
17                       Czechia
25      Taiwan Province of China
31                        Kosovo
38                    Guatemala*
49                       Kuwait*
58                   South Korea
63                    Kyrgyzstan
64                      Belarus*
77                 Turkmenistan*
78                 North Cyprus*
80     Hong Kong S.A.R. of China
85                        Libya*
87                   Ivory Coast
88               North Macedonia
91                   Azerbaijan*
92                       Gambia*
94                          Laos
96                      Liberia*
103                       Niger*
115                     Comoros*
121     Palestinian Territories*
124        Eswatini, Kingdom of*
127                  Madagascar*
129                        Chad*
131                       Yemen*
132                  Mauritania*
140                     Lesotho*
141                    Botswana*
142                      Rwanda*
146       

Применим нашу функцию заполнения остатков

In [None]:
df22 = fill_continent(df22)

Ещё раз посмотрим на пропуски

In [None]:
df22['country'][df22['continent'] == 0]

5                    Luxembourg*
17                       Czechia
25      Taiwan Province of China
38                    Guatemala*
49                       Kuwait*
64                      Belarus*
77                 Turkmenistan*
78                 North Cyprus*
80     Hong Kong S.A.R. of China
85                        Libya*
88               North Macedonia
91                   Azerbaijan*
92                       Gambia*
96                      Liberia*
103                       Niger*
115                     Comoros*
121     Palestinian Territories*
124        Eswatini, Kingdom of*
127                  Madagascar*
129                        Chad*
131                       Yemen*
132                  Mauritania*
140                     Lesotho*
141                    Botswana*
142                      Rwanda*
146                           xx
Name: country, dtype: object

Переименуем страны без использования *, чтобы функции могли работать

In [None]:
df22['country'][df22['country'] == 'Luxembourg*'] = 'Luxembourg'
df22['country'][df22['country'] == 'Guatemala*'] = 'Guatemala'
df22['country'][df22['country'] == 'Kuwait*'] = 'Kuwait'
df22['country'][df22['country'] == 'Belarus*'] = 'Belarus'
df22['country'][df22['country'] == 'Turkmenistan*'] = 'Turkmenistan'
df22['country'][df22['country'] == 'North Cyprus*'] = 'North Cyprus'
df22['country'][df22['country'] == 'Libya*'] = 'Libya'
df22['country'][df22['country'] == 'Azerbaijan*'] = 'Azerbaijan'
df22['country'][df22['country'] == 'Gambia*'] = 'Gambia'
df22['country'][df22['country'] == 'Niger*'] = 'Niger'
df22['country'][df22['country'] == 'Liberia*'] = 'Liberia'
df22['country'][df22['country'] == 'Comoros*'] = 'Comoros'
df22['country'][df22['country'] == 'Palestinian Territories*'] = 'Palestinian Territories'
df22['country'][df22['country'] == 'Eswatini, Kingdom of*'] = 'Eswatini, Kingdom of'
df22['country'][df22['country'] == 'Madagascar*'] = 'Madagascar'
df22['country'][df22['country'] == 'Chad*'] = 'Chad'
df22['country'][df22['country'] == 'Yemen*'] = 'Yemen'
df22['country'][df22['country'] == 'Mauritania*'] = 'Mauritania'
df22['country'][df22['country'] == 'Lesotho*'] = 'Lesotho'
df22['country'][df22['country'] == 'Botswana*'] = 'Botswana'
df22['country'][df22['country'] == 'Rwanda*'] = 'Rwanda'

Ещё раз применим функции

In [None]:
df22 = add_continent(df22)

In [None]:
df22['country'][df22['continent'] == 0]

17                       Czechia
25      Taiwan Province of China
31                        Kosovo
58                   South Korea
63                    Kyrgyzstan
78                  North Cyprus
80     Hong Kong S.A.R. of China
87                   Ivory Coast
88               North Macedonia
94                          Laos
121      Palestinian Territories
124         Eswatini, Kingdom of
146                           xx
Name: country, dtype: object

In [None]:
df22 = fill_continent(df22)

Посмотрим, что осталось дозаполнить

In [None]:
df22['country'][df22['continent'] == 0]

17                       Czechia
25      Taiwan Province of China
80     Hong Kong S.A.R. of China
88               North Macedonia
124         Eswatini, Kingdom of
146                           xx
Name: country, dtype: object

Дозаполним вручную, а последнюю строчку выбросим из датасета

In [None]:
df22['continent'][df22['country'] == 'Czechia'] = 'Europe'

In [None]:
df22['continent'][df22['country'] == 'Taiwan Province of China'] = 'Asia'

In [None]:
df22['continent'][df22['country'] == 'Hong Kong S.A.R. of China'] = 'Asia'

In [None]:
df22['continent'][df22['country'] == 'North Macedonia'] = 'Europe'

In [None]:
df22['continent'][df22['country'] == 'Eswatini, Kingdom of'] = 'Africa'

In [None]:
df22 = df22.loc[df22['country'] != 'xx']

Убедимсся, что у всех стран теперь есть континенты

In [None]:
df22['country'][df22['continent'] == 0]

Series([], Name: country, dtype: object)

#### Изменение типов данных

Ещё раз взглянем на получившийся датафрейм

In [None]:
df22

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption,continent
0,Finland,7821,1892,1258,0775,0736,0109,0534,Europe
1,Denmark,7636,1953,1243,0777,0719,0188,0532,Europe
2,Iceland,7557,1936,1320,0803,0718,0270,0191,Europe
3,Switzerland,7512,2026,1226,0822,0677,0147,0461,Europe
4,Netherlands,7415,1945,1206,0787,0651,0271,0419,North America
...,...,...,...,...,...,...,...,...,...
141,Botswana,3471,1503,0815,0280,0571,0012,0102,Africa
142,Rwanda,3268,0785,0133,0462,0621,0187,0544,Africa
143,Zimbabwe,2995,0947,0690,0270,0329,0106,0105,Africa
144,Lebanon,2955,1392,0498,0631,0103,0082,0034,Asia


Заметим, что в качестве знака разделителя во float используется запятая, а не точка. Из-за этого при составлении общего датасета возникнут проблемы с типами данных. Лучше поменять разделитель и перевести всё во float.

In [None]:
df22['score'] = [x.replace(',', '.') for x in df22['score']]
df22['score'] = df22['score'].astype(float)
df22['GDP'] = [x.replace(',', '.') for x in df22['GDP']]
df22['GDP'] = df22['GDP'].astype(float)
df22['support'] = [x.replace(',', '.') for x in df22['support']]
df22['support'] = df22['support'].astype(float)
df22['health'] = [x.replace(',', '.') for x in df22['health']]
df22['health'] = df22['health'].astype(float)
df22['freedom'] = [x.replace(',', '.') for x in df22['freedom']]
df22['freedom'] = df22['freedom'].astype(float)
df22['generosity'] = [x.replace(',', '.') for x in df22['generosity']]
df22['generosity'] = df22['generosity'].astype(float)
df22['corruption'] = [x.replace(',', '.') for x in df22['corruption']]
df22['corruption'] = df22['corruption'].astype(float)

Проверим, что теперь используются не запятые, а точки

In [None]:
df22

Unnamed: 0,country,score,GDP,support,health,freedom,generosity,corruption,continent
0,Finland,7.821,1.892,1.258,0.775,0.736,0.109,0.534,Europe
1,Denmark,7.636,1.953,1.243,0.777,0.719,0.188,0.532,Europe
2,Iceland,7.557,1.936,1.320,0.803,0.718,0.270,0.191,Europe
3,Switzerland,7.512,2.026,1.226,0.822,0.677,0.147,0.461,Europe
4,Netherlands,7.415,1.945,1.206,0.787,0.651,0.271,0.419,North America
...,...,...,...,...,...,...,...,...,...
141,Botswana,3.471,1.503,0.815,0.280,0.571,0.012,0.102,Africa
142,Rwanda,3.268,0.785,0.133,0.462,0.621,0.187,0.544,Africa
143,Zimbabwe,2.995,0.947,0.690,0.270,0.329,0.106,0.105,Africa
144,Lebanon,2.955,1.392,0.498,0.631,0.103,0.082,0.034,Asia


#### Добавление года

Добавим в наши таблицы столбец с годом

In [None]:
df15['year'] = [2015] * len(df15['country'])

In [None]:
df16['year'] = [2016] * len(df16['country'])

In [None]:
df17['year'] = [2017] * len(df17['country'])

In [None]:
df18['year'] = [2018] * len(df18['country'])

In [None]:
df19['year'] = [2019] * len(df19['country'])

In [None]:
df20['year'] = [2020] * len(df20['country'])

In [None]:
df21['year'] = [2021] * len(df21['country'])

In [None]:
df22['year'] = [2022] * len(df22['country'])

#### Объединение данных в один датафрейм

И наконец создадим общий датафрейм из всех таблиц

In [None]:
full_df = pd.concat([df15, df16, df17, df18, df19, df20, df21, df22], axis=0)

In [None]:
full_df

Unnamed: 0,country,score,GDP,support,health,freedom,corruption,generosity,continent,year
0,Switzerland,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,Europe,2015
1,Iceland,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,Europe,2015
2,Denmark,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,Europe,2015
3,Norway,7.522,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,Europe,2015
4,Canada,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,North America,2015
...,...,...,...,...,...,...,...,...,...,...
141,Botswana,3.471,1.50300,0.81500,0.28000,0.57100,0.10200,0.01200,Africa,2022
142,Rwanda,3.268,0.78500,0.13300,0.46200,0.62100,0.54400,0.18700,Africa,2022
143,Zimbabwe,2.995,0.94700,0.69000,0.27000,0.32900,0.10500,0.10600,Africa,2022
144,Lebanon,2.955,1.39200,0.49800,0.63100,0.10300,0.03400,0.08200,Asia,2022


#### Подготовка общего датафрейма

Изучим получившийся датафрейм

In [None]:
full_df

Unnamed: 0,country,score,GDP,support,health,freedom,corruption,generosity,continent,year
0,Switzerland,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,Europe,2015
1,Iceland,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,Europe,2015
2,Denmark,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,Europe,2015
3,Norway,7.522,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,Europe,2015
4,Canada,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,North America,2015
...,...,...,...,...,...,...,...,...,...,...
141,Botswana,3.471,1.50300,0.81500,0.28000,0.57100,0.10200,0.01200,Africa,2022
142,Rwanda,3.268,0.78500,0.13300,0.46200,0.62100,0.54400,0.18700,Africa,2022
143,Zimbabwe,2.995,0.94700,0.69000,0.27000,0.32900,0.10500,0.10600,Africa,2022
144,Lebanon,2.955,1.39200,0.49800,0.63100,0.10300,0.03400,0.08200,Asia,2022


Посмотрим общую информацию о нём

In [None]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1230 entries, 0 to 145
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     1230 non-null   object 
 1   score       1230 non-null   float64
 2   GDP         1230 non-null   float64
 3   support     1230 non-null   float64
 4   health      1230 non-null   float64
 5   freedom     1230 non-null   float64
 6   corruption  1229 non-null   float64
 7   generosity  1230 non-null   float64
 8   continent   1230 non-null   object 
 9   year        1230 non-null   int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 105.7+ KB


Мы видим пропуск в столбце corruption. Избавимся от него методом dropna

In [None]:
full_df = full_df.dropna()

Посмотрим обновлённую информацию о таблице

In [None]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1229 entries, 0 to 145
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     1229 non-null   object 
 1   score       1229 non-null   float64
 2   GDP         1229 non-null   float64
 3   support     1229 non-null   float64
 4   health      1229 non-null   float64
 5   freedom     1229 non-null   float64
 6   corruption  1229 non-null   float64
 7   generosity  1229 non-null   float64
 8   continent   1229 non-null   object 
 9   year        1229 non-null   int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 105.6+ KB


Посмотрим основные характеристики для числовых величин

In [None]:
full_df.describe()

Unnamed: 0,score,GDP,support,health,freedom,corruption,generosity,year
count,1229.0,1229.0,1229.0,1229.0,1229.0,1229.0,1229.0,1229.0
mean,5.428998,0.975343,1.033193,0.608082,0.44096,0.130756,0.201595,2018.447518
std,1.115155,0.434249,0.329578,0.24134,0.154412,0.110818,0.115568,2.28361
min,2.404,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
25%,4.584,0.668,0.828,0.439,0.345,0.056,0.118,2016.0
50%,5.41,1.01216,1.069336,0.639333,0.459,0.096,0.187,2018.0
75%,6.223,1.295843,1.27385,0.79081,0.559,0.164,0.258538,2020.0
max,7.842,2.209,1.644,1.141,0.74,0.587,0.838075,2022.0


Таким образом, мы видим, какие стоит брать границы для графика

### Визуализация данных

Данные готовы к тому, чтобы строить из них графики

#### Диаграммы рассеяния

##### Изменение уровня счастья в зависимости от ВВП с течением времени

In [None]:
px.scatter(full_df, 
           x="GDP",
           y="score", 
           animation_frame="year", 
           animation_group="country",
           color="continent", 
           hover_name="country", # то, что будет появляться, когда мы наводим мышку на столбец
           range_x=[0,3], 
           range_y=[0,8])

##### Изменение уровня счастья в зависимости от социальной поддержки с течением времени

In [None]:
px.scatter(full_df, 
           x="support",
           y="score", 
           animation_frame="year", 
           animation_group="country",
           color="continent", 
           hover_name="country", # то, что будет появляться, когда мы наводим мышку на столбец 
           range_x=[0,2], 
           range_y=[0,8])

##### Изменение уровня счастья в зависимости от уровня здоровья с течением времени

In [None]:
px.scatter(full_df, 
           x="health",
           y="score", 
           animation_frame="year", 
           animation_group="country",
           color="continent", 
           hover_name="country", # то, что будет появляться, когда мы наводим мышку на столбец 
           range_x=[0,2], 
           range_y=[0,8])

##### Изменение уровня счастья в зависимости от свободы с течением времени

In [None]:
px.scatter(full_df, 
           x="freedom",
           y="score", 
           animation_frame="year", 
           animation_group="country",
           color="continent", 
           hover_name="country", # то, что будет появляться, когда мы наводим мышку на столбец 
           range_x=[0,1], 
           range_y=[0,8])

##### Изменение уровня счастья в зависимости от щедрости с течением времени

In [None]:
px.scatter(full_df, 
           x="generosity",
           y="score", 
           animation_frame="year", 
           animation_group="country",
           color="continent", 
           hover_name="country", # то, что будет появляться, когда мы наводим мышку на столбец 
           range_x=[0,1], 
           range_y=[0,8])

##### Изменение уровня счастья в зависимости от уровня коррупции с течением времени

In [None]:
px.scatter(full_df, 
           x="corruption",
           y="score", 
           animation_frame="year", 
           animation_group="country",
           color="continent", 
           hover_name="country", # то, что будет появляться, когда мы наводим мышку на столбец 
           range_x=[0,1], 
           range_y=[0,8])

#### Хитмеп

In [None]:
fig = px.imshow(full_df[['GDP',	'support',	'health',	'freedom',	'corruption',	'generosity']], text_auto=True, aspect="auto")
fig.show()

#### Корреляционная матрица

In [None]:
corr_matrix = full_df.corr()

In [None]:
corr_matrix

Unnamed: 0,score,GDP,support,health,freedom,corruption,generosity,year
score,1.0,0.742438,0.629771,0.727904,0.566554,0.407241,0.09144,0.057834
GDP,0.742438,1.0,0.493423,0.705941,0.408341,0.329331,-0.117928,0.228191
support,0.629771,0.493423,1.0,0.622583,0.331743,0.127802,0.007587,-0.049405
health,0.727904,0.705941,0.622583,1.0,0.33592,0.274839,-0.010325,0.00924
freedom,0.566554,0.408341,0.331743,0.33592,1.0,0.441157,0.191387,0.236795
corruption,0.407241,0.329331,0.127802,0.274839,0.441157,1.0,0.244566,0.018852
generosity,0.09144,-0.117928,0.007587,-0.010325,0.191387,0.244566,1.0,-0.26342
year,0.057834,0.228191,-0.049405,0.00924,0.236795,0.018852,-0.26342,1.0


In [None]:
fig = px.imshow(corr_matrix, text_auto=True, aspect="auto")
fig.show()

#### Гистограммы



In [None]:
fig = make_subplots(rows=4, cols=2)

trace0 = go.Histogram(x=full_df['score'], name='score')
trace1 = go.Histogram(x=full_df['GDP'], name='GDP')
trace2 = go.Histogram(x=full_df['support'], name='support')
trace3 = go.Histogram(x=full_df['health'], name='health')
trace4 = go.Histogram(x=full_df['freedom'], name='freedom')
trace5 = go.Histogram(x=full_df['generosity'], name='generosity')
trace6 = go.Histogram(x=full_df['corruption'], name='corruption')

fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 2, 1)
fig.append_trace(trace3, 2, 2)
fig.append_trace(trace4, 3, 1)
fig.append_trace(trace5, 3, 2)
fig.append_trace(trace6, 4, 1)

fig.show()

#### Карты

In [None]:
def draw_map(df, year):
  print('Уровень счастья в мире на', str(year), 'год на карте',)
  data = dict(
          type = 'choropleth',
          colorscale = 'Viridis',
          marker_line_width=1,
          locations = df['country'],
          locationmode = "country names",
          z = df['score'],
          text = df['country'],
          colorbar = {'title' : 'Happiness Score'},
          
        )
  layout = dict(geo = dict(projection = {'type':'mercator'}, showocean = False, showlakes = True, showrivers = True ))
  choromap = go.Figure(data = [data],layout = layout)
  iplot(choromap,validate=False)

In [None]:
new_dict_df = {
    '2015': df15,
    '2016': df16,
    '2017': df17,
    '2018': df18,
    '2019': df19,
    '2020': df20,
    '2021': df21,
    '2022': df22
}

In [None]:
for item in new_dict_df.items():
  draw_map(item[1], item[0])

Уровень счастья в мире на 2015 год на карте


Уровень счастья в мире на 2016 год на карте


Уровень счастья в мире на 2017 год на карте


Уровень счастья в мире на 2018 год на карте


Уровень счастья в мире на 2019 год на карте


Уровень счастья в мире на 2020 год на карте


Уровень счастья в мире на 2021 год на карте


Уровень счастья в мире на 2022 год на карте


In [None]:
def happiness_change(dict_df, year_from='2015', year_to='2022', threshold=0.01):
    dataframes = []
    
    for year, df in dict_df.items():
        temp = df.copy()
        temp['year'] = year
        dataframes.append(temp)
    
    df_change = pd.concat(dataframes)
    yfrom = dict_df[year_from]
    yto = dict_df[year_to]
    
    df_change['change'] = (yto['score'] - yfrom['score']) / yfrom['score']

    temp = df_change[np.abs(df_change['change']) > threshold]
    temp = df_change.sort_values('change')
    temp['year'] = temp['year'].astype(str)
    
    fig = px.bar(temp,
                 x = 'change',
                 y = 'country',
                 color = 'year',
                 orientation = 'h',
                 height = 1000,
                 template = 'gridon',
                 title = f'How happiness changed from year {year_from} to {year_to}')
    fig.show()
    return temp

In [None]:
change = happiness_change(new_dict_df)

## ML

Перейдём к машинному обучению. По имеющимся у нас признакам предскажем уровень счастья.

### Кодирование признаков

Сохраним отдельно датафрейм с незакодированными признаками

In [None]:
full_df_cat = full_df.copy()

Поскольку многие модели машинного обучения умеют работать только с числовыми признаками, категориальные признаки, такие как "страна" и "континент" закодируем.

Значений признака "страна" очень много, поэтому используем технику LabelEncoding. Таким образом, мы сможем избежать большой разряженности таблицы

In [None]:
labelencoder = LabelEncoder()
full_df['country'] = labelencoder.fit_transform(full_df['country'])

А значений признака "континент" не так много. Используем One-hot-encoding с аргументом drop_first=True, чтобы избежать мультиколлинеарности

In [None]:
full_df = pd.get_dummies(full_df, drop_first=True)

Посмотрим на получившицся датафрейм

In [None]:
full_df

Unnamed: 0,country,score,GDP,support,health,freedom,corruption,generosity,year,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America
0,150,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2015,0,1,0,0,0
1,65,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2015,0,1,0,0,0
2,40,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2015,0,1,0,0,0
3,116,7.522,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,2015,0,1,0,0,0
4,25,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2015,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,18,3.471,1.50300,0.81500,0.28000,0.57100,0.10200,0.01200,2022,0,0,0,0,0
142,130,3.268,0.78500,0.13300,0.46200,0.62100,0.54400,0.18700,2022,0,0,0,0,0
143,174,2.995,0.94700,0.69000,0.27000,0.32900,0.10500,0.10600,2022,0,0,0,0,0
144,84,2.955,1.39200,0.49800,0.63100,0.10300,0.03400,0.08200,2022,1,0,0,0,0


### Деление данных на выборки

Для начала выделим отдельно целевой признак

In [None]:
target = full_df['score']
features = full_df.drop('score', axis=1)

In [None]:
target_cat = full_df_cat['score']
features_cat = full_df_cat.drop('score', axis=1)

Теперь разделим данные на обучающую и валидационную выборку в соотношении 2:1

In [None]:
features_train, features_valid, target_train, target_valid = train_test_split(features, target, test_size=0.33, random_state=42)


In [None]:
features_train_cat, features_valid_cat, target_train_cat, target_valid_cat = train_test_split(features_cat, target_cat, test_size=0.33, random_state=42)


Посмотрим на размеры получившихся выборок

In [None]:
print("Размер обучающей выборки:", target_train.count())
print("Размер валидационной выборки:", target_valid.count())

Размер обучающей выборки: 823
Размер валидационной выборки: 406


Размеры соотвествуют процентам.

### Масштабирование признаков

Попробуем отмасштабировать признаки. На всякий случай сохраним отмасштабированные значения в отдельные переменные и попробуем обучить модели как на отмасштабированных данных, так и нет

In [None]:
scaler = StandardScaler()
scaler.fit(features_train)
features_train_scaled = scaler.transform(features_train)
features_valid_scaled = scaler.transform(features_valid)

### Обучение моделей

Напишем функцию, которая будет выводить основные метрики при проверке модели

In [None]:
def print_metrics(predicted_valid, target_valid):
    print("mse =", mean_squared_error(target_valid, predicted_valid))
    print("rmse =", mean_squared_error(target_valid, predicted_valid) ** 0.5)
    print("R2 =", r2_score(target_valid, predicted_valid))
    print("MAE =", mean_absolute_error(target_valid, predicted_valid))
    print()

#### Линейная регрессия

In [None]:
model_lr = LinearRegression()
model_lr.fit(features_train, target_train)
predicted_valid = model_lr.predict(features_valid)
print_metrics(predicted_valid, target_valid)
draw_lr = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_lr, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='Линейная регрессия')

mse = 0.2553169956332261
rmse = 0.5052890218807709
R2 = 0.7925987661488847
MAE = 0.39197638793640305



In [None]:
model_lr = LinearRegression()
model_lr.fit(features_train_scaled, target_train)
predicted_valid = model_lr.predict(features_valid_scaled)
print_metrics(predicted_valid, target_valid)
draw_lr = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_lr, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='Линейная регрессия')

mse = 0.2553169956332269
rmse = 0.5052890218807716
R2 = 0.7925987661488839
MAE = 0.39197638793640366



In [None]:
scores = cross_val_score(model_lr, features, target, cv=5)
final_score = pd.Series(scores).mean()
print(scores)
print(final_score)
draw_lr = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_lr, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='Линейная регрессия')

[0.69169911 0.83091664 0.77533382 0.78483897 0.57948583]
0.7324548743714294


#### Случайный лес

In [None]:
model_rf = RandomForestRegressor(random_state = 42)
model_rf.fit(features_train, target_train)
predicted_valid = model_rf.predict(features_valid)
print_metrics(predicted_valid, target_valid)
draw_rf = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_rf, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='Случайный лес')

mse = 0.1532755345775744
rmse = 0.3915041948403291
R2 = 0.8754899378643587
MAE = 0.3035692132813641



In [None]:
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(features_train_scaled, target_train)
predicted_valid = model_rf.predict(features_valid_scaled)
print_metrics(predicted_valid, target_valid)
draw_rf = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_rf, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='Случайный лес')

mse = 0.15317034946967029
rmse = 0.3913698371996369
R2 = 0.8755753827095324
MAE = 0.3033210039336637



In [None]:
scores = cross_val_score(model_rf, features, target, cv=5)
final_score = pd.Series(scores).mean()
print(scores)
print(final_score)
draw_rf = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_rf, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='Случайный лес')

[0.74900938 0.88883058 0.90570905 0.82664051 0.56892146]
0.7878221973539723


#### Xgboost

In [None]:
model_xgb = XGBRegressor(random_state=42)
model_xgb.fit(features_train, target_train)
predicted_valid = model_xgb.predict(features_valid)
print_metrics(predicted_valid, target_valid)
draw_xgb = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_xgb, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='XGBoost')

mse = 0.17959596768074176
rmse = 0.4237876445588542
R2 = 0.8541091038640457
MAE = 0.32577174958462324



In [None]:
model_xgb = XGBRegressor(random_state=42)
model_xgb.fit(features_train_scaled, target_train)
predicted_valid = model_xgb.predict(features_valid_scaled)
print_metrics(predicted_valid, target_valid)
draw_xgb = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_xgb, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='XGBoost')

mse = 0.17959596768074176
rmse = 0.4237876445588542
R2 = 0.8541091038640457
MAE = 0.32577174958462324



#### Градиентный бустинг

In [None]:
model_grbr = GradientBoostingRegressor(random_state=42)
model_grbr.fit(features_train, target_train)
predicted_valid = model_grbr.predict(features_valid)
print_metrics(predicted_valid, target_valid)
draw_grbr = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_grbr, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='Градиентный бустинг')

mse = 0.181010586661839
rmse = 0.42545338952914574
R2 = 0.852959968760912
MAE = 0.3305288392696412



In [None]:
model_grbr = GradientBoostingRegressor(random_state=42)
model_grbr.fit(features_train_scaled, target_train)
predicted_valid = model_grbr.predict(features_valid_scaled)
print_metrics(predicted_valid, target_valid)
draw_grbr = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_grbr, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='Градиентный бустинг')

mse = 0.18112131461786235
rmse = 0.42558349899621617
R2 = 0.85287002130307
MAE = 0.33065897852937537



In [None]:
scores = cross_val_score(model_grbr, features, target, cv=5)
final_score = pd.Series(scores).mean()
print(scores)
print(final_score)
draw_grbr = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_grbr, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='Градиентный бустинг')

[0.70226254 0.85437388 0.86165735 0.83851872 0.5963078 ]
0.7706240587476809


#### Catboost

In [None]:
cat = CatBoostRegressor(random_state=42)
cat.fit(features_train, target_train,verbose=False)
predicted_valid = cat.predict(features_valid)
print_metrics(predicted_valid, target_valid)
draw_cat = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_cat, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='CatBoost')

mse = 0.10615612338936649
rmse = 0.32581608829118075
R2 = 0.9137663714191824
MAE = 0.24708691949256875



In [None]:
cat = CatBoostRegressor(random_state=42, cat_features=['country', 'continent'])
cat.fit(features_train_cat, target_train_cat,verbose=False)
predicted_valid = cat.predict(features_valid_cat)
print_metrics(predicted_valid, target_valid_cat)
draw_cat = pd.DataFrame({
    'Actual':target_valid_cat,
    'Predict':predicted_valid
})
px.scatter(draw_cat, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='CatBoost')

mse = 0.12131087393391309
rmse = 0.3482971058362574
R2 = 0.9014557379110235
MAE = 0.26241193955676645



In [None]:
cat = CatBoostRegressor(random_state=42)
cat.fit(features_train_scaled, target_train,verbose=False)
predicted_valid = cat.predict(features_valid_scaled)
print_metrics(predicted_valid, target_valid)
draw_cat = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_cat, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='CatBoost')

mse = 0.1061710079220979
rmse = 0.3258389294146694
R2 = 0.9137542802912646
MAE = 0.2471272686914673



In [None]:
scores = cross_val_score(cat, features, target, cv=5)
final_score = pd.Series(scores).mean()
print(scores)
print(final_score)
draw_cat = pd.DataFrame({
    'Actual':target_valid,
    'Predict':predicted_valid
})
px.scatter(draw_cat, 
           x='Actual', 
           y='Predict', 
           trendline='ols', 
           title='CatBoost')

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
6:	learn: 0.9308246	total: 16.5ms	remaining: 2.33s
7:	learn: 0.9092009	total: 18.7ms	remaining: 2.32s
8:	learn: 0.8889182	total: 21.1ms	remaining: 2.32s
9:	learn: 0.8684317	total: 23.4ms	remaining: 2.32s
10:	learn: 0.8486988	total: 25.7ms	remaining: 2.31s
11:	learn: 0.8309570	total: 28.1ms	remaining: 2.31s
12:	learn: 0.8137395	total: 30.4ms	remaining: 2.31s
13:	learn: 0.7962945	total: 32.7ms	remaining: 2.3s
14:	learn: 0.7791742	total: 34.8ms	remaining: 2.29s
15:	learn: 0.7643180	total: 37.2ms	remaining: 2.29s
16:	learn: 0.7497145	total: 39.5ms	remaining: 2.29s
17:	learn: 0.7351506	total: 41.9ms	remaining: 2.29s
18:	learn: 0.7198412	total: 44.2ms	remaining: 2.28s
19:	learn: 0.7074396	total: 47.8ms	remaining: 2.34s
20:	learn: 0.6940554	total: 50.1ms	remaining: 2.34s
21:	learn: 0.6818039	total: 52.5ms	remaining: 2.33s
22:	learn: 0.6695816	total: 55ms	remaining: 2.33s
23:	learn: 0.6577322	total: 57.3ms	remain