# Dataset Research

> We will load the data with SQL queries, simply to get some practice with them.

In [15]:
import psycopg
import pandas as pd

In [16]:
# The file is read as three consecutive lines: user name, password, database name.
with open("../../../JupyterFiles/dbpass.txt", "r") as pwrdfile:
    usrnm, pwrd, dbnm = (pwrdfile.readline().strip() for _ in range(3))

In [17]:
with psycopg.connect(dbname=dbnm, user=usrnm, password=pwrd) as conn:
    with conn.cursor() as cur:

        # Resolve the unqualified table names below against the project schema.
        cur.execute("SET search_path TO mvp_schema")

        # Attach the series metadata to every observation row.
        cur.execute("""
            SELECT * FROM mvp_dataset md
            JOIN series_list sl on sl.series_id = md.series_id
            """)

        dataset = cur.fetchall()

        # Take the column names from the metadata of the query that produced
        # the rows. A separate, unordered catalog query does not guarantee the
        # same column order as `SELECT *` and costs an extra round trip.
        # Kept as a list of 1-tuples so code reading `tup[0]` keeps working.
        colnames = [(column.name,) for column in cur.description]

        # Passing the names here keeps the frame labelled even when the query
        # returns no rows.
        df = pd.DataFrame(data=dataset, columns=[name for (name,) in colnames])

In [18]:
# Pull the first field out of every fetched row and use the result as the header.
coln = []
for name_row in colnames:
    coln.append(name_row[0])
df.columns = coln

In [19]:
# Preview the first rows of the joined frame.
df.head(n=5)

Unnamed: 0,country_code,series_id,2000,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,series_id.1,series_name
0,ARG,1,19.366,18.983,18.756,18.453,18.352,18.353,18.194,18.005,...,17.729,17.632,17.504,17.346,16.824,16.206,15.187,14.783,1,"Birth rate, crude (per 1,000 people)"
1,AUT,1,9.8,9.4,9.7,9.5,9.7,9.5,9.4,9.2,...,9.4,9.4,9.6,9.8,10.0,10.0,9.7,9.6,1,"Birth rate, crude (per 1,000 people)"
2,BLR,1,9.4,9.2,8.9,9.0,9.1,9.3,9.9,10.7,...,12.2,12.5,12.5,12.5,12.4,10.8,9.9,9.3,1,"Birth rate, crude (per 1,000 people)"
3,BEL,1,11.4,11.2,10.9,11.0,11.3,11.4,11.6,11.7,...,11.5,11.3,11.2,10.8,10.8,10.5,10.4,10.2,1,"Birth rate, crude (per 1,000 people)"
4,CRI,1,19.788,19.104,18.12,17.656,17.228,16.99,16.797,16.823,...,15.483,15.093,14.852,14.522,14.145,13.926,13.562,12.847,1,"Birth rate, crude (per 1,000 people)"


In [20]:
# Remove every column labelled `series_id` (the join brings in one per table).
# Rebinding instead of mutating in place, together with `errors="ignore"`,
# keeps this cell safe to re-run once the columns are already gone.
df = df.drop(columns="series_id", errors="ignore")

In [21]:
# Check the frame again now that the `series_id` columns were dropped.
df.head(n=5)

Unnamed: 0,country_code,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,series_name
0,ARG,19.366,18.983,18.756,18.453,18.352,18.353,18.194,18.005,17.812,...,17.806,17.729,17.632,17.504,17.346,16.824,16.206,15.187,14.783,"Birth rate, crude (per 1,000 people)"
1,AUT,9.8,9.4,9.7,9.5,9.7,9.5,9.4,9.2,9.3,...,9.3,9.4,9.4,9.6,9.8,10.0,10.0,9.7,9.6,"Birth rate, crude (per 1,000 people)"
2,BLR,9.4,9.2,8.9,9.0,9.1,9.3,9.9,10.7,11.1,...,11.5,12.2,12.5,12.5,12.5,12.4,10.8,9.9,9.3,"Birth rate, crude (per 1,000 people)"
3,BEL,11.4,11.2,10.9,11.0,11.3,11.4,11.6,11.7,11.9,...,11.7,11.5,11.3,11.2,10.8,10.8,10.5,10.4,10.2,"Birth rate, crude (per 1,000 people)"
4,CRI,19.788,19.104,18.12,17.656,17.228,16.99,16.797,16.823,16.899,...,15.772,15.483,15.093,14.852,14.522,14.145,13.926,13.562,12.847,"Birth rate, crude (per 1,000 people)"


So, the main question is: do we have any correlations?

In [22]:
import seaborn as sns
import matplotlib.pyplot as plt
# Import from the public namespace; `scipy.stats.stats` is a deprecated private module.
from scipy.stats import pearsonr

  from scipy.stats.stats import pearsonr


In [23]:
# Distinct country codes present in the data.
df["country_code"].unique()

array(['ARG', 'AUT', 'BLR', 'BEL', 'CRI', 'DNK', 'DOM', 'SLV', 'FIN',
       'GRC', 'LUX', 'MDA', 'NOR', 'PER', 'RUS', 'SWE', 'THA', 'USA'],
      dtype=object)

In [24]:
def dataset_analyzer(data=None, p_threshold=0.05, r_threshold=0.5):
    """Collect, per country, the series that correlate with suicide mortality.

    The input frame is split by ``country_code``. Each slice is reshaped so
    that every ``series_name`` becomes a float column and every remaining
    input column becomes a row. Each series whose name does not contain
    ``"Suicide"`` is then compared, with ``pearsonr``, against the female,
    male and total suicide mortality series of the same country.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a ``country_code`` column, a ``series_name`` column and
        value columns that can be cast to float. When omitted, the
        notebook-level ``df`` is used.
    p_threshold : float, default 0.05
        A correlation is kept only when its p-value is strictly below this.
    r_threshold : float, default 0.5
        A correlation is kept only when the absolute value of its coefficient
        is strictly above this.

    Returns
    -------
    dict
        ``{"Female" | "Male" | "Full": {series_name: {country_code: result}}}``
        where ``result`` is the object returned by ``pearsonr``. Every
        non-suicide series seen in any country has an entry in each group;
        the entry is an empty dict when no country passed both thresholds.

    Raises
    ------
    KeyError
        If a country that has at least one non-suicide series lacks one of the
        three suicide mortality series.
    """
    if data is None:
        data = df  # fall back to the frame loaded earlier in the notebook

    # Output group -> name of the series every other series is compared with.
    targets = {
        "Female": 'Suicide mortality rate, female (per 100,000 female population)',
        "Male": 'Suicide mortality rate, male (per 100,000 male population)',
        "Full": 'Suicide mortality rate (per 100,000 population)',
    }

    # One frame per country: series names as columns, values cast to float.
    dfdict = {
        cc: (
            data.loc[data.country_code == cc]
            .drop(["country_code"], axis=1)
            .set_index("series_name")
            .transpose()
            .astype(float)
        )
        for cc in data.country_code.unique()
    }

    # Prepare an (initially empty) slot for every non-suicide series. The keys
    # come from all countries rather than a single hard-coded one, so the
    # function does not depend on any particular country being present.
    serieslist = {key: {} for key in targets}
    for dataframe in dfdict.values():
        for s in dataframe.columns:
            if "Suicide" not in s:
                for key in serieslist:
                    serieslist[key].setdefault(s, {})

    # Keep the correlations that pass both the p-value and the strength filter.
    for country, dataframe in dfdict.items():
        candidates = [col for col in dataframe.columns if "Suicide" not in col]
        if not candidates:
            continue  # nothing to compare for this country
        for key, target in targets.items():
            target_values = dataframe[target].to_list()
            for col in candidates:
                # Computed once and stored as-is (no second pearsonr call).
                pearsonr_val = pearsonr(dataframe[col].to_list(), target_values)
                if pearsonr_val[1] < p_threshold and abs(pearsonr_val[0]) > r_threshold:
                    serieslist[key][col][country] = pearsonr_val

    return serieslist

In [26]:
import numpy as np

correlations = dataset_analyzer()
country_total = len(df.country_code.unique())
pad = " " * 12  # indentation of the detail lines in the report
for sex in correlations:
    print(f"{sex}\n")
    for series, country_data in correlations[sex].items():
        # Only report series that correlate in more than 10 countries.
        if len(country_data) <= 10:
            continue
        results = list(country_data.values())
        mean_stat = np.mean([res[0] for res in results])
        mean_pval = np.mean([res[1] for res in results])
        print(
            f"{series}\n"
            f"{pad}{len(country_data)}/{country_total} countries correlate\n"
            f"{pad}Average stval = {mean_stat}\n"
            f"{pad}Average pval = {mean_pval}"
        )

Female

Birth rate, crude (per 1,000 people)
            11/18 countries correlate
            Average stval = 0.06242567743033596
            Average pval = 0.004963032962615992
Domestic general government health expenditure per capita (current US$)
            12/18 countries correlate
            Average stval = -0.47299812612578945
            Average pval = 0.002729814968221174
GDP per capita (current US$)
            12/18 countries correlate
            Average stval = -0.45330964825986264
            Average pval = 0.0024830183122721157
Life expectancy at birth, female (years)
            11/18 countries correlate
            Average stval = -0.4379219506358206
            Average pval = 8.123928002066727e-05
Life expectancy at birth, male (years)
            12/18 countries correlate
            Average stval = -0.441965047631589
            Average pval = 0.0019204356110995762
Life expectancy at birth, total (years)
            11/18 countries correlate
            Average st

In [27]:
from scipy.spatial.distance import cdist