In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from scipy.stats import pearsonr
import math
from ydata_profiling import ProfileReport

# Import the Nobel laureates per capita dataset

In [None]:
# Read the Nobel per-capita table; the file is tab-separated.
df_nobel = pd.read_csv(filepath_or_buffer="nobel_per_capita.csv", sep="\t")

In [None]:
# Build a profiling report for the Nobel dataframe and embed it in the
# notebook as an iframe.
nobel_report_options = {"title": "Profiling Report"}
profile = ProfileReport(df_nobel, **nobel_report_options)
profile.to_notebook_iframe()

# Plot the Nobel Laureates Ratio per country

In [None]:
# Horizontal bar chart: one bar per "Entity", bar length = "Laureates ratio".
df_nobel.plot.barh(
    x="Entity",
    y="Laureates ratio",
    figsize=(14, 14),
)

# Import the chocolate consumption dataset

In [None]:
# Per-capita cocoa bean consumption, in kilograms per year.
df_cocoa_raw = pd.read_csv(filepath_or_buffer="chocolate_consumption_per_capita.csv")

## Preprocess the dataset

* pivot on the year column
* replace NA with 0
* rename countries to match the names used in the laureates dataframe

In [None]:
# Entity names in the cocoa data that are renamed to match the laureates
# dataframe.
# NOTE(review): the earlier notes also listed "PLO => Palestin", but no such
# rename was ever applied here -- confirm whether it is still needed and what
# the exact target spelling in the laureates data is.
cocoa_entity_renames = {
    "Czechia": "Czech Republic",
    "Democratic Republic of Congo": "DR Congo",
    "Timor": "East Timor",
}

# Reshape to one row per entity and one column per year, fill missing values
# with 0.0, then align the entity names. Written as a single chain so that
# re-running the cell always rebuilds df_cocoa from df_cocoa_raw.
df_cocoa = (
    df_cocoa_raw
    .pivot(index="Entity", columns="Year", values="Cocoa")
    .fillna(0.0)
    .rename(index=cocoa_entity_renames)
)

## Intersection of countries

In [None]:
# Distinct entity names on each side: the "Entity" column of the laureates
# frame and the index of the pivoted cocoa frame.
nobel_entities = set(df_nobel["Entity"].unique())
cocoa_entities = set(df_cocoa.index.unique())

# Keep only the entities that appear in both datasets.
common_entities = nobel_entities & cocoa_entities

In [None]:
# Horizontal bar chart of the 2009 cocoa values for the entities present in
# both datasets, sorted in descending order.
#
# * sorted(common_entities) replaces list(common_entities): a set of strings
#   has no stable iteration order across interpreter sessions, so entities
#   with tied values (e.g. the 0.0 fill-ins) could be drawn in a different
#   order after every kernel restart.
# * The former x="Entity" / y="Cocoa" keywords were dropped: the selection is
#   a Series, and pandas ignores x/y when plotting a Series.
(
    df_cocoa.loc[sorted(common_entities), 2009]
    .sort_values(ascending=False)
    .plot(kind="barh", figsize=(14, 14))
)

In [None]:
# 2009 cocoa values for the shared entities. sorted(...) is used instead of
# list(...) because set iteration order is not stable across interpreter
# sessions; sorting makes the row order of the merged frame reproducible.
df_cocoa_2009 = df_cocoa.loc[sorted(common_entities), 2009]

# Laureate rows restricted to the same entities.
df_nobel_2015 = df_nobel[df_nobel["Entity"].isin(common_entities)]

# Inner join on the entity name. The cocoa Series carries the year (2009) as
# its name, so the resulting column is renamed to "Cocoa".
df = pd.merge(df_cocoa_2009, df_nobel_2015, how="inner", on="Entity").rename(
    columns={2009: "Cocoa"}
)

In [None]:
# Time-series view of the cocoa data: after the transpose, rows are years and
# columns are the entities shared with the laureates data.
# sorted(...) replaces list(...) so the column order is the same on every
# run (set iteration order is not stable across interpreter sessions).
df_cocoa_ts = df_cocoa.loc[sorted(common_entities), :].T

# Turn the year labels into datetimes (parsed with the "%Y" format).
df_cocoa_ts.index = pd.to_datetime(df_cocoa_ts.index, format="%Y")

In [None]:
# Profile the cocoa time-series frame with time-series mode enabled and embed
# the report in the notebook as an iframe.
ts_report_options = {"tsmode": True, "title": "Time-Series EDA"}
profile_ts = ProfileReport(df_cocoa_ts, **ts_report_options)
profile_ts.to_notebook_iframe()

# Report on the merged datasets

In [None]:
# Profile the merged frame, limited to four columns and with two of them
# given more readable labels for the report.
report_columns = ["Entity", "Cocoa", "Laureates ratio", "Population"]
report_labels = {"Entity": "Country", "Cocoa": "Cocoa Consumption"}
df_report = df[report_columns].rename(columns=report_labels)

ProfileReport(df_report, title="Profiling Report").to_notebook_iframe()

In [None]:
# Scatter plot of "Cocoa" against "Laureates ratio", leaving out the
# "Saint Lucia" row.
not_saint_lucia = df["Entity"] != "Saint Lucia"
df.loc[not_saint_lucia].plot.scatter(x="Cocoa", y="Laureates ratio", figsize=(12, 12))

In [None]:
# Fit an ordinary least-squares line "Laureates ratio" ~ "Cocoa", leaving out
# the "Saint Lucia" row. The filter is computed once and shared by x and y.
df_fit = df[df["Entity"] != "Saint Lucia"]

# Double brackets keep x and y as single-column DataFrames (2-D inputs).
x = df_fit[["Cocoa"]]
y = df_fit[["Laureates ratio"]]

linear_regression = linear_model.LinearRegression()
linear_regression.fit(x, y)

# Slope and intercept of the fitted line.
print(linear_regression.coef_, linear_regression.intercept_)

# Coefficient of determination on the fitting data.
r_squared = linear_regression.score(x, y)

# sqrt(R^2) alone only yields |r|. For a one-feature fit with an intercept
# the correlation has the same sign as the slope, so the sign is restored
# from the fitted coefficient. R^2 is clamped at 0 so that a tiny negative
# value caused by floating-point rounding cannot make math.sqrt raise.
slope = float(linear_regression.coef_.ravel()[0])
correlation = math.copysign(math.sqrt(max(r_squared, 0.0)), slope)
print(r_squared, correlation)

In [None]:
# Observed points in red, fitted regression line in blue.
fitted = linear_regression.predict(x)
plt.scatter(x, y, color="red")
plt.plot(x, fitted, color="blue")

In [None]:
# Correlation between "Cocoa" and "Laureates ratio".
#
# The two correlation matrices used to be bare expressions in the middle of
# the cell, so their results were computed and then thrown away (a notebook
# only displays the last expression). They are now printed explicitly.
#
# They are also computed on the same rows as the regression and pearsonr
# below (x and y, which exclude "Saint Lucia") so that all reported numbers
# describe the same sample.
xy = x.join(y)
pearson_matrix = xy.corr(method="pearson")
spearman_matrix = xy.corr(method="spearman")
print(pearson_matrix)
print(spearman_matrix)

# Pearson correlation with its p-value and a 95% confidence interval.
p_result = pearsonr(x.values.ravel(), y.values.ravel())
p_ci = p_result.confidence_interval(confidence_level=0.95)
print(p_result.statistic, p_result.pvalue, p_ci)