# 02 - Data Analysis

Exploratory data analysis and clustering on the final DataFrame (`data/final/final.csv`).

## Preliminaries

### System 

In [None]:
# Move to the project root so that the relative path "data/final/final.csv" resolves.
# Guarded so that re-running this cell does not climb one more directory each time.
import os
from pathlib import Path

if not Path("data/final/final.csv").exists():
    os.chdir("..")

In [None]:
# Show the current working directory (explicit call instead of the `pwd` automagic).
import os

os.getcwd()

### Imports

In [None]:
import os, sys, logging
from IPython.display import display, HTML

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import missingno as msno

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

### Data

In [None]:
# Load the final dataset; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="data/final/final.csv")

## First Tour

### Display

In [None]:
# First five rows.
df.head(n=5)

In [None]:
# Last five rows.
df.tail(n=5)

In [None]:
# Random peek at up to 10 rows.
# Seeded so the displayed sample is the same on every run, and capped at the
# frame length because `sample` raises when asked for more rows than exist.
df.sample(n=min(10, len(df)), random_state=42)

### Structure

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Per-column dtype and non-null count, plus memory usage.
df.info()

In [None]:
# dtype of every column.
df.dtypes

In [None]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

In [None]:
# Number of distinct (non-null) values per column.
df.nunique()

### Missing Values

In [None]:
# Display the frame before any missing-value handling.
df

In [None]:
# Count of missing values in each row.
df.isnull().sum(axis="columns")

In [None]:
# Number of missing values per row.
tmp = df.isnull().sum(axis="columns")
# Keep only the rows that have no missing value at all.
tmp.loc[tmp.eq(0)]

In [None]:
# Keep only the fully populated rows.
# `dropna()` selects the same rows as filtering on a per-row NaN count of zero,
# but does not depend on a scratch variable computed in another cell, which
# could be stale if cells are run out of order.
df = df.dropna()
df

### Data Inspection

In [None]:
# Summary statistics of the numeric columns, rounded to 4 decimals.
df.describe().round(decimals=4)

In [None]:
# Exclude Malta from the analysis.
# NOTE(review): the reason is not recorded in the notebook (presumably an
# outlier in the statistics above) — TODO document it.
# `.copy()` makes the result an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
df = df.loc[df["country name"] != "Malta", :].copy()
df

## EDA

### Numericals

In [None]:
# Pairwise correlations between the numeric columns.
corr = df.select_dtypes(include=np.number).corr()
# Hide the upper triangle (diagonal included): it mirrors the lower one.
mask = np.triu(np.ones(corr.shape, dtype=bool))
sns.heatmap(
    data=corr,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)

In [None]:
# Scatter matrix of the numeric columns (lower triangle only).
sns.pairplot(data=df, corner=True)

In [None]:
# Same scatter matrix, with points coloured by region.
sns.pairplot(data=df, hue="uss30_region_name", corner=True)

In [None]:
# One box plot per numeric column, split by region.
numeric_columns = df.select_dtypes(include=np.number).columns
for column in numeric_columns:
    px.box(df, x="uss30_region_name", y=column).show()

### Clustering

In [None]:
# Standardise the numeric columns (zero mean, unit variance) before clustering.
X_num = df.select_dtypes(np.number)
Xs = StandardScaler().fit_transform(X_num)
# Carry over the original row index as well as the column names: `df` has a
# filtered (non-contiguous) index, and a fresh RangeIndex here would silently
# misalign any later index-based join or assignment between `Xs` and `df`.
Xs = pd.DataFrame(Xs, columns=X_num.columns, index=X_num.index)
Xs.head()

In [None]:
# Scan candidate cluster counts and record three quality indicators for each:
# inertia (lower = tighter clusters), Davies-Bouldin (lower is better) and
# silhouette (higher is better).
score_list = []
for k in range(2, 10):
    # Fixed seed and explicit n_init so the scores are reproducible from run
    # to run and do not depend on the installed scikit-learn default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    # fit_predict is equivalent to fit() followed by predict() on the same data.
    labels = kmeans.fit_predict(Xs)
    score_list.append(
        {
            "k": k,
            "interia": kmeans.inertia_,
            "bd": davies_bouldin_score(Xs, labels),
            "silhouette": silhouette_score(Xs, labels),
        }
    )

In [None]:
# Turn the per-k records into a table (one row per k, one column per indicator).
score_list = pd.DataFrame(data=score_list)
score_list

In [None]:
# One line chart per indicator; the first column ("k") is the x axis, so skip it.
kpi_columns = score_list.columns[1:]
for kpi in kpi_columns:
    px.line(score_list, x="k", y=kpi, title=kpi).show()

In [None]:
# Final clustering.
# NOTE(review): k = 6 is presumably read off the indicator curves above —
# TODO state the criterion used.
k = 6

# Fixed seed and explicit n_init: without them both the cluster memberships and
# the cluster ids (c_0, c_1, ...) can change on every run, which invalidates
# any written interpretation of a given cluster.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
labels = kmeans.fit_predict(Xs)

# Store the labels as strings ("c_0", "c_1", ...) so the column is treated as
# categorical by the plots and is skipped by numeric column selections.
df["cluster"] = [f"c_{label}" for label in labels]

df

In [None]:
# Order rows by cluster; rebinding instead of `inplace=True` keeps the cell
# free of in-place mutation of a frame that earlier cells already displayed.
df = df.sort_values(by="cluster", ascending=True)

# Show the members of each cluster and their summary statistics.
# groupby yields the clusters in sorted order and filters each one only once.
for label, cluster_df in df.groupby("cluster", sort=True):
    print(f"Cluster {label}")
    print("____________________")
    display(cluster_df)
    display(cluster_df.describe().round(4))

In [None]:
# One box plot per numeric column, split by cluster.
numeric_columns = df.select_dtypes(include=np.number).columns
for col in numeric_columns:
    px.box(df, x="cluster", y=col, title=col).show()

### Maps

In [None]:
# Country polygons from the Natural Earth low-resolution dataset bundled with geopandas.
# NOTE: `geopandas.datasets` was deprecated in geopandas 0.14 and removed in
# 1.0, so this cell requires geopandas < 1.0.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# The bundled dataset ships the placeholder "-99" as the ISO alpha-3 code of a
# few countries, France and Norway among them. Restore the real codes so these
# countries are not silently dropped by ISO-code filters and merges.
iso_a3_fixes = {"France": "FRA", "Norway": "NOR"}
for country_name, iso_code in iso_a3_fixes.items():
    needs_fix = (world["name"] == country_name) & (world["iso_a3"] == "-99")
    world.loc[needs_fix, "iso_a3"] = iso_code

world.head()

In [None]:
# Keep only the polygons whose ISO alpha-3 code appears in the analysed data.
eu_countries = df["country code"].tolist()
in_dataset = world["iso_a3"].isin(eu_countries)
eu = world[in_dataset]
eu.head()

In [None]:
# Attach the analysis columns to the country polygons (inner join on ISO alpha-3 code).
merged_data = eu.merge(
    right=df,
    how="inner",
    left_on="iso_a3",
    right_on="country code",
)
merged_data.head()

In [None]:
# Map of the cluster assigned to each country.
# "cluster" is a categorical (string) column, so plotly uses a discrete colour
# sequence; the continuous colour scale passed previously had no effect and
# has been removed.
fig = px.choropleth(
    merged_data,
    geojson=merged_data.geometry,
    locations=merged_data.index,
    color="cluster",
    # Stable, sorted legend order regardless of the row order after the merge.
    category_orders={"cluster": sorted(merged_data["cluster"].unique())},
    # Show the country name on hover instead of only the row number.
    hover_name="country name",
    scope="europe",
    title="Cluster by country",
)
fig.show()

In [None]:
# Map of the "ms.km2" indicator by country.
# The previous `labels={"your_GDP_column": "GDP"}` was a leftover placeholder
# that matched no column and therefore had no effect; it has been removed.
fig = px.choropleth(
    merged_data,
    geojson=merged_data.geometry,
    locations=merged_data.index,
    color="ms.km2",
    color_continuous_scale="Viridis",
    # Show the country name on hover instead of only the row number.
    hover_name="country name",
    scope="europe",
    title="ms.km2 by country",
)
fig.show()