# Analisi strutturale dei datasets

In [17]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors
from itertools import combinations
import shutil
import seaborn as sns

In [2]:
# Shared colour palette for the plots (hex RGB strings).
lilla = color_of_bar1 = "#c8a2c8"  # lilac, also the fill of the first bar series
color_of_bar2 = "#6495ED"  # fill of the second bar series
color_of_edge = "#000000"  # black outline

In [19]:
# Custom colormap: four purple shades, from light to dark, anchored at the
# normalised positions of the four values of a 2x2 matrix.
matrix = np.array([[149030, 34], [7442, 12]])
norm = matplotlib.colors.Normalize(matrix.min(), matrix.max())

# Matrix values in ascending order; each one becomes a colour stop.
boundaries = sorted(matrix.flatten().tolist())
shades = ("#dcc4dc", "#c8a2c8", "#93779c", "#6c4675")
colors = [[norm(value), shade] for value, shade in zip(boundaries, shades)]

my_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

In [3]:
# Location of the input JSON and of the folder that receives the plots
# (relative paths written with Windows separators).
dataset_path = "..\\..\\Dataset\\Original\\datasets.json"
plot_path = ".\\plot\\"

# Always start from an empty plot folder: remove a previous one if present,
# then (re)create it.
if os.path.exists(plot_path):
    shutil.rmtree(plot_path)
os.mkdir(plot_path)

### Import dataset JSON ed estrazione numero righe, colonne e celle per ogni dataset

In [4]:
# Parse the datasets JSON file into `json_object`.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open(dataset_path, 'r', encoding='utf-8') as f:
    json_object = json.load(f)

In [5]:
def extract_data_from_df(df):
    """Return the number of rows, columns and cells of a DataFrame.

    The result is a dict with the keys ``"rows"``, ``"columns"`` and
    ``"cells"``, where cells is rows multiplied by columns.
    """
    n_rows = df.shape[0]
    n_columns = df.shape[1]
    return dict(rows=n_rows, columns=n_columns, cells=n_rows * n_columns)

In [6]:
# One row per dataset: [group_name, dataset_name, rows, columns, cells].
info_datasets = []
for entry in json_object:
    df = pd.read_json(entry["dataset"], orient="index", encoding="utf-8")
    info = extract_data_from_df(df)
    info_datasets.append(
        [
            entry["group_name"],
            entry["dataset_name"],
            *(info[field] for field in ("rows", "columns", "cells")),
        ]
    )

In [7]:
def get_info_order_by(list_of_info, c, reverse=False):
    """Build a DataFrame from a list of rows, sorted on one position.

    ``list_of_info`` is a list of 5-item rows, ``c`` is the index of the
    item to sort on, and ``reverse`` selects descending (True) or
    ascending (False) order. The input list is left untouched.
    """
    ordered_rows = list(list_of_info)
    ordered_rows.sort(key=lambda row: row[c], reverse=reverse)
    column_names = ["group_name", "dataset_name", "rows", "columns", "cells"]
    return pd.DataFrame(ordered_rows, columns=column_names)

In [30]:
# Rank the datasets on item 2 of each row (the row count), largest first.
df_info = get_info_order_by(info_datasets, c=2, reverse=True)
display(df_info)

Unnamed: 0,group_name,dataset_name,rows,columns,cells
0,08-slytherin,disfold,16097,6,96582
1,00-avengers,valuetoday,10682,18,192276
2,03-gren,valuetoday,10682,10,106820
3,02-GioPonSpiz,valuetoday,10680,6,64080
4,04-iGMM,valuetoday,10000,12,120000
5,08-slytherin,valuetoday,10000,8,80000
6,06-MarScoToc,ambitiobox,9899,5,49495
7,01-DDD,companiesmarketcap,7217,9,64953
8,02-GioPonSpiz,companiesmarketcap,7200,5,36000
9,03-gren,companiesmarketcap,7163,4,28652


### Plot numero di valori distinti e nulli per colonna, per ogni dataset

In [24]:
# For every dataset, count the distinct and the null values of each column.
unique_null_dict = []
for entry in json_object:
    df = pd.read_json(entry["dataset"], orient="index", encoding="utf-8")

    tmp_unique = {}
    tmp_null = {}
    for key in df:
        column = df[key]
        try:
            unique_value = len(pd.unique(column))
        except TypeError:
            # pd.unique could not process this column (presumably because it
            # holds unhashable values such as lists or dicts); fall back to
            # the column length as the count.
            unique_value = len(column)
        tmp_unique[key] = unique_value
        tmp_null[key] = column.isnull().sum()

    unique_null_dict.append({
        "group_name": entry["group_name"],
        "dataset_name": entry["dataset_name"],
        # Key view of tmp_unique, i.e. the column names in DataFrame order.
        "column_name": tmp_unique.keys(),
        "unique_value": tmp_unique,
        "null_value": tmp_null
    })

In [28]:
# Show the column names of every dataset named "companiesmarketcap".
target_dataset = "companiesmarketcap"
for entry in unique_null_dict:
    if entry["dataset_name"] != target_dataset:
        continue
    print(entry["column_name"])

dict_keys(['id', 'name', 'rank', 'market_cap', 'country', 'share_price', 'change_1_day', 'change_1_year', 'categories'])
dict_keys(['Unnamed: 0', 'name', 'rank', 'marketcap', 'country', 'share_price', 'change1d', 'change1y', 'categories'])
dict_keys(['name', 'codice', 'pricecap', 'price', 'country'])
dict_keys(['name', 'market_capitalization_USD', 'price', 'country'])
dict_keys(['name', 'code', 'rank', 'marketcap', 'country', 'sharePrice', 'change(1day)', 'change(1year)', 'categories'])
dict_keys(['Unnamed: 0', 'Name', 'Symbol', 'MarketCap', 'Price', 'Country'])
dict_keys(['Name', 'Master Cap', 'Country', 'Rank', 'Share Price'])
dict_keys(['Unnamed: 0', 'URL', 'ID', 'Name', 'Company code', 'Marketcap', 'Share price', 'Earnings', 'Revenue', 'Shares', 'Employees'])


In [None]:
# Demo: two side-by-side heatmaps of random 25x4 data, each with its own
# colorbar placed on the outer side of the figure.
# NOTE: this rebinds the global `df` to random data.
df =  pd.DataFrame(np.random.rand(25,4), columns=list("ABCD"))
df2 = pd.DataFrame(np.random.rand(25,4), columns=list("WXYZ"))

fig, (ax,ax2) = plt.subplots(ncols=2)
fig.subplots_adjust(wspace=0.01)
# cbar=False disables seaborn's own colorbar; one is attached manually right
# after each heatmap, built from the first collection of the axes.
sns.heatmap(df, cmap="rocket", ax=ax, cbar=False)
fig.colorbar(ax.collections[0], ax=ax,location="left", use_gridspec=False, pad=0.2)
sns.heatmap(df2, cmap="icefire", ax=ax2, cbar=False)
fig.colorbar(ax2.collections[0], ax=ax2,location="right", use_gridspec=False, pad=0.2)
# Move the y tick labels of the right-hand heatmap to its right side and keep
# the tick labels unrotated.
ax2.yaxis.tick_right()
ax2.tick_params(rotation=0)
plt.show()

In [None]:
# Annotated heatmap of the current `df`, saved as a transparent image whose
# file name is derived from the title.
title = "Distribuzione label per file"
# Lower-cased title with spaces replaced by underscores.
file_name = title.lower().replace(" ", "_")

fig, ax = plt.subplots(figsize=(20, 20))
ax.set_title(title)
ax.set_xlabel("Label")
ax.set_ylabel("File")

# The original referenced `my_cmap2`, which is never defined in this notebook
# and raised NameError; `my_cmap` is the only custom colormap it builds.
# NOTE(review): confirm `my_cmap` is the intended colormap.
heatmap = sns.heatmap(df, ax=ax, annot=True, fmt=".0f", cmap=my_cmap)
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0, fontsize=12)
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=0, fontsize=12)

# NOTE(review): the figure is written to the working directory, not to
# `plot_path` — confirm this is intended.
fig.savefig(file_name, bbox_inches='tight', transparent=True)