# Analisi strutturale dei datasets

In [1]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors
from itertools import combinations
import shutil
import seaborn as sns

In [2]:
# Colour constants (hex RGB strings) — presumably consumed by plotting cells
# that are not part of this section.
lilla = color_of_bar1 = "#c8a2c8"  # the first bar colour is the same lilac shade
color_of_bar2 = "#6495ED"
color_of_edge = "#000000"

In [3]:
# Hard-coded 2x2 matrix whose four values become the anchor points of a custom
# colormap.  NOTE(review): where these numbers come from is not shown here.
matrix = np.array([[149030, 34], [7442, 12]])

# Linear normalisation: the smallest matrix value maps to 0, the largest to 1.
norm = matplotlib.colors.Normalize(matrix.min(), matrix.max())

# The four matrix values in ascending order.
boundaries = sorted(matrix.flatten().tolist())

# Pair each normalised value with a progressively darker shade, lightest first.
shades = ["#dcc4dc", "#c8a2c8", "#93779c", "#6c4675"]
colors = [[norm(boundary), shade] for boundary, shade in zip(boundaries, shades)]

my_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

In [6]:
# Location of the JSON file read by the following cells.  The path is assembled
# with os.path.join so it is not tied to Windows separators; on Windows the
# resulting string is the same as the previous hard-coded value.
dataset_path = os.path.join("..", "..", "Dataset", "DatasetJSON", "datasets.json")

# Folder named "plot" next to the notebook.  The empty last component keeps the
# trailing separator, so code that builds file names with `plot_path + name`
# keeps working.
plot_path = os.path.join(".", "plot", "")

# Start every run from an empty folder.
# WARNING: this deletes everything already stored under plot_path.
if os.path.exists(plot_path):
    shutil.rmtree(plot_path)
os.mkdir(plot_path)

### Import dataset JSON ed estrazione numero righe, colonne e celle per ogni dataset

In [7]:
# Parse the whole file into Python objects.  The `with` block closes the file
# on exit, so no explicit close() call is needed.
with open(dataset_path, 'r', encoding='utf-8') as f:
    json_object = json.load(f)

In [8]:
def extract_data_from_df(df):
    """Return the number of rows, columns and cells of a DataFrame.

    Parameters
    ----------
    df : object exposing a ``shape`` with at least two entries
        (a pandas DataFrame in this notebook).

    Returns
    -------
    dict
        ``{"rows": r, "columns": c, "cells": r * c}``.
    """
    shape = df.shape
    n_rows = shape[0]
    n_cols = shape[1]
    return {"rows": n_rows, "columns": n_cols, "cells": n_rows * n_cols}

In [9]:
# Collect one summary row per dataset:
# [group_name, dataset_name, rows, columns, cells].
info_datasets = []
for entry in json_object:
    # entry["dataset"] is handed to pandas as-is and read with orient="index".
    df = pd.read_json(entry["dataset"], orient="index", encoding="utf-8")
    info = extract_data_from_df(df)
    summary_row = [
        entry["group_name"],
        entry["dataset_name"],
        info["rows"],
        info["columns"],
        info["cells"],
    ]
    info_datasets.append(summary_row)

In [10]:
def get_info_order_by(list_of_info, c, reverse=False):
    """Build a DataFrame of dataset summaries sorted on one field.

    Parameters
    ----------
    list_of_info : iterable of rows
        Each row holds, in order: group_name, dataset_name, rows, columns, cells.
    c : int
        Position inside each row of the value to sort on.
    reverse : bool, default False
        False sorts in ascending order, True in descending order.

    Returns
    -------
    pandas.DataFrame
        The sorted rows under the columns
        group_name, dataset_name, rows, columns, cells.
        The input sequence itself is left untouched.
    """
    # Work on a copy so the caller's list keeps its original order.
    ordered_rows = list(list_of_info)
    ordered_rows.sort(key=lambda row: row[c], reverse=reverse)
    header = ["group_name", "dataset_name", "rows", "columns", "cells"]
    return pd.DataFrame(ordered_rows, columns=header)

In [11]:
# Sort the summaries on field 1 (dataset_name) in descending order, then show
# the resulting table.
df_info = get_info_order_by(info_datasets, c=1, reverse=True)
display(df_info)

Unnamed: 0,group_name,dataset_name,rows,columns,cells
0,02-GioPonSpiz,wikipedia,2664,5,13320
1,05-MalPatSaj,wikipedia,3111,7,21777
2,06-MarScoToc,wikipedia,619,5,3095
3,10-DeBiGa,wikipedia,1002,13,13026
4,00-avengers,valuetoday,10682,18,192276
5,02-GioPonSpiz,valuetoday,10680,6,64080
6,03-gren,valuetoday,10682,10,106820
7,04-iGMM,valuetoday,10000,12,120000
8,07-silvestri,valuetoday,1000,9,9000
9,08-slytherin,valuetoday,10000,8,80000


In [12]:
# Total number of rows across all datasets (vectorised pandas sum instead of
# the Python builtin iterating over the Series).
df_info["rows"].sum()

185598

### Plot numero di valori distinti e nulli per colonna, per ogni dataset

In [28]:
# For every dataset, count the distinct values and the null values of each
# column and store them next to the dataset's identifying fields.
unique_null_dict = []
for entry in json_object:

    df = pd.read_json(entry["dataset"], orient="index", encoding="utf-8")
    info = extract_data_from_df(df)

    tmp_unique = {}  # column name -> number of distinct values
    tmp_null = {}    # column name -> number of null values
    for key in df:
        try:
            unique_value = len(pd.unique(df[key]))
        except TypeError:
            # pd.unique is hash based, so a column holding unhashable objects
            # (e.g. lists) raises TypeError.  Fall back to the column length,
            # i.e. treat every value as distinct.  Only TypeError is caught so
            # that unrelated failures are no longer silently hidden.
            unique_value = len(df[key])
        null_value = df[key].isnull().sum()
        tmp_unique[key] = unique_value
        tmp_null[key] = null_value

    unique_null_dict.append({
        "group_name": entry["group_name"],
        "dataset_name": entry["dataset_name"],
        "rows": info["rows"],
        "columns": info["columns"],
        # NOTE: this is a live view of tmp_unique's keys, not a snapshot; any
        # key later added to the "unique_value" dict shows up here as well.
        "column_name": tmp_unique.keys(),
        "unique_value": tmp_unique,
        "null_value": tmp_null
    })

In [30]:
# Show, for each dataset, a two-row table (distinct counts / null counts) with
# the dataset's row count in the first column.
for item in unique_null_dict:
    # Add the row count to both dictionaries under the "ROWS" key.
    # This updates the dictionaries stored in unique_null_dict in place.
    for counts_key in ("unique_value", "null_value"):
        item[counts_key]["ROWS"] = item["rows"]

    df = pd.DataFrame(
        [item["unique_value"], item["null_value"]],
        index=["UNIQUE_VALUE", "NULL_VALUE"],
    )

    # Move the last column to the front.
    column_order = df.columns.tolist()
    df = df[column_order[-1:] + column_order[:-1]]

    print("-".join([item["group_name"], item["dataset_name"]]))
    display(df)
    print("\n\n")

00-avengers-companiesmarketcap


Unnamed: 0,ROWS,id,name,rank,market_cap,country,share_price,change_1_day,change_1_year,categories
UNIQUE_VALUE,5897,5897,5896,5886,2131,40,3561,1257,4491,5897
NULL_VALUE,5897,0,0,0,0,156,0,0,249,878





00-avengers-disfold


Unnamed: 0,ROWS,id,name,official_name,headquarters_country,headquarters_continent,founded,employees,ceo,market_cap,categories,gbp
UNIQUE_VALUE,1000,1000,1000,127,392,89,399,772,794,977,1000,500
NULL_VALUE,1000,0,0,874,93,93,436,124,200,0,0,477





00-avengers-hithorizons


Unnamed: 0,ROWS,id,name,address,nation,hhid,industry,sic_code,type,est_of_ownership
UNIQUE_VALUE,667,667,667,575,667,667,9,166,3,130
NULL_VALUE,667,0,0,0,0,0,0,0,0,0





00-avengers-valuetoday


Unnamed: 0,ROWS,id,name,world_rank,annual_revenue_in_usd,annual_net_income_in_usd,annual_results_for_year_ending,total_assets_in_usd,total_liabilities_in_usd,total_equity_in_usd,headquarters_region_city,headquarters_country,headquarters_sub_region,headquarters_continent,company_business,number_of_employees,ceo,founders,company_website
UNIQUE_VALUE,10682,10682,10671,9978,4859,2368,145,6017,5115,4668,165,10682,18,6,10682,3765,10682,10682,10252
NULL_VALUE,10682,0,0,0,2386,2380,2332,2722,2723,2724,7302,18,48,23,169,5231,9433,10288,283





01-DDD-cbinsights


Unnamed: 0.1,ROWS,Unnamed: 0,name,valuation,dateJoined,country,city,industry,investors,founded,stage,totalRaised
UNIQUE_VALUE,1185,1185,1182,238,720,49,282,20,1168,37,105,1041
NULL_VALUE,1185,0,0,0,0,0,16,0,0,0,0,0





01-DDD-companiesmarketcap


Unnamed: 0.1,ROWS,Unnamed: 0,name,rank,marketcap,country,share_price,change1d,change1y,categories
UNIQUE_VALUE,7217,7217,7216,7190,2758,39,4285,1053,5247,1085
NULL_VALUE,7217,0,0,0,17,0,5,0,0,0





01-DDD-ft


Unnamed: 0.1,ROWS,Unnamed: 0,name,country,sector,absolute_growth_rate_pct,compound_annual_growth_rate_cagr_pct,revenue_2020_euro,revenue_2017_euro,number_of_employees_2020,number_of_employees_2017,founding_year
UNIQUE_VALUE,1000,1000,1000,30,39,994,931,999,999,226,123,43
NULL_VALUE,1000,0,0,0,0,0,0,0,0,0,0,0





01-DDD-teamblind


Unnamed: 0.1,ROWS,Unnamed: 0,name,website,locations,size,industry,founded
UNIQUE_VALUE,946,946,946,946,401,8,72,143
NULL_VALUE,946,0,0,0,0,0,0,0





02-GioPonSpiz-companiesmarketcap


Unnamed: 0,ROWS,name,codice,pricecap,price,country
UNIQUE_VALUE,7200,7197,7200,2761,4235,70
NULL_VALUE,7200,0,0,0,0,0





02-GioPonSpiz-disfold


Unnamed: 0,ROWS,name,marketCap,stock,sector,industry,country
UNIQUE_VALUE,1000,1000,896,986,12,127,33
NULL_VALUE,1000,0,0,0,0,0,0





02-GioPonSpiz-valuetoday


Unnamed: 0,ROWS,name,world_rank,marketValue,marketCap,CEO,country
UNIQUE_VALUE,10680,10669,9976,6300,6475,1079,87
NULL_VALUE,10680,0,0,14,109,9586,0





02-GioPonSpiz-wikipedia


Unnamed: 0,ROWS,Name,Industry,Sector,Headquarters,Founded
UNIQUE_VALUE,2664,2663,31,222,873,257
NULL_VALUE,2664,0,0,0,0,0





03-gren-companiesmarketcap


Unnamed: 0,ROWS,name,market_capitalization_USD,price,country
UNIQUE_VALUE,7163,7160,2748,406,70
NULL_VALUE,7163,0,0,0,0





03-gren-disfold


Unnamed: 0,ROWS,name,market_capitalization_USD,stock,country,sector,industry
UNIQUE_VALUE,1000,1000,924,986,40,11,127
NULL_VALUE,1000,0,0,0,0,0,0





03-gren-ft


Unnamed: 0,ROWS,name,country,sector,revenue_2020_EU,revenue_2017_EU,employees_number_2020,employees_number_2017,founding_year
UNIQUE_VALUE,1000,1000,30,39,999,999,226,124,43
NULL_VALUE,1000,0,0,0,0,0,0,0,0





03-gren-valuetoday


Unnamed: 0,ROWS,name,annual_revenue_USD,annual_net_income_USD,market_capitalization_2022,employees_number,CEO,headquarters_country,wikipedia_page_url,twitter_page_url,facebook_page_url
UNIQUE_VALUE,10682,10671,4859,2368,5922,3765,1242,94,3182,980,685
NULL_VALUE,10682,0,0,0,0,0,0,0,0,0,0





04-iGMM-cbinsights


Unnamed: 0,ROWS,company,valuation,datejoined,country,city,industry,selectinvestors
UNIQUE_VALUE,1202,1199,237,721,50,283,20,1185
NULL_VALUE,1202,0,0,0,0,18,0,0





04-iGMM-companiesmarketcap


Unnamed: 0,ROWS,name,code,rank,marketcap,country,sharePrice,change(1day),change(1year),categories
UNIQUE_VALUE,7093,7091,7093,7067,2707,70,4140,1324,5240,7093
NULL_VALUE,7093,0,0,0,0,0,0,0,182,0





04-iGMM-disfold


Unnamed: 0,ROWS,name,code,rank,ceo,country,sector,industry,headquarters,employees,founded,marketcap,revenue,net income,link
UNIQUE_VALUE,1000,1000,986,1000,827,40,11,127,433,850,397,955,565,524,1000
NULL_VALUE,1000,0,0,0,165,0,0,0,5,42,421,0,436,475,0





04-iGMM-valuetoday


Unnamed: 0,ROWS,name,ceo,market value (Jan-07-2022),world rank (Jan-07-2022),market value (Jan 1st 2020),world rank (Jan-2020),headquarters country,number of employees,company business,annual revenue in USD,annual net income in USD,company website
UNIQUE_VALUE,10000,9989,1242,5806,9336,5548,8341,80,3711,10000,4810,2365,9619
NULL_VALUE,10000,0,8751,641,0,1627,1245,14,4665,0,2087,2081,249





05-MalPatSaj-companiesmarketcap


Unnamed: 0.1,ROWS,Unnamed: 0,Name,Symbol,MarketCap,Price,Country
UNIQUE_VALUE,7000,7000,6997,7000,2647,4158,70
NULL_VALUE,7000,0,0,1,0,0,1





05-MalPatSaj-disfold


Unnamed: 0.1,ROWS,Unnamed: 0,Name,MarketCap,Stock,Country,Sector,Industry
UNIQUE_VALUE,1000,1000,1000,924,986,40,11,127
NULL_VALUE,1000,0,0,0,1,0,0,0





05-MalPatSaj-forbes


Unnamed: 0.1,ROWS,Unnamed: 0,Name,Country,Sales,Profit,Assets,Market Value
UNIQUE_VALUE,2000,2000,2000,57,1599,1386,1796,1673
NULL_VALUE,2000,0,0,0,0,0,0,0





05-MalPatSaj-wikipedia


Unnamed: 0.1,ROWS,Unnamed: 0,Name,Industry,Sector,Headquarters,Founded,Notes
UNIQUE_VALUE,3111,3111,3107,41,261,1064,301,1896
NULL_VALUE,3111,0,0,0,1,0,1,12





06-MarScoToc-ambitiobox


Unnamed: 0,ROWS,Name,Industry,Headquarter,Ownership,Foundation Year
UNIQUE_VALUE,9899,9101,97,1856,19,233
NULL_VALUE,9899,0,206,428,1673,549





06-MarScoToc-companiesmarketcap


Unnamed: 0,ROWS,Name,Master Cap,Country,Rank,Share Price
UNIQUE_VALUE,6290,6288,2139,40,6289,4176
NULL_VALUE,6290,0,0,174,0,0





06-MarScoToc-disfold


Unnamed: 0,ROWS,Name,MasterCap,Country,Industry,Sector
UNIQUE_VALUE,960,960,772,1,123,11
NULL_VALUE,960,0,0,0,0,0





06-MarScoToc-wikipedia


Unnamed: 0,ROWS,Name,Industry,headQuarters,Founded,Area Served
UNIQUE_VALUE,619,614,290,258,540,69
NULL_VALUE,619,0,0,28,0,236





07-silvestri-disfold


Unnamed: 0,ROWS,name,country,market_value_apr_2022,sector,stock
UNIQUE_VALUE,1000,1000,33,896,12,985
NULL_VALUE,1000,0,15,38,15,1





07-silvestri-forbes


Unnamed: 0,ROWS,country,market_value_apr_2022,name,revenue_2022
UNIQUE_VALUE,1000,46,951,1000,935
NULL_VALUE,1000,0,0,0,0





07-silvestri-ft


Unnamed: 0,ROWS,country,employees_2017,employees_2020,founding_year,name,revenue_2017,revenue_2020,sector
UNIQUE_VALUE,1000,30,124,226,43,1000,999,999,39
NULL_VALUE,1000,0,3,4,0,0,0,0,0





07-silvestri-valuetoday


Unnamed: 0,ROWS,ceo,country,market_value_jan_2020,market_value_jan_2021,market_value_jan_2022,name,employees_2022,revenue_2022,sector
UNIQUE_VALUE,1000,304,28,887,916,956,999,647,912,43
NULL_VALUE,1000,697,0,89,57,0,0,281,17,1





08-slytherin-disfold


Unnamed: 0,ROWS,link,name,headquarters,employees,ceo,market_cap
UNIQUE_VALUE,16097,16097,16097,3130,5249,9309,13997
NULL_VALUE,16097,0,0,0,0,0,0





08-slytherin-forbes


Unnamed: 0,ROWS,link,name,industry,founded,country,ceo,employees,revenue
UNIQUE_VALUE,2000,2000,2000,48,216,57,1947,1617,645
NULL_VALUE,2000,0,0,0,0,0,0,0,0





08-slytherin-ft


Unnamed: 0,ROWS,link,name,country,industry,revenue,employees,founded
UNIQUE_VALUE,5000,4659,4217,48,44,3701,591,70
NULL_VALUE,5000,0,0,0,0,0,0,0





08-slytherin-valuetoday


Unnamed: 0,ROWS,link,rank,name,ceo,founded,revenue,country,industry
UNIQUE_VALUE,10000,10000,9336,9989,1356,230,4843,80,10000
NULL_VALUE,10000,0,0,0,0,0,0,0,0





09-wissel-ariregister


Unnamed: 0,ROWS,URL,ID,Name,Code,Legal form,Status,Registration Date,Capital,Address,Deletion Date
UNIQUE_VALUE,1469,1469,1469,1468,1466,10,3,1268,90,822,409
NULL_VALUE,1469,0,0,0,3,0,32,3,573,501,1011





09-wissel-companiesmarketcap


Unnamed: 0.1,ROWS,Unnamed: 0,URL,ID,Name,Company code,Marketcap,Share price,Earnings,Revenue,Shares,Employees
UNIQUE_VALUE,6560,5160,5313,4210,5304,5304,2253,3724,2708,2169,2036,2971
NULL_VALUE,6560,0,0,1409,9,10,26,13,54,292,4298,403





09-wissel-govuk


Unnamed: 0,ROWS,URL,ID,Name,Company ID,Company Status,Company Type,Registration Date,Incorporation Date,Dissolution Date,Office Address
UNIQUE_VALUE,1321,1321,1321,1320,1321,7,9,13,1134,47,1266
NULL_VALUE,1321,0,0,0,0,3,1,1306,15,1226,4





09-wissel-infoclipper


Unnamed: 0.1,ROWS,Unnamed: 0,INDEX,URL,Name,Trade Name,Address Name,Postalcode,City,State,Country,Location type
UNIQUE_VALUE,4656,4656,4655,3156,3153,425,2785,2927,1037,47,4,3
NULL_VALUE,4656,0,0,0,0,3979,0,0,0,0,0,256





10-DeBiGa-disfold


Unnamed: 0,ROWS,name,market_cap,stock,country,sector,industry,headquarters,founded,employees,ceo
UNIQUE_VALUE,1000,1000,924,986,40,19,138,433,407,850,827
NULL_VALUE,1000,0,0,0,0,0,0,0,0,0,0





10-DeBiGa-globaldata


Unnamed: 0,ROWS,name,headquarters,number_of_employees,address,industry,website,market_cap,telephone,revenue
UNIQUE_VALUE,5336,5335,82,4149,5308,30,5327,1347,5184,654
NULL_VALUE,5336,0,0,103,0,0,0,422,94,0





10-DeBiGa-govuk


Unnamed: 0,ROWS,name,company_number,registered_office_address,company_status,company_type,company_creation_date,nature_of_business
UNIQUE_VALUE,7000,6309,6321,5801,4,5,3423,458
NULL_VALUE,7000,0,0,0,0,0,0,0





10-DeBiGa-wikipedia


Unnamed: 0,ROWS,name,type,industry,founded,founders,headquarters,key_people,services,revenue,operating_income,total_assets,number_of_employees,website
UNIQUE_VALUE,1002,988,1002,1002,1002,1002,1002,1002,1002,1002,1002,1002,1002,1002
NULL_VALUE,1002,0,0,0,0,0,0,0,0,0,0,0,0,0





