# Basic statistics

Let's explore how Desbordante can help you with collecting basic statistics (min, max, mean, etc.).

# Install necessary Python dependencies

In [None]:
!pip install desbordante==2.3.2
!pip install pandas

Collecting desbordante==2.3.2
  Downloading desbordante-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading desbordante-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: desbordante
Successfully installed desbordante-2.3.2


# Import desbordante and pandas libraries

In [None]:
import desbordante as db
import pandas as pd

# Download dataset

In [None]:
!wget -q https://raw.githubusercontent.com/Desbordante/desbordante-core/main/examples/datasets/Workshop.csv

# Start exploring

Take a look at the source data.

In [None]:
# Load the downloaded CSV into a pandas DataFrame. The bare `dataset`
# expression on the last line makes the notebook render it as the cell output.
dataset = pd.read_csv("Workshop.csv")
dataset

Unnamed: 0,id,worker_name,supervisor_surname,workshop,salary,job_post
0,404f50cb-caf0-4974-97f9-9463434537e1,Jennifer Moore,Galen Calla,Yogatacular,980,Client Solution Analyst
1,b5e38281-9c09-49bf-91f5-c55397df4d43,Edward Lee,Carrie Silvia,MonsterWorq,905,Front-End Loader Operator
2,972b299d-2f27-4d6d-81d2-8effbc543bf1,Brian Lee,Shena Desiree,Talkspiration,700,Farm Assistant
3,3241fb48-5a15-4638-bd68-d915834a3f89,Kenneth Turner,Paul Jeffry,Verbalthon,980,Client Solution Analyst
4,9cbb9026-f157-4a01-aace-a42b05ab2a28,Betty Campbell,Addyson Aaliyah,SpeakerAce,800,Physiotherapist
...,...,...,...,...,...,...
940,9cd700bc-b3d9-439d-afe9-945c2a20bc37,Richard Lopez,Galen Calla,Yogatacular,845,Senior Financial Planner
941,cc199ff4-453a-4ae5-9fbd-b45d72fa952a,Helen Rodriguez,Carrie Silvia,MonsterWorq,465,Electrician
942,de650347-880a-42a2-88c9-4329f26fb912,Karen White,Carrie Silvia,MonsterWorq,510,JavaScript Developer
943,ae604e24-e040-4d50-b685-5b4897ab9ae9,Charles Smith,Shena Desiree,Talkspiration,975,Store Manager


Let's compute some simple statistics for the dataset, both for the table as a whole and for each column.

In [None]:
# Create Desbordante's default statistics algorithm, hand it the pandas table
# and run it; the getters used below are called on this executed instance.
data_stats = db.statistics.algorithms.Default()
data_stats.load_data(table=dataset)
data_stats.execute()

# Table-wide metrics, evaluated right here: one [label, value] row per metric.
table_methods = [
    ["Columns with null", data_stats.get_columns_with_null()],
    [
        "Columns with all unique values",
        data_stats.get_columns_with_all_unique_values(),
    ],
    ["Number of columns", data_stats.get_number_of_columns()],
]

# Per-column metrics: label -> bound getter. Nothing is computed at this point;
# each getter is meant to be called with a column index.
column_methods = {
    "Avg": data_stats.get_average,
    "Sum of squares": data_stats.get_sum_of_squares,
    "Median": data_stats.get_median,
    "Min": data_stats.get_min,
    "Max": data_stats.get_max,
    "Distinct": data_stats.get_number_of_distinct,
    "Corrected std": data_stats.get_corrected_std,
    "Min chars in a row": data_stats.get_min_number_of_chars,
    "Max chars in a row": data_stats.get_max_number_of_chars,
    "Min words in a row": data_stats.get_min_number_of_words,
    "Max words in a row": data_stats.get_max_number_of_words,
    "Char vocabulary": data_stats.get_vocab,
    "Word vocabulary": data_stats.get_words,
}

Visualize the table statistics using a pandas DataFrame.

In [None]:
# Turn the [label, value] rows of table-level metrics into a two-column frame.
# The bare `df` on the last line renders it as the cell output.
columns_names = ["Metrics", "Value"]
df = pd.DataFrame(table_methods, columns=columns_names)
df

Unnamed: 0,Metrics,Value
0,Columns with null,[]
1,Columns with all unique values,"[0, 1]"
2,Number of columns,6


Then, take a look at some statistics over each column.

In [None]:
# How many columns the analysed table has
num_cols = data_stats.get_number_of_columns()

# Build one {label: value} dict per column. Every getter is called with the
# column index, and a statistic is kept only when the getter returned
# something other than None.
column_stats = []
for column_index in range(num_cols):
    computed = (
        (label, getter(column_index))
        for label, getter in column_methods.items()
    )
    column_stats.append(
        {label: value for label, value in computed if value is not None}
    )

# Keep long cell values (ids, vocabularies) from stretching the rendered table
pd.set_option('display.max_colwidth', 20)

# Rows become statistics and columns become column indices after transposing;
# statistics missing for a column are shown as empty strings.
df_column_stats = pd.DataFrame(column_stats).fillna('').T
df_column_stats

Unnamed: 0,0,1,2,3,4,5
Min,0008f14d-e2a7-45...,Anthony Campbell,Addyson Aaliyah,MonsterWorq,465,Client Solution ...
Max,fff1cd7a-04f9-48...,William Taylor,Shena Desiree,Yogatacular,2036,Workshop Technician
Distinct,945,945,6,5,28,15
Min chars in a row,36.0,8.0,11.0,10.0,,11.0
Max chars in a row,36.0,21.0,15.0,13.0,,25.0
Min words in a row,1.0,2.0,2.0,1.0,,1.0
Max words in a row,1.0,2.0,2.0,1.0,,3.0
Char vocabulary,-0123456789abcdef,ABCDEGHJKLMNPRS...,ACDGJPSadefhiln...,AMSTVWYabceghikl...,,-ACDEFJLMOPRSTW...
Word vocabulary,{e776c703-b419-4...,"{Paul, Elizabeth...","{Paul, Silvia, C...","{MonsterWorq, Sp...",{},"{Client, JavaScr..."
Avg,,,,,932.258201,


It is also possible to get all the statistics at once as print-friendly plain text.

In [None]:
# Print the plain-text report returned by get_all_statistics_as_string().
print(data_stats.get_all_statistics_as_string())

Column num = 0
max_num_words = 1
min_num_words = 1
num_chars = 34020
num_uppercase_chars = 0
type = String
isCategorical = 0
num_lowercase_chars = 11108
count = 945
quantile50 = 81aabb56-808c-48a1-b2a3-5d3f2e1a752f
num_entirely_lowercase = 945
num_entirely_uppercase = 0
max_num_chars = 1
num_digit_chars = 19132
distinct = 945
avg_chars = 36.000000
min = 0008f14d-e2a7-4582-bf5e-89ce32b55606
num_words = 945
quantile25 = 4307ef5b-2e00-4316-b04c-debff4edc5c4
max = fff1cd7a-04f9-486c-97de-d5d2c6ddb3cb
quantile75 = c8539dda-ec0e-4c67-a2f4-2d201bb82171
min_num_chars = 1
num_non_letter_chars = 22912
vocab = -0123456789abcdef

Column num = 1
max_num_words = 2
min_num_words = 2
num_chars = 12261
num_uppercase_chars = 1890
type = String
isCategorical = 0
num_lowercase_chars = 9426
count = 945
quantile50 = Kenneth King
num_entirely_lowercase = 0
num_entirely_uppercase = 0
max_num_chars = 2
num_digit_chars = 0
distinct = 945
avg_chars = 12.974603
min = Anthony Campbell
num_words = 1890
quantile25 =

# Conclusion

As you have seen, gathering insights about your data is a straightforward process with Desbordante.