# DAO Community Git Hosting Platform Survey Report Data Generator using Python-Polars in Google Environment
---
## Base Data Generator using Polars

![](https://img.shields.io/badge/Version%201.0.0-333333?style=for-the-badge)![](https://img.shields.io/badge/Made%20with-808080?style=for-the-badge)[![](https://img.shields.io/badge/Google%20Colaboratory-4d4d4d?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/)![](https://img.shields.io/badge/And-808080?style=for-the-badge)[![](https://img.shields.io/badge/Python%203.10.12-306998?style=for-the-badge&logo=Python&logoColor=FFD43B)](https://docs.python.org/3.10/)[![](https://img.shields.io/badge/Polars%200.17.3-FFD43B?style=for-the-badge&logo=Polars&logoColor=306998)](https://www.pola.rs/)

![](https://img.shields.io/badge/Repo-808080?style=for-the-badge)[![](https://img.shields.io/badge/GitHub-6E5494?style=for-the-badge&logo=GitHub)](https://github.com/amlc_ai_dev/amlc_ai)

| Library | Purpose | Version | Links |
| :--- | :--- | ---: | :--- |
| **Polars** | DataFrame | 0.17.3 | [GitHub](https://github.com/pola-rs/polars) [Site](https://www.pola.rs/) [User Guide](https://pola-rs.github.io/polars-book/user-guide/) [Polars API Reference](https://pola-rs.github.io/polars/py-polars/html/reference/index.html) |
| **Faker** | Masking/Dummy Data | 19.4.0 | [GitHub](https://github.com/joke2k/faker) [Docs](https://faker.readthedocs.io/en/master/) |

In [None]:
from __future__ import annotations

# Mount Drive

In [None]:
from pathlib import Path

from google.colab import drive

## Google Drive is mounted at a fixed location; `drive.mount` is given the
## resolved path as a plain string.
mount_point: Path = Path("/gdrive")
resolved_mount_point: str = str(mount_point.resolve())

## force_remount=True re-mounts even when the Drive is already mounted.
drive.mount(mountpoint=resolved_mount_point, force_remount=True)

Mounted at /gdrive


In [None]:
## Project folder on the mounted Drive.
base_path: Path = mount_point.joinpath(
    "MyDrive",
    "Survey",
    "DAO Community Git Hosting Platform Survey - Google Environment",
)

## Folder that receives the generated data files.
output_data_path: Path = base_path.joinpath("Data")

## parents=False: a missing parent folder raises FileNotFoundError instead of
## being created; exist_ok=True: an already existing folder is not an error.
for directory in (base_path, output_data_path):
    directory.mkdir(parents=False, exist_ok=True)

In [None]:
## NOTE: import-ipynb cannot work with notebooks in Google Drive, thus the workaround below.
## Workaround: execute the "Type Objects Polars" notebook inside this session
## with the IPython `%run` magic, so the names listed in the string below are
## defined here.
type_objects_module = base_path / "Generator" / "Type Objects Polars.ipynb"

if type_objects_module.exists():
    ## Rebind the Path to a plain string: the magic below receives the value
    ## through `$type_objects_module` expansion.
    type_objects_module: str = f"{type_objects_module}"
    %run -n "$type_objects_module"
    """Creates the ff:
        Eyears_of_experience,
        Egit_hosting_platform,
        Ecareer_level,
        Edao_pillar,
        Epast_next,
        Epast_next_all,
        TDcolumns,
        df_columns_dtypes_dict
    """
else:
    ## Only reports the problem and carries on; cells that use the names listed
    ## above will presumably fail unless those names are defined some other way.
    print(f"Module '{type_objects_module}' does not exist.")

In [None]:
%%capture --no-stderr
## Install Faker only when it is missing: the stdout of `pip show faker` is
## discarded, and a non-zero exit status from it triggers `pip install faker`.
!pip show faker 1>/dev/null; \
[ $? != 0 ] && { pip install faker; };

In [None]:
import gspread
from google.auth import default
from google.colab import auth

## Authenticate the Colab user before asking for the default credentials.
auth.authenticate_user()

creds: auth.compute_engine.credentials.Credentials
_: str
creds, _ = default()

## gspread client authorized with the credentials obtained above.
gc: gspread.client.Client = gspread.authorize(creds)

## First sheet of the survey responses spreadsheet.
spreadsheet = gc.open("Git Hosting Platform Survey (Responses)")
worksheet: gspread.models.Worksheet = spreadsheet.sheet1

## get_all_values gives a list of rows.
rows: list = worksheet.get_all_values()

---
---

# Hierarchy of Categories:

**DAO Pillar** > **Years of Experience** > **Career Level**

---
---

In [None]:
import polars as pl

## Convert to a Polars DataFrame and render.
from faker import Faker

faker: Faker = Faker()

## Load the sheet rows with the notebook's schema, drop the identifying
## columns, and skip the first row (presumably the sheet's header row).
raw_df: pl.dataframe.frame.DataFrame = (
    pl.from_records(
        data=rows,
        schema=df_columns_dtypes_dict,
        orient="row",
    )
    .drop("timestamp", "email_address")
    .slice(
        1,
    )
)

## Empty strings in every Utf8 column become null; column names are kept.
blank_to_null: pl.expr.expr.Expr = (
    pl.when(pl.col(pl.Utf8).str.lengths() == 0)
    .then(None)
    .otherwise(pl.col(pl.Utf8))
    .keep_name()
)

## One unique fake first name per remaining row (all rows minus the skipped one).
alias_column: pl.series.series.Series = pl.Series(
    "alias", [faker.unique.first_name() for _ in range(len(rows) - 1)]
)

df: pl.dataframe.frame.DataFrame = raw_df.with_columns([blank_to_null]).with_columns(
    alias_column
)

df.head()

used_git_hosting_platform,current_git_hosting_platform,years_of_experience,past_next_github,past_next_gitlab,career_level,dao_pillar,alias
str,str,str,str,str,str,str,str
"""GitHub, GitLab…","""GitHub, GitLab…","""4 to 6 years""","""Worked with in…","""Worked with in…","""2""","""Data Admin""","""Jeremy"""
"""GitHub, GitLab…","""GitHub, GitLab…","""2 to 4 years""","""Worked with in…","""Worked with in…","""2""","""Data Engineeri…","""Paul"""
,,,"""Want to work w…","""Want to work w…","""2""","""Data Science &…","""Victoria"""
"""GitLab""",,"""<2 years""","""Want to work w…","""Worked with in…","""2""","""Data Engineeri…","""Benjamin"""
"""GitHub""","""GitHub""","""2 to 4 years""","""Worked with in…","""Want to work w…","""1""","""Data Engineeri…","""David"""


This dataframe is used in the Bar Chart Diagrams and Survey Diagram.

## Data Validation
Validate source data and verify notebook compatibility.

In [None]:
from typing import Any, List


def source_data_validation(column_name: str, expected_values_list: List[Any]) -> bool:
    """Check that every value of a column is one of the expected values.

    Reads the module-level dataframe ``df``.

    Args:
        column_name: name of the column in ``df`` to check
        expected_values_list: values the column is allowed to contain

    Returns:
        Result of ``all()`` over the per-row membership test

    Raises:
        None
    """

    ## Per-row membership mask, reduced to a single outcome.
    membership_mask = df.select(
        [pl.col(column_name).is_in(expected_values_list)]
    ).to_series()

    return membership_mask.all()


## Expected values per validated column. The first three columns additionally
## accept Null (optional questions); career_level and dao_pillar are required.
expected_values_by_column: dict[str, list] = {
    "years_of_experience": [member.value for member in Eyears_of_experience]
    + [pl.Null],
    "past_next_github": [member.value for member in Epast_next_all] + [pl.Null],
    "past_next_gitlab": [member.value for member in Epast_next_all] + [pl.Null],
    "career_level": [member.value for member in Ecareer_level],
    "dao_pillar": [member.value for member in Edao_pillar],
}

## Maps column name -> validation outcome (a dict, despite its name).
tmp_list: dict[str, bool] = {
    column_name: source_data_validation(
        column_name=column_name,
        expected_values_list=expected_values,
    )
    for column_name, expected_values in expected_values_by_column.items()
}

compatibility_check_message: str = "Notebook and Source compatibility check: "

## Test the outcomes, not the dict itself: iterating a dict yields its keys, and
## the non-empty column names are always truthy, so `all(tmp_list)` could never
## report a failure.
if all(tmp_list.values()):
    print(compatibility_check_message + "Passed")
else:
    print(compatibility_check_message + "Failed")
    print(
        "Do not proceed.\n"
        + "Review source data for changes and align notebook code.\n"
        + "Finally, submit a PR (GitHub) or MR (GitLab) for notebook versioning."
    )
    print(
        "Failed columns:",
        [column_name for column_name, passed in tmp_list.items() if not passed],
    )

Notebook and Source compatibility check: Passed


`NOTE`:
<hr>

Do not commit data with personally identifiable information (PII). <br>
Never work directly with the Google Sheets data. <br>
Always fake/dummify/mask the data prior to any manipulation.

In [None]:
## Write the base dataframe to the Data folder as Parquet, using PyArrow.
base_data_file: Path = output_data_path / "base_data.parquet"
df.write_parquet(file=base_data_file, use_pyarrow=True)

---
---

# Other Answers

In [None]:
from typing import Optional, Set

# Count other tools for future use.


def get_other_answers(
    df: pl.dataframe.frame.DataFrame,
    column_name: str,
    reference_list: List[str],
    *args,
    **kwargs,
) -> Optional[pl.dataframe.frame.DataFrame]:
    """Print the answers of a column that are not in the reference list.

    Every cell is treated as a comma-separated list of answers. Each answer is
    stripped of surrounding whitespace, compared against ``reference_list`` as
    written (case-sensitive), and reported in lower case. Null cells and empty
    answers are skipped.

    Args:
        df: dataframe holding the survey answers
        column_name: column name string
        reference_list: list of items to ignore
        *args: unused; accepted for backward compatibility
        **kwargs: unused; accepted for backward compatibility

    Returns:
        None; the sorted, de-duplicated answers are only printed

    Raises:
        None
    """

    known_answers = set(reference_list)
    other_answers = set()

    for row in df.select(column_name).rows():
        cell = row[0]

        ## A null cell means the question was not answered at all.
        if cell is None:
            continue

        ## Split on the bare comma and strip each piece, so "a,b" and
        ## "a, b " both yield the answers "a" and "b".
        for piece in str(cell).split(","):
            answer = piece.strip()
            if answer and answer not in known_answers:
                other_answers.add(answer.lower())

    other_answer_str: str = ", ".join(sorted(other_answers))
    print(f"Other Answer(s) in {column_name}: {other_answer_str}")


## --- Debug ---

In [None]:
## Answers offered by the form, plus "None", are not reported as "other".
known_platforms = [_.value for _ in Egit_hosting_platform] + ["None"]

for platform_column in ("used_git_hosting_platform", "current_git_hosting_platform"):
    get_other_answers(
        df=df,
        column_name=platform_column,
        reference_list=known_platforms,
    )

Other Answer(s) in used_git_hosting_platform: aws codecommit, bitbucker, bitbucket, codeberg, git only 
Other Answer(s) in current_git_hosting_platform: aws codecommit, bitbucket, codeberg


EOF