Imports

In [1]:
from pathlib import Path
import pandas
import yaml
from pprint import pprint
import great_expectations as gx

Constants

In [2]:
# Filesystem paths
PARENT_FOLDER = Path.cwd()
CONFIG_FOLDER = (PARENT_FOLDER / ".." / ".." / "config").resolve()
DATA_FOLDER = (PARENT_FOLDER / ".." / ".." / "data").resolve()
MODELS_FOLDER = (PARENT_FOLDER / ".." / ".." / "models").resolve()
TEMP_FOLDER = (PARENT_FOLDER / ".." / ".." / "temp").resolve()

Dataset

In [3]:
data = pandas.read_csv((DATA_FOLDER / "Android_Malware_cleaned.csv"), delimiter=",", dtype={
    "Flow ID": str,
    "Source IP": str,
    "Source Port": int,
    "Destination IP": str,
    "Destination Port": int,
    "Protocol": float,
    "Timestamp": str,
    "Flow Duration": float,
    "Total Fwd Packets": float,
    "Total Backward Packets": float,
    "Total Length of Fwd Packets": float,
    "Total Length of Bwd Packets": float,
    "Fwd Packet Length Max": float,
    "Fwd Packet Length Min": float,
    "Fwd Packet Length Mean": float,
    "Fwd Packet Length Std": float,
    "Bwd Packet Length Max": float,
    "Bwd Packet Length Min": float,
    "Bwd Packet Length Mean": float,
    "Bwd Packet Length Std": float,
    "Flow Bytes/s": float,
    "Flow Packets/s": float,
    "Flow IAT Mean": float,
    "Flow IAT Std": float,
    "Flow IAT Max": float,
    "Flow IAT Min": float,
    "Fwd IAT Total": float,
    "Fwd IAT Mean": float,
    "Fwd IAT Std": float,
    "Fwd IAT Max": float,
    "Fwd IAT Min": float,
    "Bwd IAT Total": float,
    "Bwd IAT Mean": float,
    "Bwd IAT Std": float,
    "Bwd IAT Max": float,
    "Bwd IAT Min": float,
    "Fwd PSH Flags": float,
    "Bwd PSH Flags": float,
    "Fwd URG Flags": float,
    "Bwd URG Flags": float,
    "Fwd Header Length": float,
    "Bwd Header Length": float,
    "Fwd Packets/s": float,
    "Bwd Packets/s": float,
    "Min Packet Length": float,
    "Max Packet Length": float,
    "Packet Length Mean": float,
    "Packet Length Std": float,
    "Packet Length Variance": float,
    "FIN Flag Count": float,
    "SYN Flag Count": float,
    "RST Flag Count": float,
    "PSH Flag Count": float,
    "ACK Flag Count": float,
    "URG Flag Count": float,
    "CWE Flag Count": float,
    "ECE Flag Count": float,
    "Down/Up Ratio": float,
    "Average Packet Size": float,
    "Avg Fwd Segment Size": float,
    "Avg Bwd Segment Size": float,
    "Fwd Header Length.1": float,
    "Fwd Avg Bytes/Bulk": float,
    "Fwd Avg Packets/Bulk": float,
    "Fwd Avg Bulk Rate": float,
    "Bwd Avg Bytes/Bulk": float,
    "Bwd Avg Packets/Bulk": float,
    "Bwd Avg Bulk Rate": float,
    "Subflow Fwd Packets": float,
    "Subflow Fwd Bytes": float,
    "Subflow Bwd Packets": float,
    "Subflow Bwd Bytes": float,
    "Init_Win_bytes_forward": float,
    "Init_Win_bytes_backward": float,
    "act_data_pkt_fwd": float,
    "min_seg_size_forward": float,
    "Active Mean": float,
    "Active Std": float,
    "Active Max": float,
    "Active Min": float,
    "Idle Mean": float,
    "Idle Std": float,
    "Idle Max": float,
    "Idle Min": float,
    "Label": str
})
data.columns = data.columns.str.strip()
print(f"{data.shape[0]} rows, {data.shape[1]} columns")
data.head(n=5)

352741 rows, 85 columns


Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,172.217.6.202-10.42.0.211-443-50004-6,10.42.0.211,50004,172.217.6.202,443,6.0,13/06/2017 11:52:39,37027.0,1.0,1.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware
1,172.217.6.202-10.42.0.211-443-35455-6,10.42.0.211,35455,172.217.6.202,443,6.0,13/06/2017 11:52:39,36653.0,1.0,1.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware
2,131.253.61.68-10.42.0.211-443-51775-6,10.42.0.211,51775,131.253.61.68,443,6.0,13/06/2017 11:52:42,534099.0,8.0,12.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware
3,131.253.61.68-10.42.0.211-443-51775-6,10.42.0.211,51775,131.253.61.68,443,6.0,13/06/2017 11:52:43,9309.0,3.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware
4,131.253.61.68-10.42.0.211-443-51776-6,10.42.0.211,51776,131.253.61.68,443,6.0,13/06/2017 11:52:42,19890496.0,8.0,6.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware


In [15]:
gx_context = gx.get_context(context_root_dir=str(CONFIG_FOLDER / "great_expectations"))
gx_context

{
  "anonymous_usage_statistics": {
    "explicit_id": true,
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "enabled": true,
    "explicit_url": false,
    "data_context_id": "466fc709-08f7-404c-b9bf-a35c3fefc724"
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_variables_file_path": "uncommitted/config_variables.yml",
  "config_version": 3.0,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "uncommitted/data_docs/local_site/"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {},
  "evaluation_parameter_store_name": "evaluation_parameter_store",
  "expectations_store_name": "expectations_store",
  "include_rendered_content": {
    "globally": false,
    "expectation_validatio

Reading Great Expectation yaml

In [18]:
datasource_config = {
    "name": "Android_Malware",
    "class_name": "Datasource",
    "execution_engine": {
        "class_name": "PandasExecutionEngine",
        "module_name": "great_expectations.execution_engine"
    },
    "data_connectors": {
        "csv_data_connector": {
            "class_name": "ConfiguredAssetFilesystemDataConnector",
            "module_name": "great_expectations.datasource.data_connector",
            "base_directory": "/app/data",
            "batch_spec_passthrough": {
                "reader_method": "read_csv",
                "reader_options": {}
            },
            "assets": {
                "android_malware_2023": {
                    "group_names": ["month"],
                    "pattern": "Android_Malware_cleaned_2023_(.*)\\.csv"
                }
            }
        }
    }
}

In [21]:
from ruamel import yaml

gx_context.test_yaml_config(yaml.dump(datasource_config))
gx_context.add_datasource(**datasource_config)

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	csv_data_connector : ConfiguredAssetFilesystemDataConnector

	Available data_asset_names (1 of 1):
		android_malware_2023 (0 of 0): []

	Unmatched data_references (3 of 4):['.ipynb_checkpoints', '.keep', 'Android_Malware.csv']



<great_expectations.datasource.new_datasource.Datasource at 0x7f6329b5cb80>

In [23]:
batch = gx.BatchRequest(
    datasource_name="Android_Malware",
    data_connector_name="csv_data_connector",
)

AttributeError: module 'great_expectations' has no attribute 'BatchRequest'

In [12]:
datasource = gx_context.datasources.add_pandas_filesystem(name="Android_Malware", base_directory=str(DATA_FOLDER))

AttributeError: 'dict' object has no attribute 'add_pandas_filesystem'

In [16]:
gx_context.datasources

{}

In [11]:
# gx_validator = gx_context.sources.pandas_default.read_csv((DATA_FOLDER / "Android_Malware_cleaned.csv"))

In [None]:
with (CONFIG_FOLDER / "Android_Malware_cleaned_expectations.yaml").open(mode="r") as file:
    try:
        great_expectation_config = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        print(exc)

pprint(great_expectation_config["datasources"], indent=0)