In [67]:
import matplotlib
import seaborn as sns
from functools import partial

import pandas as pd
import matplotlib.pyplot as plt
import shutil
import csv
import os


# Use the SimHei font for sans-serif text so the Chinese plot titles used
# later in this notebook can be rendered.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
# Turn off matplotlib's Unicode minus sign on axes.
# NOTE(review): presumably because SimHei lacks that glyph - confirm.
matplotlib.rcParams['axes.unicode_minus'] = False

# Locations of the downloaded CSV datasets, of the folders that empty and
# anomalous datasets are moved into, and of the catalog JSON file.
dataset_dir = r'D:\Data\workspace\python\projects\CNDataAuditOutput\sichuan\datasets'
empties_dir = r'D:\Data\workspace\python\projects\CNDataAuditOutput\sichuan\empty_datasets'
anomalies_dir = r'D:\Data\workspace\python\projects\CNDataAuditOutput\sichuan\anomalous_datasets'
catalog_path = r'D:\Data\workspace\python\projects\CNDataAuditOutput\sichuan\dataset_catalog.json'

# Column dtypes handed to pd.read_json when the catalog is loaded.
dtype = {'name': str, 'id': str, 'URL': str, 'owner': str, 'category': 'category',
         'published': 'datetime64[ns]', 'updated': 'datetime64[ns]',
         'frequency': 'category', 'sample_data': object}

def dataset_files():
    """Return a lazy iterator over the names of the ``.csv`` entries in ``dataset_dir``.

    The directory is listed when the function is called; only the filtering
    is deferred. Names are bare file names (no directory part) and the
    extension match is case-sensitive.
    """
    return filter(lambda entry: entry.endswith('.csv'), os.listdir(dataset_dir))

In [68]:
# Load the dataset catalog, applying the column dtypes declared in `dtype`.
df = pd.read_json(catalog_path, dtype=dtype)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3777 entries, 0 to 3776
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   name         3777 non-null   object        
 1   id           3777 non-null   object        
 2   URL          3777 non-null   object        
 3   owner        3777 non-null   object        
 4   category     3777 non-null   category      
 5   published    3777 non-null   datetime64[ns]
 6   updated      3777 non-null   datetime64[ns]
 7   frequency    3777 non-null   category      
 8   sample_data  3776 non-null   object        
dtypes: category(2), datetime64[ns](2), object(5)
memory usage: 215.1+ KB
None


In [None]:
# Number of catalog records in each category.
df['category'].value_counts() 

In [None]:
# Number of catalog records for each declared update frequency.
df['frequency'].value_counts()

In [None]:
# Compare the CSV files on disk with the catalog, in both directions:
#   * datasets that have a file but no catalog record
#   * datasets that have a catalog record but no file
# The file listing comes from the shared dataset_files() helper so that this
# cell and the integrity checks agree on what counts as a dataset file.
files = list(dataset_files())
dataset_names_from_files = [os.path.splitext(f)[0] for f in files]
dataset_names_from_catalog = df['name'].tolist()

# Build each set once; both are needed for the differences and for the counts.
file_name_set = set(dataset_names_from_files)
catalog_name_set = set(dataset_names_from_catalog)

datasets_not_in_catalog = file_name_set - catalog_name_set
datasets_in_catalog_not_found = catalog_name_set - file_name_set

# First line: distinct names on disk, distinct names in the catalog.
# Second line: names only on disk, names only in the catalog.
print(len(file_name_set), len(catalog_name_set))
print(len(datasets_not_in_catalog), len(datasets_in_catalog_not_found))

In [None]:
# Collect every catalog record whose 'name' occurs more than once.
# keep=False flags all members of a duplicate group, not only the repeats.
duplicates = df.loc[df['name'].duplicated(keep=False)]

# Report the affected names (sorted, so equal names sit next to each other)
# followed by the number of affected records.
print(
    "Duplicate records based on 'name':",
    duplicates['name'].sort_values(),
    len(duplicates),
    sep='\n',
)

In [None]:
def remove_files(names):
    """Delete ``<name>.csv`` from ``dataset_dir`` for each name in ``names``.

    A name without a matching file is reported on stdout and skipped. After
    the loop the number of files actually deleted is printed.

    Args:
        names: iterable of dataset names (file stems, without the extension).
    """
    removed = 0
    for stem in names:
        target = os.path.join(dataset_dir, "{}.csv".format(stem))
        if not os.path.exists(target):
            print(f"File not found: {target}")
            continue
        os.remove(target)
        removed += 1
    print(removed)

# remove_files(datasets_not_in_catalog)
# remove_files(set(duplicates['name'].tolist()))

In [None]:
def remove_records(names):
    """Return the catalog without the records whose 'name' is in ``names``.

    Every column of the result is converted to strings except 'sample_data',
    which keeps its original objects. The number of remaining records is
    printed. The module-level ``df`` is not modified.

    Args:
        names: collection of dataset names to drop.

    Returns:
        A new DataFrame holding the remaining records.
    """
    kept = df.loc[~df['name'].isin(names)]
    # Stringify everything, then put the untouched sample_data column back.
    stringified = kept.astype(str).assign(sample_data=kept['sample_data'])
    print(len(stringified))
    return stringified

# filtered = remove_records(datasets_in_catalog_not_found)
# Drop every record whose name is shared with another record (all members of
# each duplicate group), then write the result as a list of JSON records.
filtered = remove_records(set(duplicates['name'].tolist()))
# force_ascii=False keeps non-ASCII text as-is instead of \u escapes; the
# relative path means the file lands in the current working directory.
filtered.to_json('updated_catalog.json', orient='records', force_ascii=False)

In [None]:
# Move empty datasets (sample_data = {'null': 'null'}) 
# into a separate directory

def move_empty_datasets():
    """Move the CSV of every catalog record with placeholder sample data into ``empties_dir``.

    A record counts as empty when its ``sample_data`` equals
    ``{'null': 'null'}``. ``empties_dir`` is created if needed.

    A flagged record whose CSV is not present in ``dataset_dir`` (for example
    because it was moved by an earlier run) is reported on stdout and
    skipped, so the function does not abort half-way and can be re-run.
    """
    os.makedirs(empties_dir, exist_ok=True)
    # astype(bool) keeps the mask boolean even when the catalog has no rows.
    is_empty_sample = df['sample_data'].apply(lambda x: x == {'null': 'null'}).astype(bool)

    # Only the name is needed, so iterate that column instead of whole rows.
    for name in df.loc[is_empty_sample, 'name']:
        file_name = f"{name}.csv"
        source_path = os.path.join(dataset_dir, file_name)
        if not os.path.exists(source_path):
            print(f"File not found: {source_path}")
            continue
        destination_path = os.path.join(empties_dir, file_name)
        shutil.move(source_path, destination_path)
        print(f"Moved '{file_name}' to {empties_dir}")

In [3]:
# List the distinct update-frequency labels; these are the values that the
# timeliness evaluation has to map to a number of days.
print("Unique frequencies in the dataset:", df['frequency'].unique())

Unique frequencies in the dataset: ['每年', '不定期', '实时', '每季度', '每月', '每天', '每半年', '每周']
Categories (8, object): ['不定期', '实时', '每半年', '每周', '每天', '每季度', '每年', '每月']


In [71]:
from datetime import timedelta


def evaluate_timeliness(df):
    def is_timely(dataset, now=None):
        if now is None:
            now = pd.Timestamp.now()

        max_days = frequency_days.get(dataset['frequency'], None)
        if max_days is None:
            return None
        return now - dataset['updated'] <= timedelta(days=max_days)

    frequency_days = {
        '实时': 0,
        '每天': 1,
        '每周': 7,
        '每月': 30,
        '每季度': 90,
        '每半年': 183,
        '每年': 365,
    }
    downloaded = pd.Timestamp('2024-04-30 20:00:00')
    is_timely = partial(is_timely, now=downloaded)
    df['is_timely'] = df.apply(is_timely, axis=1)


def print_timeliness_freq_distribution():
    """Print the frequency counts of timely, then of untimely, datasets.

    Reads the module-level ``timely_freq_counts`` and
    ``untimely_freq_counts``; each is printed under its own heading line.
    """
    print("Frequency distribution for timely updates:", timely_freq_counts, sep="\n")
    print("Frequency distribution for untimely updates:", untimely_freq_counts, sep="\n")


def visualize_timeliness_distribution(save_path=None):
    """Draw two pie charts: update-frequency shares of timely and of untimely datasets.

    Reads the module-level ``timely_freq_counts`` and
    ``untimely_freq_counts``. Frequencies with a zero count are left out of
    each pie. Every wedge is labelled with its dataset count and percentage.

    Args:
        save_path: if given, the figure is also saved to this path before it
            is shown.
    """
    def autopct_format(values):
        # The total is fixed per pie, so compute it once rather than for every wedge.
        total = sum(values)

        def my_format(pct):
            if pct > 0:
                # Recover the absolute count from the percentage matplotlib passes in.
                val = int(round(pct * total / 100.0))
                return '{v:d} ({p:.2f}%)'.format(v=val, p=pct)
            return ''
        return my_format

    _timely_freq_counts = timely_freq_counts[timely_freq_counts > 0]
    _untimely_freq_counts = untimely_freq_counts[untimely_freq_counts > 0]

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))
    axes[0].pie(_timely_freq_counts, labels=_timely_freq_counts.index,
                autopct=autopct_format(_timely_freq_counts), startangle=140)
    # Each pie needs a formatter built from its OWN counts; using the timely
    # counts here scaled the untimely wedge labels by the wrong total.
    axes[1].pie(_untimely_freq_counts, labels=_untimely_freq_counts.index,
                autopct=autopct_format(_untimely_freq_counts), startangle=140)
    axes[0].set_title('及时更新的数据集更新频率分布')
    axes[1].set_title('未及时更新的数据集更新频率分布')
    fig.text(0.5, 0.01, '评估时间：2024-04-30 20:00:00', ha='center', va='bottom', fontsize=10)
    fig.tight_layout()
    if save_path is not None:
        fig.savefig(save_path)
    plt.show()

# Annotate the catalog in place with the `is_timely` column.
evaluate_timeliness(df)

# `is_timely` holds True / False / None, so select each group by an explicit
# comparison; rows with None fall into neither frame.
timely_df = df.loc[df['is_timely'].eq(True)]
untimely_df = df.loc[df['is_timely'].eq(False)]
timely_freq_counts, untimely_freq_counts = (
    subset['frequency'].value_counts() for subset in (timely_df, untimely_df)
)

print_timeliness_freq_distribution()
# visualize_timeliness_distribution(save_path='timeliness_distribution.png')

Frequency distribution for timely updates:
frequency
每年     980
每半年     15
每月       1
不定期      0
实时       0
每周       0
每天       0
每季度      0
Name: count, dtype: int64
Frequency distribution for untimely updates:
frequency
每天     330
实时     246
每月     212
每半年    102
每季度     54
每周      18
每年       1
不定期      0
Name: count, dtype: int64


In [85]:
# Overall split of the timeliness flag; dropna=False keeps the rows whose
# frequency could not be evaluated in the tally.
df['is_timely'].value_counts(dropna=False)

is_timely
None     1818
True      996
False     963
Name: count, dtype: int64

In [65]:
def evaluate_csv_integrity():
    """Run ``detect_csv_anomalies`` on every file yielded by ``dataset_files()``.

    Returns:
        A DataFrame with one row per file and two columns: '文件名' (the file
        name without its extension) and '首个异常行标' (the value returned by
        ``detect_csv_anomalies`` for that file).
    """
    first_anomaly_by_name = {
        os.path.splitext(csv_name)[0]: detect_csv_anomalies(os.path.join(dataset_dir, csv_name))
        for csv_name in dataset_files()
    }
    return pd.DataFrame(list(first_anomaly_by_name.items()), columns=['文件名', '首个异常行标'])

def detect_csv_anomalies(file_path, encoding='gbk'):
    """Find the first data row whose field count differs from the header row's.

    Args:
        file_path: path of the CSV file to inspect.
        encoding: text encoding used to read the file. Bytes that cannot be
            decoded are dropped (``errors='ignore'``) rather than raising.

    Returns:
        The 1-based index of the first offending data row (the header is row
        0), or -1 when every row matches the header. A file with no rows at
        all also yields -1, since it contains no inconsistent row.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for files passed to csv.reader.
    with open(file_path, encoding=encoding, errors='ignore', newline='') as file:
        reader = csv.reader(file)
        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to compare against.
            return -1
        expected_columns = len(header)
        for row_index, row in enumerate(reader, 1):
            if len(row) != expected_columns:
                return row_index
    return -1

def move_anomalies(destination_dir):
    """Move every dataset file that ``detect_csv_anomalies`` flags into ``destination_dir``.

    The destination directory is created if needed. A file is flagged when
    ``detect_csv_anomalies`` returns anything other than -1; each move is
    reported on stdout.

    Args:
        destination_dir: directory that receives the flagged files.
    """
    os.makedirs(destination_dir, exist_ok=True)

    for csv_name in dataset_files():
        source_path = os.path.join(dataset_dir, csv_name)
        if detect_csv_anomalies(source_path) == -1:
            # Well-formed file: leave it where it is.
            continue
        destination_path = os.path.join(destination_dir, csv_name)
        shutil.move(source_path, destination_path)
        print(f"Moved '{source_path}' to '{destination_path}'")


# Scan every dataset CSV and record, per file, the index of its first
# malformed row (-1 when detect_csv_anomalies found none).
anomalies_df = evaluate_csv_integrity()
# Left disabled: uncommenting this moves the flagged files into anomalies_dir.
# move_anomalies(anomalies_dir)

In [72]:
# Tally the first-anomaly index across files; -1 is the value
# detect_csv_anomalies returns when no row deviates from the header.
print(anomalies_df['首个异常行标'].value_counts())
# Show only the files in which a malformed row was found.
anomalies_df[anomalies_df['首个异常行标'] != -1]

首个异常行标
-1    2972
Name: count, dtype: int64


Unnamed: 0,文件名,首个异常行标


In [None]:
# Spot-check: try to parse one specific dataset with pandas.
dataset_name = '绵阳市_生态环境局_绵阳市信访列表'
dataset_file_path = os.path.join(dataset_dir, f'{dataset_name}.csv')
# No try/except here: the previous handler only re-raised the same exception,
# so any read or parse error surfaces unchanged either way.
temp = pd.read_csv(dataset_file_path, encoding='gbk')
