Import libraries

In [17]:
!pip install pandas
!pip install chardet
!pip install langdetect
!pip install deep_translator

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import os
import json
import pandas as pd
import chardet
from langdetect import detect
from deep_translator import GoogleTranslator



Detect Encoding

In [19]:
def detect_encoding(file_path):
    """Detect the encoding of a file."""
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read(100000))  # Read a sample
    return result['encoding']


Convert JSON to CSV

In [20]:
def convert_json_to_csv(json_file, csv_file):
    """Convert JSON to CSV."""
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.json_normalize(data)
    df.to_csv(csv_file, index=False)



Load Data into Pandas dataframe

In [21]:
def load_data(file_path):
    """Load CSV or TSV data into a pandas DataFrame."""
    encoding = detect_encoding(file_path)
    sep = '\t' if file_path.endswith('.tsv') else ','
    return pd.read_csv(file_path, encoding=encoding, sep=sep)



Check column headers

In [22]:
def check_headers(df):
    """Check if headers exist and identify missing or duplicate headers."""
    print(f"Checking headers")
    if df.columns.str.contains('Unnamed').any():
        print("Warning: Dataset might be missing headers.")
    duplicate_headers = df.columns[df.columns.duplicated()].tolist()
    if duplicate_headers:
        print(f"Duplicate headers found: {duplicate_headers}")


Translate foreign column headers to English

In [23]:
def translate_headers(df):
    """Detect and translate foreign language headers to English."""
    print(f"Translating any foreign language headers")
    translated_headers = []
    for col in df.columns:
        try:
            lang = detect(col)
            if lang != 'en':
                translated_col = GoogleTranslator(source='auto', target='en').translate(col)
                translated_headers.append(translated_col)
            else:
                translated_headers.append(col)
        except:
            translated_headers.append(col)
    df.columns = translated_headers



Analyze structure

In [24]:
def analyze_structure(df):
    """Analyze dataset structure."""
    print(f"Analyzing structure")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("Column data types:")
    print(df.dtypes)



Find common columns across CSVs

In [25]:
def find_common_columns(dfs):
    """Identify common columns across multiple CSV files."""
    print(f"Searching for common columns across CSVs")
    common_cols = set(dfs[0].columns)
    for df in dfs[1:]:
        common_cols.intersection_update(df.columns)
    print(f"Common columns across datasets: {common_cols}")



Check for missing values

In [26]:
def check_missing_values(df):
    """Identify missing values."""
    print(f"Identifying missing values")
    missing_values = df.isnull().sum()
    print("Missing values per column:")
    print(missing_values[missing_values > 0])



Check for duplicate rows

In [27]:
def check_duplicate_rows(df):
    """Identify duplicate rows."""
    print(f"Identifying duplicate rows")
    duplicates = df.duplicated().sum()
    print(f"Duplicate rows: {duplicates}")



Check for invalid characters

In [28]:
def check_invalid_characters(df):
    """Identify invalid characters in text columns."""
    print(f"Identifying invalid characters in text columns")
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.replace(r'[^\w\s]', '', regex=True)



Check for mixed data types

In [29]:
def check_mixed_data_types(df):
    """Identify columns with mixed data types."""
    print(f"Identifying columns with mixed data types")
    for col in df.columns:
        types = df[col].map(type).nunique()
        if types > 1:
            print(f"Column '{col}' has mixed data types.")



Check for outliers

In [30]:
def check_outliers(df):
    """Detect outliers using IQR method."""
    print(f"Detecting outliers")
    for col in df.select_dtypes(include=['number']).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        if not outliers.empty:
            print(f"Column '{col}' has {len(outliers)} potential outliers.")


Script Engine

In [31]:
def main(directory):
    """Main function to execute dataset analysis."""
    files = [f for f in os.listdir(directory) if f.endswith(('.csv', '.tsv', '.json'))]
    datasets = []
    
    for file in files:
        file_path = os.path.join(directory, file)
        if file.endswith('.json'):
            csv_file = file_path.replace('.json', '.csv')
            convert_json_to_csv(file_path, csv_file)
            file_path = csv_file
        df = load_data(file_path)
        print(f"Analyzing {file}...")
        check_headers(df)
        translate_headers(df)
        analyze_structure(df)
        check_missing_values(df)
        check_duplicate_rows(df)
        check_mixed_data_types(df)
        check_outliers(df)
        datasets.append(df)
    
    if len(datasets) > 1:
        find_common_columns(datasets)

if __name__ == "__main__":
    main("./Zhaopin/datasets/records")


  return pd.read_csv(file_path, encoding=encoding, sep=sep)


Analyzing zhaopin.csv...
Checking headers
Translating any foreign language headers
Analyzing structure
Rows: 1606289, Columns: 13
Column data types:
BID              object
EMail            object
Name             object
Birthday         object
ID card          object
cell phone       object
Education        object
income           object
Industry Code    object
Working hours    object
Field 15         object
hangye           object
memo             object
dtype: object
Identifying missing values
Missing values per column:
EMail                237
Name               22707
Birthday           21508
ID card           199640
cell phone         70736
Education         239897
income            707454
Industry Code     707385
Working hours     196537
Field 15             218
hangye           1371663
memo              617709
dtype: int64
Identifying duplicate rows
Duplicate rows: 199
Identifying columns with mixed data types
Column 'BID' has mixed data types.
Column 'EMail' has mixed data type