In [1]:
import tarfile
import gzip
import pandas as pd
import os
import yaml
import numpy as np

# 30개의 컬럼 정의
columns = ['name', 'version', 'authors', 'email', 'summary', 'description', 'date',
           'files', 'test_files', 'autorequire', 'executables', 'require_paths',
           'dependencies', 'runtime_dependencies', 'development_dependencies', 'extensions', 'requirements',
           'homepage', 'metadata', 'licenses',
           'platform', 'required_ruby_version', 'required_rubygems_version', 'rubygems_version',
           'extra_rdoc_files', 'rdoc_options', 'specification_version',
           'cert_chain', 'signing_key', 'post_install_message']

# 메타데이터 추출 함수
def extract_metadata_from_gem_files(root_folder, output_filename):
    data = []

    for root, dirs, files in os.walk(root_folder):
        for file in files:
            if file.endswith(".gem"):
                gem_file_path = os.path.join(root, file)
                try:
                    with tarfile.open(gem_file_path, 'r') as gem_file:
                        # metadata.gz 파일을 추출
                        metadata_gz = None
                        for member in gem_file.getmembers():
                            if member.name.endswith("metadata.gz"):
                                metadata_gz = gem_file.extractfile(member)
                                break
                        if metadata_gz is None:
                            print(f"metadata.gz not found in {gem_file_path}")
                            continue

                        # metadata.gz 파일을 gzip 해제
                        with gzip.GzipFile(fileobj=metadata_gz) as f:
                            metadata_content = f.read()

                        # YAML 파싱 전에 특수 태그 제거
                        metadata_text = metadata_content.decode("utf-8")
                        metadata_text = metadata_text.replace("!ruby/object:Gem::Specification", "")
                        metadata_text = metadata_text.replace("!ruby/object:Gem::Version", "")
                        metadata_text = metadata_text.replace("!ruby/object:Gem::Dependency", "")
                        metadata_text = metadata_text.replace("!ruby/object:Gem::Requirement", "")

                        # YAML 파싱
                        metadata = yaml.safe_load(metadata_text)

                        # 각 특성 추출 및 null 값 처리
                        row = []
                        for col in columns:
                            value = metadata.get(col, np.nan)  # 기본적으로 np.nan으로 처리
                            if isinstance(value, list) and not value:  # 빈 리스트 처리
                                value = np.nan
                            if isinstance(value, str) and (not value.strip() or value in [' ', '']):  # 빈 문자열, 띄어쓰기, 공백 처리
                                value = np.nan
                            if value is None:  # None 값도 np.nan으로 처리
                                value = np.nan
                            row.append(value)

                        # dependencies를 runtime과 development로 분리하여 추출
                        dependencies = metadata.get('dependencies', np.nan)
                        runtime_dependencies = []
                        development_dependencies = []

                        if dependencies is not np.nan:
                            for dep in dependencies:
                                dep_type = dep.get('type', '')
                                if dep_type == ':runtime':
                                    runtime_dependencies.append(dep)
                                elif dep_type == ':development':
                                    development_dependencies.append(dep)

                            if not runtime_dependencies:
                                runtime_dependencies = np.nan
                            if not development_dependencies:
                                development_dependencies = np.nan
                        else:
                            runtime_dependencies = np.nan
                            development_dependencies = np.nan

                        # dependencies를 원본 그대로 유지하고, 분류된 것을 각각 추가
                        row[columns.index('dependencies')] = dependencies
                        row[columns.index('runtime_dependencies')] = runtime_dependencies
                        row[columns.index('development_dependencies')] = development_dependencies

                        data.append(row)

                except Exception as e:
                    print(f"Error processing {gem_file_path}: {e}")

    df = pd.DataFrame(data, columns=columns)

    # 각 요소를 검사하여 None, 빈 문자열, 빈 리스트 등을 np.nan으로 통일
    df = df.applymap(lambda x: np.nan if x in [None, '', {}, []] else x)

    # DataFrame을 CSV 파일로 저장
    output_folder = "../data/metadata/raw"  # 상대 경로로 수정
    os.makedirs(output_folder, exist_ok=True)
    df.to_csv(os.path.join(output_folder, output_filename), index=False)

    return df

# 함수 실행
root_folder = "../data/package"  # 상대 경로로 수정
output_filename = "extracted_metadata.csv"  # 저장할 파일 이름을 변수로 지정
df_metadata = extract_metadata_from_gem_files(root_folder, output_filename)

print("Data extraction complete. CSV file saved.")

Error processing ../data/package/neutral\crawling\escape-0.0.4.gem: mapping values are not allowed here
  in "<unicode string>", line 20, column 15:
      requirements: 
                  ^
Data extraction complete. CSV file saved.


  df = df.applymap(lambda x: np.nan if x in [None, '', {}, []] else x)
