In [3]:
import tarfile
import gzip
import pandas as pd
import os
import yaml
import numpy as np
import logging

# 로깅 설정
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 메타데이터 추출 함수
def extract_package_metadata(input_path="../data/package", 
                             output_path="../data/metadata/raw", 
                             output_filename="extracted_metadata.csv"):
    
    # 31개의 컬럼 정의
    columns = ['id', 'name', 'version', 'authors', 'email', 'summary', 'description', 'date',
               'files', 'test_files', 'autorequire', 'executables', 'require_paths',
               'dependencies', 'runtime_dependencies', 'development_dependencies', 'extensions', 'requirements',
               'homepage', 'metadata', 'licenses',
               'platform', 'required_ruby_version', 'required_rubygems_version', 'rubygems_version',
               'extra_rdoc_files', 'rdoc_options', 'specification_version',
               'cert_chain', 'signing_key', 'post_install_message']

    data = []

    for root, dirs, files in os.walk(input_path):
        for file in files:
            if file.endswith(".gem"):
                gem_file_path = os.path.join(root, file)
                try:
                    with tarfile.open(gem_file_path, 'r') as gem_file:
                        # metadata.gz 파일을 추출
                        metadata_gz = next((gem_file.extractfile(member) for member in gem_file.getmembers() if member.name.endswith("metadata.gz")), None)
                        if metadata_gz is None:
                            logging.warning(f"metadata.gz not found in {gem_file_path}")
                            continue

                        # metadata.gz 파일을 gzip 해제
                        with gzip.GzipFile(fileobj=metadata_gz) as f:
                            metadata_content = f.read()

                        # YAML 파싱 전에 특수 태그 제거 (replace를 한 번에 처리)
                        tags_to_remove = ["!ruby/object:Gem::Specification", "!ruby/object:Gem::Version", 
                                          "!ruby/object:Gem::Dependency", "!ruby/object:Gem::Requirement"]
                        metadata_text = metadata_content.decode("utf-8")
                        for tag in tags_to_remove:
                            metadata_text = metadata_text.replace(tag, "")

                        # YAML 파싱
                        metadata = yaml.safe_load(metadata_text)

                        # 각 특성 추출 및 null 값 처리
                        row = []
                        name = metadata.get('name', '')
                        version = metadata.get('version', '')
                        # id 컬럼 생성
                        id_value = f"{name}-{version}" if name and version else np.nan
                        row.append(id_value)  # id 추가
                        
                        for col in columns[1:]:  # 이미 id는 추가했으므로 나머지 컬럼만 처리
                            value = metadata.get(col, np.nan)  # 기본적으로 np.nan으로 처리
                            if isinstance(value, list) and not value:  # 빈 리스트 처리
                                value = np.nan
                            if isinstance(value, str) and (not value.strip() or value in [' ', '']):  # 빈 문자열, 띄어쓰기, 공백 처리
                                value = np.nan
                            if value is None:  # None 값도 np.nan으로 처리
                                value = np.nan
                            row.append(value)

                        # dependencies를 runtime과 development로 분리하여 추출
                        dependencies = metadata.get('dependencies', np.nan)
                        runtime_dependencies = []
                        development_dependencies = []

                        if dependencies is not np.nan:
                            for dep in dependencies:
                                dep_type = dep.get('type', '')
                                if dep_type == ':runtime':
                                    runtime_dependencies.append(dep)
                                elif dep_type == ':development':
                                    development_dependencies.append(dep)

                            if not runtime_dependencies:
                                runtime_dependencies = np.nan
                            if not development_dependencies:
                                development_dependencies = np.nan
                        else:
                            runtime_dependencies = np.nan
                            development_dependencies = np.nan

                        # dependencies를 원본 그대로 유지하고, 분류된 것을 각각 추가
                        row[columns.index('dependencies')] = dependencies
                        row[columns.index('runtime_dependencies')] = runtime_dependencies
                        row[columns.index('development_dependencies')] = development_dependencies

                        data.append(row)

                except Exception as e:
                    logging.error(f"Error processing {gem_file_path}: {e}")

    df = pd.DataFrame(data, columns=columns)

    # 각 요소를 검사하여 None, 빈 문자열, 빈 리스트 등을 np.nan으로 통일
    df = df.applymap(lambda x: np.nan if x in [None, '', {}, []] else x)

    # DataFrame을 CSV 파일로 저장
    os.makedirs(output_path, exist_ok=True)
    df.to_csv(os.path.join(output_path, output_filename), index=False)

    logging.info(f"Data extraction complete. CSV file saved at {os.path.join(output_path, output_filename)}")
    
    # 데이터 추출 완료 메시지 출력
    print("Data extraction complete.")
    
    # 상위 5개 행 출력
    print(df.head())
    
    return df

# 함수 실행 예시
extract_package_metadata()

2024-10-14 22:35:41,240 - ERROR - Error processing ../data/package/neutral/crawling\escape-0.0.4.gem: mapping values are not allowed here
  in "<unicode string>", line 20, column 15:
      requirements: 
                  ^
  df = df.applymap(lambda x: np.nan if x in [None, '', {}, []] else x)
2024-10-14 22:35:48,656 - INFO - Data extraction complete. CSV file saved at ../data/metadata/raw\extracted_metadata.csv


Data extraction complete.
                                   id           name                version  \
0           aasm-{'version': '5.5.0'}           aasm   {'version': '5.5.0'}   
1   acme-client-{'version': '2.0.18'}    acme-client  {'version': '2.0.18'}   
2    actioncable-{'version': '7.2.0'}    actioncable   {'version': '7.2.0'}   
3  actionmailbox-{'version': '7.2.0'}  actionmailbox   {'version': '7.2.0'}   
4   actionmailer-{'version': '7.2.0'}   actionmailer   {'version': '7.2.0'}   

                                       authors  \
0             [Thorsten Boettger, Anil Maurya]   
1                            [Charles Barbier]   
2      [Pratik Naik, David Heinemeier Hansson]   
3  [David Heinemeier Hansson, George Claghorn]   
4                   [David Heinemeier Hansson]   

                                            email  \
0           aasm@mt7.de, anilmaurya8dec@gmail.com   
1                         [unixcharles@gmail.com]   
2  [pratiknaik@gmail.com, david@loudthi

Unnamed: 0,id,name,version,authors,email,summary,description,date,files,test_files,...,platform,required_ruby_version,required_rubygems_version,rubygems_version,extra_rdoc_files,rdoc_options,specification_version,cert_chain,signing_key,post_install_message
0,aasm-{'version': '5.5.0'},aasm,{'version': '5.5.0'},"[Thorsten Boettger, Anil Maurya]","aasm@mt7.de, anilmaurya8dec@gmail.com",State machine mixin for Ruby objects,AASM is a continuation of the acts-as-state-ma...,2023-02-05 00:00:00+00:00,"[LICENSE, README.md, lib/aasm.rb, lib/aasm/aas...",,...,ruby,"{'requirements': [['>=', {'version': '1.9.3'}]]}","{'requirements': [['>=', {'version': '0'}]]}",3.0.9,,,4,,,
1,acme-client-{'version': '2.0.18'},acme-client,{'version': '2.0.18'},[Charles Barbier],[unixcharles@gmail.com],Client for the ACME protocol.,,2024-06-14 00:00:00+00:00,"[CHANGELOG.md, Gemfile, LICENSE.txt, README.md...",,...,ruby,"{'requirements': [['>=', {'version': '2.3.0'}]]}","{'requirements': [['>=', {'version': '0'}]]}",3.4.20,,,4,,,
2,actioncable-{'version': '7.2.0'},actioncable,{'version': '7.2.0'},"[Pratik Naik, David Heinemeier Hansson]","[pratiknaik@gmail.com, david@loudthinking.com]",WebSocket framework for Rails.,Structure many real-time application concerns ...,2024-08-09 00:00:00+00:00,"[CHANGELOG.md, MIT-LICENSE, README.md, app/ass...",,...,ruby,"{'requirements': [['>=', {'version': '3.1.0'}]]}","{'requirements': [['>=', {'version': '0'}]]}",3.5.11,,,4,,,
3,actionmailbox-{'version': '7.2.0'},actionmailbox,{'version': '7.2.0'},"[David Heinemeier Hansson, George Claghorn]","[david@loudthinking.com, george@basecamp.com]",Inbound email handling framework.,Receive and process incoming emails in Rails a...,2024-08-09 00:00:00+00:00,"[CHANGELOG.md, MIT-LICENSE, README.md, app/con...",,...,ruby,"{'requirements': [['>=', {'version': '3.1.0'}]]}","{'requirements': [['>=', {'version': '0'}]]}",3.5.11,,,4,,,
4,actionmailer-{'version': '7.2.0'},actionmailer,{'version': '7.2.0'},[David Heinemeier Hansson],david@loudthinking.com,Email composition and delivery framework (part...,"Email on Rails. Compose, deliver, and test ema...",2024-08-09 00:00:00+00:00,"[CHANGELOG.md, MIT-LICENSE, README.rdoc, lib/a...",,...,ruby,"{'requirements': [['>=', {'version': '3.1.0'}]]}","{'requirements': [['>=', {'version': '0'}]]}",3.5.11,,,4,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134,yajl-ruby-{'version': '1.4.3'},yajl-ruby,{'version': '1.4.3'},"[Brian Lopez, Lloyd Hilaiel]",seniorlopez@gmail.com,Ruby C bindings to the excellent Yajl JSON str...,,2022-05-26 00:00:00+00:00,"[.codeclimate.yml, .github/workflows/ci.yml, ....",,...,ruby,"{'requirements': [['>=', {'version': '2.6.0'}]]}","{'requirements': [['>=', {'version': '0'}]]}",3.3.3,,,4,,,
1135,yard-{'version': '0.9.36'},yard,{'version': '0.9.36'},[Loren Segal],lsegal@soen.ca,Documentation tool for consistent and usable d...,YARD is a documentation generation tool fo...,2024-02-29 00:00:00+00:00,"[.yardopts, .yardopts_guide, .yardopts_i18n, C...",,...,ruby,"{'requirements': [['>=', {'version': '0'}]]}","{'requirements': [['>=', {'version': '0'}]]}",3.3.5,,,4,,,
1136,zeitwerk-{'version': '2.6.17'},zeitwerk,{'version': '2.6.17'},[Xavier Noria],fxn@hashref.com,Efficient and thread-safe constant autoloader,Zeitwerk implements constant autoloading w...,2024-07-29 00:00:00+00:00,"[MIT-LICENSE, README.md, lib/zeitwerk.rb, lib/...",,...,ruby,"{'requirements': [['>=', {'version': '2.5'}]]}","{'requirements': [['>=', {'version': '0'}]]}",3.5.15,,,4,,,
1137,zendesk_api-{'version': '3.1.0'},zendesk_api,{'version': '3.1.0'},"[Steven Davidovitz, Michael Grosser]",[support@zendesk.com],Zendesk REST API Client,Ruby wrapper for the REST API at https://www.z...,2024-07-01 00:00:00+00:00,"[LICENSE, lib/zendesk_api.rb, lib/zendesk_api/...",,...,ruby,"{'requirements': [['>=', {'version': '2.7'}]]}","{'requirements': [['>=', {'version': '1.3.6'}]]}",3.5.11,,,4,,,


In [9]:
import pandas as pd

def check_duplicates(input_path="../data/metadata/raw/", 
                     input_filename="extracted_metadata.csv", 
                     subset_columns=['name', 'version', 'rubygems_version'], 
                     output_path="../data/metadata/raw/", 
                     output_filename="checked_metadata.csv", 
                     duplicates_output_path="../data/metadata/raw/duplicate_metadata.csv"):

    # 데이터 읽기
    df = pd.read_csv(input_path + input_filename)

    # 중복된 행을 찾아 출력 및 DataFrame으로 저장
    duplicates = df[df.duplicated(subset=subset_columns, keep=False)]
    if not duplicates.empty:
        print("중복된 행의 정보:")
        print(duplicates[subset_columns])
        
        # 중복된 행을 별도의 파일로 저장
        duplicates.to_csv(duplicates_output_path, index=False)
        print(f"중복된 메타데이터가 {duplicates_output_path}에 저장되었습니다.")
    else:
        print("중복된 행이 없습니다.")

    # 중복된 행을 제거
    df_checked = df.drop_duplicates(subset=subset_columns, keep='first')

    # 중복 제거 전/후 행 개수 확인
    print(f"중복 제거 전 데이터프레임 크기: {df.shape}")
    print(f"중복 제거 후 데이터프레임 크기: {df_checked.shape}")

    # 중복 제거된 데이터를 새로운 파일로 저장
    df_checked.to_csv(output_path + output_filename, index=False)
    print(f"중복 제거된 데이터가 {output_path + output_filename}에 저장되었습니다.")

# 함수 호출 예시
check_duplicates()

중복된 행의 정보:
                  name                version rubygems_version
178     active-support   {'version': '5.2.0'}         2.6.14.1
326     aloha_analyser   {'version': '0.6.2'}            3.0.3
463          auto-cron  {'version': '0.1.16'}            3.0.3
471        awesome-bot  {'version': '1.18.0'}            3.0.3
527  blockchain_wallet   {'version': '0.0.7'}            3.0.3
..                 ...                    ...              ...
851          ruby_nmap   {'version': '0.9.3'}            3.0.3
854    simple_captcha2   {'version': '0.2.3'}            2.7.3
855           TacoBell   {'version': '0.1.0'}            3.0.3
856          unixCrypt   {'version': '1.3.0'}            3.0.3
857         unix_crypt   {'version': '1.3.0'}            3.0.3

[68 rows x 3 columns]
중복된 메타데이터가 ../data/metadata/raw/duplicate_metadata.csv에 저장되었습니다.
중복 제거 전 데이터프레임 크기: (860, 31)
중복 제거 후 데이터프레임 크기: (826, 31)
중복 제거된 데이터가 ../data/metadata/raw/checked_metadata.csv에 저장되었습니다.
