In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import re
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
class Config:
    """Static settings shared by the preprocessing functions in this notebook."""

    # Input file, read as JSON Lines.
    DATA_FILE_PATH = './dataset/arxiv-metadata-oai-snapshot.json'
    # Pickle outputs: the embedding matrix and the sampled, processed DataFrame.
    OUTPUT_EMBEDDING_PATH = './dataset/arxiv_paper_embeddings.pkl'
    OUTPUT_PROCESSED_DATA_PATH = './dataset/processed_arxiv_data.pkl'
    # Model name handed to SentenceTransformer.
    MODEL_NAME = 'all-mpnet-base-v2'
    # Maximum number of records sampled from each calendar year.
    SAMPLE_SIZE_PER_YEAR = 1000
    # Inclusive (start_year, end_year) buckets: one pooled bucket for 1986-1992,
    # then one bucket per year from 1993 through 2025. Built in a single
    # expression so that no loop variable is left behind as a class attribute.
    TIME_PERIODS = [(1986, 1992)] + [(year, year) for year in range(1993, 2026)]

In [3]:
def extract_year_from_versions(versions):
    """Extract the submission year from the first entry of a ``versions`` field.

    Args:
        versions: Expected to be a non-empty list whose first element is a dict
            carrying a ``'created'`` string. Any other shape yields ``None``
            instead of raising.

    Returns:
        The first run of four digits found in ``versions[0]['created']`` as an
        ``int``, or ``None`` when the input is malformed or has no such run.
    """
    if not isinstance(versions, list) or not versions:
        return None

    first_version = versions[0]
    if not isinstance(first_version, dict):
        # A non-dict entry has no .get(); treat the record as having no date.
        return None

    created_date = first_version.get('created')
    if not isinstance(created_date, str):
        # A null or non-string 'created' would make re.search raise TypeError.
        return None

    match = re.search(r'(\d{4})', created_date)
    return int(match.group(1)) if match else None

def extract_year_fallback(arxiv_id):
    """Derive a year from an arXiv identifier (fallback when ``versions`` has none).

    The numeric part of an identifier is read as starting with ``YYMM``. Both
    layouts are accepted: a bare ``"0704.0001"`` and an identifier with a
    leading ``archive/`` part such as ``"hep-th/9901001"`` or
    ``"math.GT/0309136"``.

    Args:
        arxiv_id: Identifier string. Non-string values yield ``None``.

    Returns:
        ``1900 + YY`` for ``YY`` in 90-99, ``2000 + YY`` otherwise, or ``None``
        when the identifier does not start (after any ``archive/`` prefix)
        with four digits.
    """
    if not isinstance(arxiv_id, str):
        return None

    # Skip an optional "archive/" prefix, then capture the YY of the YYMM prefix.
    # [0-9] rather than \d keeps the match to ASCII digits.
    match = re.match(r'(?:.*/)?([0-9]{2})[0-9]{2}', arxiv_id.strip())
    if match is None:
        return None

    two_digit_year = int(match.group(1))
    # Same century split as before for 90-99; no upper cap on the 20xx side so
    # identifiers issued after 2025 are not silently discarded.
    if two_digit_year >= 90:
        return 1900 + two_digit_year
    return 2000 + two_digit_year

def load_and_sample_data(file_path: str, sample_size_per_year: int, random_state: int = 42) -> pd.DataFrame:
    """Load the JSON Lines metadata file and draw a per-year sample.

    Steps: read the file, derive a ``year`` for every record (from ``versions``
    first, from ``id`` for records still missing one), drop records without a
    year, sample each year independently, and label every sampled row with
    its ``Config.TIME_PERIODS`` bucket.

    Args:
        file_path: Path to the JSON Lines file.
        sample_size_per_year: Maximum number of rows kept per year; years with
            fewer rows are kept whole.
        random_state: Seed passed to ``DataFrame.sample`` for every year.
            Defaults to 42, the value that was previously hard-coded.

    Returns:
        The sampled rows with a fresh 0..n-1 index and two added columns:
        ``year`` (int) and ``time_period`` (``"start-end"`` or ``"other"``).
    """
    print(f"> Loading data from {file_path}...")
    # Force 'id' to stay text. With default dtype inference, identifiers such
    # as "0704.0001" are parsed as floats, and extract_year_fallback (which
    # only accepts strings) would then return None for every record.
    df = pd.read_json(file_path, lines=True, dtype={'id': str})
    print(f"> Total initial records: {len(df)}")

    print("> Extracting years from versions...")
    df['year'] = [
        extract_year_from_versions(version)
        for version in tqdm(df['versions'], desc="Extracting years")
    ]

    missing_year = df['year'].isna()
    if missing_year.any():
        print("> Some years missing from versions, falling back to ID-based extraction...")
        df.loc[missing_year, 'year'] = [
            extract_year_fallback(arxiv_id)
            for arxiv_id in tqdm(df.loc[missing_year, 'id'], desc="Fallback year extraction")
        ]

    # Records for which neither source produced a year are discarded.
    df.dropna(subset=['year'], inplace=True)
    df['year'] = df['year'].astype(int)

    print(f"> Records with valid year: {len(df)}")
    print(f"> Year range: {df['year'].min()} - {df['year'].max()}")

    # Sample each year separately so no single year dominates the result.
    sampled_dfs = [
        group.sample(n=min(sample_size_per_year, len(group)), random_state=random_state)
        for _, group in df.groupby('year')
    ]
    df_sampled = pd.concat(sampled_dfs).reset_index(drop=True)

    # Attach the time-period label.
    def assign_time_period(year):
        """Return the "start-end" label of the first bucket containing ``year``."""
        for start, end in Config.TIME_PERIODS:
            if start <= year <= end:
                return f"{start}-{end}"
        return "other"

    df_sampled['time_period'] = df_sampled['year'].apply(assign_time_period)

    print(f"> Total sampled records: {len(df_sampled)}")
    print(f"> Time period distribution:\n{df_sampled['time_period'].value_counts().sort_index()}")

    return df_sampled

def enhanced_preprocess_text(text: str) -> str:
    """Strip LaTeX command tokens from ``text`` and normalise its whitespace.

    A backslash followed by letters (``\\alpha``) or by a single brace
    (``\\{``, ``\\}``) is deleted; unescaped braces and all other characters
    are left alone. Whitespace runs collapse to one space and the result is
    trimmed. Non-string input produces an empty string.
    """
    if isinstance(text, str):
        # Drop the LaTeX commands first, then tidy the gaps they leave behind.
        without_commands = re.sub(r'\\(?:[a-zA-Z]+|[{}])', '', text)
        return re.sub(r'\s+', ' ', without_commands).strip()

    return ""

def prepare_topic_labeling_data(df):
    """Attach per-document TF-IDF keywords to ``df`` and return the fitted artifacts.

    A ``TfidfVectorizer`` (at most 150 features, English stop words, 1- to
    3-grams) is fitted on ``df['combined_text']``. For every row, up to five
    features with the highest scores are kept, skipping any whose score is
    not positive.

    Args:
        df: DataFrame with a ``combined_text`` column. It is modified in place:
            ``top_keywords`` (list of feature names, best first) and
            ``top_keywords_str`` (the same names joined with ``", "``) are added.

    Returns:
        ``(df, tfidf_info)`` where ``tfidf_info`` is a dict with the keys
        ``'vectorizer'``, ``'feature_names'`` and ``'matrix'``.
    """
    print("> Preparing topic labeling data with TF-IDF...")

    # Configure the vectorizer and fit it on the combined texts.
    vectorizer = TfidfVectorizer(max_features=150, stop_words='english', ngram_range=(1, 3))
    tfidf_matrix = vectorizer.fit_transform(df['combined_text'])
    feature_names = vectorizer.get_feature_names_out()

    print(f"> Extracted {len(feature_names)} features")

    def _row_keywords(row_idx):
        """Return the positive-scoring features among the row's five best, best first."""
        scores = tfidf_matrix[row_idx].toarray().flatten()
        best_first = scores.argsort()[-5:][::-1]
        return [feature_names[j] for j in best_first if scores[j] > 0]

    # One keyword list per document, with a progress bar over the rows.
    df['top_keywords'] = [
        _row_keywords(row_idx)
        for row_idx in tqdm(range(len(df)), desc="Extracting keywords")
    ]
    df['top_keywords_str'] = df['top_keywords'].apply(', '.join)

    # Hand the fitted objects back so callers can reuse them.
    return df, {
        'vectorizer': vectorizer,
        'feature_names': feature_names,
        'matrix': tfidf_matrix,
    }

In [4]:
def data_preprocessing():
    """Run the full pipeline: load and sample, clean text, extract keywords, embed, save.

    Returns:
        ``(df_sampled, embeddings_np)`` on success. ``(None, None)`` when the
        input file does not exist or when any step raises; in the latter case
        the error and its traceback are printed rather than propagated.
    """
    data_path = Config.DATA_FILE_PATH
    if not os.path.exists(data_path):
        print(f"> Error: Data file not found at {data_path}")
        return None, None

    def _first_category(categories):
        """Return the first whitespace-separated token, or '' for blank/non-string input."""
        if isinstance(categories, str) and categories.strip():
            return categories.split()[0]
        return ''

    try:
        # 1. Load the data and draw the per-year sample.
        df_sampled = load_and_sample_data(data_path, Config.SAMPLE_SIZE_PER_YEAR)

        # 2. Text preprocessing.
        print(">> Preprocessing text data...")
        for column in ('title', 'abstract'):
            df_sampled[column] = df_sampled[column].fillna('')

        print(">> Extracting primary category...")
        df_sampled['primary_category'] = df_sampled['categories'].apply(_first_category)

        # Title and abstract are joined with ". " before cleaning.
        title_and_abstract = df_sampled['title'] + '. ' + df_sampled['abstract']
        texts_to_process = title_and_abstract.fillna('').tolist()
        df_sampled['combined_text'] = [
            enhanced_preprocess_text(raw_text)
            for raw_text in tqdm(texts_to_process, desc="Preprocessing texts")
        ]

        # 3. Prepare topic-labeling keywords.
        df_sampled, tfidf_info = prepare_topic_labeling_data(df_sampled)

        # 4. Generate embeddings.
        print(">> Generating embeddings...")
        model = SentenceTransformer(Config.MODEL_NAME)
        embeddings = model.encode(
            df_sampled['combined_text'].tolist(),
            show_progress_bar=True,
            convert_to_tensor=True,
            batch_size=32,
        )
        # The tensor is moved to host memory before conversion to a NumPy array.
        embeddings_np = embeddings.cpu().numpy()
        # TODO: a manual tqdm progress display is still wanted here (not a big issue).

        # 5. Save the results.
        pd.DataFrame(embeddings_np).to_pickle(Config.OUTPUT_EMBEDDING_PATH)
        df_sampled.to_pickle(Config.OUTPUT_PROCESSED_DATA_PATH)

        print(f"\n--- Processing Complete ---")
        print(f"> Embeddings shape: {embeddings_np.shape}")
        print(f"> Sample data with keywords:")
        print(df_sampled[['title', 'year', 'top_keywords_str']].head(3))

        return df_sampled, embeddings_np

    except Exception as e:
        # Best-effort run: report the failure and return the empty result pair.
        print(f"> Error during processing: {e}")
        import traceback
        traceback.print_exc()
        return None, None

In [5]:
if __name__ == "__main__":
    # Run the whole pipeline. Both names are None when the input file is
    # missing or when a step inside data_preprocessing raised.
    df, embeddings = data_preprocessing()

> Loading data from ./dataset/arxiv-metadata-oai-snapshot.json...
> Total initial records: 2860945
> Extracting years from versions...


Extracting years: 100%|██████████| 2860945/2860945 [00:05<00:00, 513396.29it/s]


> Records with valid year: 2860945
> Year range: 1986 - 2025
> Total sampled records: 34387
> Time period distribution:
time_period
1986-1992    1387
1993-1993    1000
1994-1994    1000
1995-1995    1000
1996-1996    1000
1997-1997    1000
1998-1998    1000
1999-1999    1000
2000-2000    1000
2001-2001    1000
2002-2002    1000
2003-2003    1000
2004-2004    1000
2005-2005    1000
2006-2006    1000
2007-2007    1000
2008-2008    1000
2009-2009    1000
2010-2010    1000
2011-2011    1000
2012-2012    1000
2013-2013    1000
2014-2014    1000
2015-2015    1000
2016-2016    1000
2017-2017    1000
2018-2018    1000
2019-2019    1000
2020-2020    1000
2021-2021    1000
2022-2022    1000
2023-2023    1000
2024-2024    1000
2025-2025    1000
Name: count, dtype: int64
>> Preprocessing text data...
>> Extracting primary category...


Preprocessing texts: 100%|██████████| 34387/34387 [00:00<00:00, 49432.61it/s]


> Preparing topic labeling data with TF-IDF...
> Extracted 150 features


Extracting keywords: 100%|██████████| 34387/34387 [00:00<00:00, 56108.72it/s]


>> Generating embeddings...


Batches:   0%|          | 0/1075 [00:00<?, ?it/s]


--- Processing Complete ---
> Embeddings shape: (34387, 768)
> Sample data with keywords:
                              title  year                   top_keywords_str
0  Desperately Seeking Superstrings  1986          provide, analysis, theory
1    Applied Conformal Field Theory  1988  free, theory, field, critical, 10
2      Convex bodies with few faces  1989                     number, theory
