# 이력서 전처리 및 정보 추출 모듈

### 라이브러리 임포트

In [1]:
# 라이브러리 임포트 및 기본 설정
import pandas as pd
import numpy as np
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import unicodedata
import json
from collections import Counter
from spacy.matcher import Matcher
import logging

# 로깅 설정 (정보 수준 메시지 출력)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
# NLTK 리소스 다운로드
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


True

In [3]:
# spaCy 모델 로드
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
# 카테고리별 기술 키워드 정의
CATEGORY_KEYWORDS = {
    "Data Science": ["python", "machine learning", "pandas", "numpy", "deep learning", "tensorflow", "pytorch", "html", "matlab"],
    "Web Designing": ["photoshop", "figma", "adobe", "html", "css", "ux", "ui", "javascript", "react"],
    "HR": ["recruiting", "training", "interview", "employee", "onboarding", "talent"],
    "Advocate": ["litigation", "legal", "court", "civil", "criminal", "case"],
    "Health and fitness": ["fitness", "nutrition", "yoga", "wellness", "trainer"],
    "Java Developer": ["java", "spring", "hibernate", "j2ee", "jdbc"],
    "Python Developer": ["python", "django", "flask", "api", "pandas"],
    "DevOps Engineer": ["docker", "kubernetes", "jenkins", "terraform", "aws"],
    "Business Analyst": ["analysis", "report", "requirement", "power bi", "workflow"],
    "Electrical Engineering": ["circuit", "voltage", "control", "plc", "switchgear"],
    "Testing": ["selenium", "junit", "qa", "test automation", "bug tracking"],
    "Blockchain": ["blockchain", "ethereum", "smart contract", "cryptocurrency"]
}

TECH_KEYWORDS = [
    'python', 'java', 'c++', 'c', 'sql', 'mysql', 'mongodb', 'postgresql',
    'machine learning', 'deep learning', 'pytorch', 'tensorflow',
    'html', 'css', 'javascript', 'react', 'angular', 'node.js',
    'flask', 'spring boot', 'git', 'docker', 'aws', 'linux'
]


In [5]:
# Preprocessing 클래스 정의
class Preprocessing:
    def __init__(self):
        self.matcher = Matcher(nlp.vocab)
        self.degree_keywords = ["b.tech", "b.e.", "m.tech", "mca", "bachelor", "master", "ph.d", "diploma"]
        self.setup_matcher()

    def setup_matcher(self):
        """학력 패턴 설정"""
        degree_patterns = [
            [{"LOWER": {"REGEX": r"^(b\.tech|b\.e\.|m\.tech|mca|bachelor|master|ph\.d|diploma)$"}}],
            [{"TEXT": {"IN": [kw.lower() for kw in self.degree_keywords]}}]
        ]
        self.matcher.add("EDUCATION", degree_patterns)

    def clean_text(self, text, remove_numbers=False, remove_dates=True):
        """텍스트 정제: 유니코드 정규화, 특수문자 제거, 불용어 제거, 표제어 추출"""
        try:
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
            text = text.lower()
            text = re.sub(r'[^a-z0-9\s]', ' ', text)
            text = re.sub(r'\s+', ' ', text).strip()
            tokens = word_tokenize(text)
            tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
            if remove_numbers:
                tokens = [w for w in tokens if not w.isdigit()]
            if remove_dates:
                date_words = {'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'}
                tokens = [w for w in tokens if w not in date_words]
            # 연도 및 경험 기간 유지
            tokens = tokens + re.findall(r'\d+\s*(?:year|years)\s*(?:experience)?', text)
            return ' '.join(tokens)
        except Exception as e:
            logging.error(f"Text cleaning failed: {str(e)}")
            return text

    def get_skill_patterns(self, keywords):
        """기술 키워드에 대한 정규식 패턴 생성"""
        patterns = []
        for kw in keywords:
            if all(c.isalnum() or c == '_' for c in kw.replace(' ', '')):
                pattern = r'\b' + re.escape(kw) + r'\b'
            else:
                pattern = re.escape(kw)
            pattern = pattern.replace(r'\ ', r'\s+')
            patterns.append((pattern, kw))
        return patterns

    def extract_skills(self, text, category):
        """기술 키워드 추출: 원본 텍스트 사용"""
        all_keywords = CATEGORY_KEYWORDS.get(category, []) + TECH_KEYWORDS
        all_keywords = list(set(all_keywords))
        patterns = self.get_skill_patterns(all_keywords)
        text_lower = text.lower()
        skills = []
        for pattern, kw in patterns:
            if re.search(pattern, text_lower):
                skills.append(kw)
        logging.info(f"Extracted skills: {skills}")
        return list(set(skills))

    def extract_education(self, text):
        """학력 정보 추출: 학위, 기관, 연도"""
        try:
            doc = nlp(text)
            matches = self.matcher(doc)
            degrees = set(doc[start:end].text.lower() for match_id, start, end in matches)

            # 정규식으로 학위 보조 매칭
            for kw in self.degree_keywords:
                if re.search(r'\b' + re.escape(kw) + r'\b', text.lower(), re.IGNORECASE):
                    degrees.add(kw.lower())
            degrees = sorted(list(degrees))

            education = []
            exclude_keywords = {'education', 'recognition', 'details', 'secondary', 'skill', 'exprience', 'company'}
            institution_patterns = r'(?:institute|university|college|academy|school|of\s+[a-z]+)\b'

            # 전체 텍스트에서 연도 먼저 검색
            year_pattern = r'(?:january|february|march|april|may|june|july|august|september|october|november|december)?\s*\d{4}\s*(?:to|-)\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)?\s*\d{4}'
            year_match = re.search(year_pattern, text.lower(), re.IGNORECASE)
            default_year = year_match.group(0) if year_match else None

            # 문장 병합: 학위 포함 문장 + 다음 문장
            sentences = list(doc.sents)
            for i, sent in enumerate(sentences):
                sent_text = sent.text
                sent_lower = sent_text.lower()
                if any(re.search(r'\b' + re.escape(d) + r'\b', sent_lower, re.IGNORECASE) for d in degrees):
                    combined_text = sent_text
                    if i + 1 < len(sentences):
                        combined_text += ' ' + sentences[i + 1].text
                    combined_lower = combined_text.lower()

                    # 기관명 추출
                    orgs = [ent.text for ent in nlp(combined_text).ents if ent.label_ == "ORG" and not any(kw in ent.text.lower() for kw in exclude_keywords)]
                    if not orgs:
                        inst_match = re.search(institution_patterns, combined_lower, re.IGNORECASE)
                        if inst_match:
                            start_idx = max(0, combined_lower.find(inst_match.group(0)) - 150)
                            end_idx = combined_lower.find(inst_match.group(0)) + 200
                            org_text = combined_text[start_idx:end_idx].split('\r\n')[0].strip()
                            org_text = re.sub(r'\s+', ' ', org_text).strip()
                            if not any(kw in org_text.lower() for kw in exclude_keywords) and len(org_text) > 10:
                                orgs = [org_text]

                    # 연도 추출
                    year = default_year
                    if not year:
                        dates = [ent.text for ent in nlp(combined_text).ents if ent.label_ == "DATE" and re.search(r'\d{4}', ent.text)]
                        year = dates[0] if dates else None

                    if degrees:
                        education.append({
                            "degree": degrees,
                            "institution": orgs[0] if orgs else None,
                            "year": year
                        })
                        break

            # 학위만 발견된 경우
            if not education and degrees:
                education.append({
                    "degree": degrees,
                    "institution": None,
                    "year": default_year
                })

            logging.debug(f"Education extracted: {education}")
            return education[:1]
        except Exception as e:
            logging.error(f"Education extraction failed: {str(e)}")
            return []

    def extract_experience(self, text):
        """경력 정보 추출: 회사, 역할, 기간"""
        try:
            doc = nlp(text)
            experiences = []
            role_keywords = [
                "engineer", "developer", "manager", "analyst", "scientist", "programmer", "specialist",
                "consultant", "designer", "researcher", "architect", "administrator", "technician",
                "operator", "coordinator", "supervisor", "director", "executive", "assistant", "associate",
                "intern", "trainee"
            ]
            exclude_keywords = {
                'education', 'college', 'institute', 'university', 'school', 'secondary', 'exprience',
                'skill', 'details', 'recognition', 'company -', 'academic'
            }
            company_patterns = r'(?:at|with|worked\s+at|employed\s+by)\s+([^,;\r\n]+)'

            for sent in doc.sents:
                sent_text = sent.text.lower()
                if any(kw in sent_text for kw in exclude_keywords):
                    continue

                orgs = [ent.text for ent in sent.ents if ent.label_ == "ORG" and not any(kw in ent.text.lower() for kw in exclude_keywords)]
                if not orgs:
                    company_match = re.search(company_patterns, sent_text, re.IGNORECASE)
                    if company_match:
                        org_text = company_match.group(1).strip()
                        if not any(kw in org_text.lower() for kw in exclude_keywords):
                            orgs = [org_text]

                roles = [token.text for token in sent if token.text.lower() in role_keywords]
                if not roles:
                    # 의존성 파싱으로 역할 추출
                    for token in sent:
                        if token.dep_ in ("nsubj", "dobj") and token.text.lower() in role_keywords:
                            roles.append(token.text)

                dates = [ent.text for ent in sent.ents if ent.label_ == "DATE" and re.search(r'\d{4}', ent.text)]
                if not dates:
                    date_pattern = r'\d{4}(?:\s*(?:to|-)\s*\d{4})?'
                    date_match = re.search(date_pattern, sent_text)
                    if date_match:
                        dates = [date_match.group(0)]

                if orgs and dates:
                    experiences.append({
                        "company": orgs[0],
                        "role": roles[0] if roles else None,
                        "period": dates[0],
                        "description": sent.text.strip()
                    })

            logging.debug(f"Experience extracted: {experiences}")
            return experiences
        except Exception as e:
            logging.error(f"Experience extraction failed: {str(e)}")
            return []

    def process_dataframe(self, df, text_col='Resume', category_col='Category'):
        """데이터프레임 전체 전처리"""
        try:
            df['cleaned_resume'] = df[text_col].apply(lambda x: self.clean_text(x, remove_numbers=False, remove_dates=True))
            df['skills'] = df.apply(lambda row: self.extract_skills(row[text_col], row[category_col]), axis=1)
            docs = list(nlp.pipe(df[text_col]))
            df['education'] = [self.extract_education(doc.text) for doc in docs]
            df['experience'] = [self.extract_experience(doc.text) for doc in docs]
            self.check_class_balance(df, category_col)
            return df
        except Exception as e:
            logging.error(f"Dataframe processing failed: {str(e)}")
            return df

    def check_class_balance(self, df, category_col):
        """클래스 불균형 확인"""
        counts = Counter(df[category_col])
        logging.info("\n카테고리별 샘플 수:")
        for category, count in counts.items():
            logging.info(f"{category}: {count}")
        if max(counts.values()) / min(counts.values()) > 5:
            logging.warning("클래스 불균형 감지. SMOTE 또는 클래스 가중치 적용 고려.")
            # 예: from imblearn.over_sampling import SMOTE
            # smote = SMOTE(random_state=42)
            # X_resampled, y_resampled = smote.fit_resample(X, y)


In [6]:
def main():
    try:
        file_path = './src/UpdatedResumeDataSet.csv'
        df = pd.read_csv(file_path)
        logging.info(f"데이터 크기: {df.shape}")
        logging.info("\n데이터 정보:")
        logging.info(df.info())
        logging.info("\n카테고리 목록:")
        logging.info(df['Category'].unique())

        preprocessor = Preprocessing()
        df_processed = preprocessor.process_dataframe(df)

        sample_idx = 5
        logging.info(f"\n샘플 인덱스: {sample_idx}")
        logging.info(f"원문 Resume 요약: {df_processed['Resume'][sample_idx][:300]} ...")
        logging.info(f"정제된 Resume: {df_processed['cleaned_resume'][sample_idx][:300]} ...")
        logging.info(f"기술 스택: {df_processed['skills'][sample_idx]}")
        logging.info(f"학력 정보: {df_processed['education'][sample_idx]}")
        logging.info(f"경력 정보: {df_processed['experience'][sample_idx]}")

        output_path = './src/ProcessedResumeDataSet.json'
        df_processed.to_json(output_path, orient='records', lines=True)
        logging.info(f"전처리된 데이터 저장 완료: {output_path}")
    except Exception as e:
        logging.error(f"Main function failed: {str(e)}")


In [7]:
file_path = './src/UpdatedResumeDataSet.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Category,Resume,cleaned_resume,skills,education,experience
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,"['machine learning', 'numpy', 'deep learning',...",[],['Education Details \r\n\r\nData Science Assur...
1,Data Science,Education Details \nMay 2013 to May 2017 B.E ...,education detail may 2013 may 2017 b.e uitrgpv...,"['machine learning', 'python']",[],['Education Details \r\nMay 2013 to May 2017 B...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,"['machine learning', 'deep learning', 'python']",['B.Tech'],"['Areas of Interest Deep Learning, Control Sys..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill python sap hana tableau sap hana sql sap...,"['machine learning', 'deep learning', 'python']",['Bachelor'],['Skills â\x80¢ R â\x80¢ Python â\x80¢ SAP HAN...
4,Data Science,"Education Details \n MCA YMCAUST, Faridabad...",education detail mca ymcaust faridabad haryana...,['python'],['MCA'],[]


In [8]:
if __name__ == "__main__":
    main()

2025-05-09 09:16:40,363 - INFO - 데이터 크기: (962, 2)
2025-05-09 09:16:40,364 - INFO - 
데이터 정보:
2025-05-09 09:16:40,374 - INFO - None
2025-05-09 09:16:40,375 - INFO - 
카테고리 목록:
2025-05-09 09:16:40,375 - INFO - ['Data Science' 'HR' 'Advocate' 'Arts' 'Web Designing'
 'Mechanical Engineer' 'Sales' 'Health and fitness' 'Civil Engineer'
 'Java Developer' 'Business Analyst' 'SAP Developer' 'Automation Testing'
 'Electrical Engineering' 'Operations Manager' 'Python Developer'
 'DevOps Engineer' 'Network Security Engineer' 'PMO' 'Database' 'Hadoop'
 'ETL Developer' 'DotNet Developer' 'Blockchain' 'Testing']


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  962 non-null    object
 1   Resume    962 non-null    object
dtypes: object(2)
memory usage: 15.2+ KB


2025-05-09 09:16:42,757 - INFO - Extracted skills: ['java', 'deep learning', 'pandas', 'python', 'angular', 'javascript', 'sql', 'git', 'numpy', 'machine learning', 'mysql', 'docker', 'css', 'node.js', 'flask', 'html']
2025-05-09 09:16:42,759 - INFO - Extracted skills: ['python', 'machine learning', 'aws']
2025-05-09 09:16:42,760 - INFO - Extracted skills: ['matlab', 'java', 'deep learning', 'python', 'sql', 'machine learning', 'linux', 'mysql', 'flask']
2025-05-09 09:16:42,762 - INFO - Extracted skills: ['deep learning', 'c', 'python', 'sql', 'machine learning']
2025-05-09 09:16:42,763 - INFO - Extracted skills: ['java', 'c', 'python']
2025-05-09 09:16:42,764 - INFO - Extracted skills: ['matlab', 'c', 'python', 'machine learning', 'html']
2025-05-09 09:16:42,765 - INFO - Extracted skills: ['python', 'machine learning']
2025-05-09 09:16:42,768 - INFO - Extracted skills: ['java', 'deep learning', 'pandas', 'c', 'python', 'sql', 'numpy', 'tensorflow', 'machine learning', 'linux', 'c++', 