# Connect to Google Drive

In [None]:
# Mount Google Drive at /content/data so the files on the drive can be read
# through the local file system.
from google.colab import drive
drive.mount("/content/data")

In [None]:
def recall5(answer_df, submission_df):
    """
    Compute the mean recall@5 of a submission against the ground truth.

    The first column of ``answer_df`` is used as the grouping key and the
    second column as the predicted item; ``submission_df`` is accessed
    through those same two column names.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Recall@5 averaged over the keys found in both frames

    Raises:
    - ValueError: if some key does not have exactly 5 predictions, if a
      predicted value is null, or if a key carries duplicated predictions
    """
    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    def _has_repeated_items(group):
        # True when the same item appears more than once within one key.
        return group[secondary_col].duplicated().any()

    def _leading_items(group):
        # The first five predicted items of one key, in row order.
        return group[secondary_col].head(5).tolist()

    def _all_items(group):
        # Every ground-truth item of one key.
        return group[secondary_col].tolist()

    # Every key in the submission must come with exactly five predictions.
    rows_per_key = submission_df.groupby(primary_col).size()
    if (rows_per_key != 5).any():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # Predicted values must not contain nulls.
    if submission_df[secondary_col].isna().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # A key must not predict the same item twice.
    repeated_by_key = submission_df.groupby(primary_col).apply(_has_repeated_items)
    if repeated_by_key.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # Keep only the submission rows whose key also appears in the answers.
    in_answers = submission_df[primary_col].isin(answer_df[primary_col])
    matched_submission = submission_df[in_answers]

    # key -> list of predicted items / list of true items
    predicted_by_key = matched_submission.groupby(primary_col).apply(_leading_items).to_dict()
    truth_by_key = answer_df.groupby(primary_col).apply(_all_items).to_dict()

    # Per-key recall; the denominator is min(number of true items, 5) so that
    # keys with fewer than five true items can still reach a recall of 1.
    # Keys that have no prediction at all are left out of the average.
    per_key_recall = [
        len(set(truth) & set(predicted_by_key[key])) / min(len(truth), 5)
        for key, truth in truth_by_key.items()
        if key in predicted_by_key
    ]

    return np.mean(per_key_recall)

In [None]:
# 모듈 Import
import random
import numpy as np # 행렬 계산에 사용하는 모듈
import pandas as pd # 데이터 처리와 분석을 위한 모듈
import matplotlib.pyplot as plt # 데이터 시각화를 위한 모듈. 2D, 3D 그릴 때 사용
import seaborn as sns # 데이터 시각화를 위한 모듈. 두 데이터의 관계를 볼때 사용
from tqdm.auto import tqdm
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# EasyDict lets dictionary entries be read and written with dot notation, so
# it can be used like a small settings object; handy when dealing with JSON.
import easydict
args = easydict.EasyDict()

# path
args.default_path = "/content/data/MyDrive/Playdata/Competitions/ML/Dacon/"  # root data directory
args.apply_train_path = f"{args.default_path}apply_train.csv"  # training data
args.company_path = f"{args.default_path}company.csv"
args.recruitment_path = f"{args.default_path}recruitment.csv"
args.resume_certificate_path = f"{args.default_path}resume_certificate.csv"
args.resume_education_path = f"{args.default_path}resume_education.csv"
args.resume_language_path = f"{args.default_path}resume_language.csv"
args.resume_path = f"{args.default_path}resume.csv"

args.default_submission_path = f"{args.default_path}sample_submission.csv"  # sample submission file

# Settings for the analysis.
# A fixed seed keeps random steps (data splitting, model initialisation)
# reproducible from run to run.
args.random_state = 42
args.results = list()  # collected results

In [None]:
# Output locations for this model's run.
args.submission_path = f"{args.default_path}result/submission_Model_2.csv"  # submission CSV
args.save_results = f"{args.default_path}result/model_results_Model_2.json"  # results JSON

# File Load

In [None]:
# Read every CSV into a `*_df` frame that is kept untouched, and work on an
# independent copy of it under the shorter name.
apply_train_df = pd.read_csv(args.apply_train_path)
apply_train = apply_train_df.copy()

company_df = pd.read_csv(args.company_path)
company = company_df.copy()

recruitment_df = pd.read_csv(args.recruitment_path)
recruitment = recruitment_df.copy()

resume_certificate_df = pd.read_csv(args.resume_certificate_path)
resume_certificate = resume_certificate_df.copy()

resume_education_df = pd.read_csv(args.resume_education_path)
resume_education = resume_education_df.copy()

resume_language_df = pd.read_csv(args.resume_language_path)
resume_language = resume_language_df.copy()

resume_df = pd.read_csv(args.resume_path)
resume = resume_df.copy()

# EDA

## recruitment_seq 기준 합치기

In [None]:
# Order both recruitment-side tables by their shared key.
recruitment = recruitment.sort_values("recruitment_seq")
company = company.sort_values("recruitment_seq")

In [None]:
# Attach the company columns to every posting (left join keeps all postings)
# and index the result by recruitment_seq.
merged_recruitment = recruitment.merge(company, how='left', on='recruitment_seq')
merged_recruitment = merged_recruitment.set_index('recruitment_seq')
merged_recruitment

## resume_seq 기준 합치기

In [None]:
resume = resume.sort_values(by='resume_seq')

# Certificates: drop rows without a certificate name, then collapse to one
# row per resume by joining the names with ';'.
resume_certificate = resume_certificate.sort_values(by='resume_seq')
resume_certificate = resume_certificate.dropna(subset=['certificate_contents'])
resume_certificate = resume_certificate.groupby('resume_seq')['certificate_contents'].apply(';'.join).reset_index()

resume_education = resume_education.sort_values(by='resume_seq')

# Languages: fold language / exam name / score into one
# "language;exam_name;score" string per row, then join a resume's rows
# with '&'.
resume_language = resume_language.sort_values(by='resume_seq')
resume_language['lang_exam_score'] = (
    resume_language['language'].astype(str)
    + ';' + resume_language['exam_name'].astype(str)
    + ';' + resume_language['score'].astype(str)
)
# Each source column is listed once ('score' used to be listed twice).
resume_language = resume_language.drop(columns=['language', 'exam_name', 'score'])
# NOTE(review): astype(str) above turns missing values into the text "nan",
# so lang_exam_score never holds a real NaN and this dropna removes nothing.
# If rows with missing parts should be discarded, drop them before building
# the string - confirm the intent.
resume_language = resume_language.dropna(subset=['lang_exam_score'])
resume_language = resume_language.groupby('resume_seq')['lang_exam_score'].apply('&'.join).reset_index()

In [None]:
# Left-join the certificate, education and language tables onto the resume
# table one after another, then index the result by resume_seq.
merged_resume = (
    resume
    .merge(resume_certificate, how='left', on='resume_seq')
    .merge(resume_education, how='left', on='resume_seq')
    .merge(resume_language, how='left', on='resume_seq')
    .set_index('resume_seq')
)
merged_resume

In [None]:
# Show the (rows, columns) shape of the application log.
apply_train.shape

## merged_recruitment 내용 확인

In [None]:
# Print column names, dtypes and non-null counts of the merged posting table.
merged_recruitment.info()

In [None]:
# Show summary statistics of the merged posting table.
merged_recruitment.describe()

- address : 범주
- career : 다 0?
- education : 범주
- major_task : 범주
- qualifications : 범주
- company_type_seq : 범주
- supply_kind : 수치
- employee : 수치

In [None]:
# Fraction of missing values per column, sorted in ascending order.
(merged_recruitment.isnull().sum() / len(merged_recruitment)).sort_values()

- address_seq1 : mode
- address_seq2 ~ address_seq3 : drop
- text_keyword : drop?
- company_type_seq, supply_kind, employee : mode?
- career_end/start : drop?

### merged_recruitment 결측치 확인 및 처리

In [None]:
# Remove the columns that were marked for dropping in the notes above.
drop_col = [
    "address_seq2",
    "address_seq3",
    "text_keyword",
    "career_end",
    "career_start",
]
merged_recruitment = merged_recruitment.drop(columns=drop_col)

In [None]:
# Count how often each address_seq1 value occurs and display the counts.
addr_cnt = merged_recruitment["address_seq1"].value_counts()
addr_cnt

In [None]:
# Count how often each company_type_seq value occurs and display the counts.
com_type_cnt = merged_recruitment["company_type_seq"].value_counts()
com_type_cnt

In [None]:
# Count how often each supply_kind value occurs and display the counts.
supply_kind_cnt = merged_recruitment["supply_kind"].value_counts()
supply_kind_cnt

In [None]:
# Relative frequency (normalize=True) of each employee value.
employee_cnt = merged_recruitment["employee"].value_counts(normalize=True)
employee_cnt

In [None]:
# Values used to fill the gaps: the most frequent value for the first three
# columns (first entry of mode()), the median for `employee`.
employee_median = merged_recruitment["employee"].median()
addr_mode = merged_recruitment["address_seq1"].mode().iloc[0]
company_type_mode = merged_recruitment["company_type_seq"].mode().iloc[0]
supply_kind_mode = merged_recruitment["supply_kind"].mode().iloc[0]

In [None]:
# Fill the remaining gaps with the values computed above (mode for
# address_seq1 / company_type_seq / supply_kind, median for employee).
#
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a selected column: that form is chained
# assignment, which pandas warns about and which no longer updates the
# parent frame once Copy-on-Write is active.
merged_recruitment = merged_recruitment.fillna({
    "address_seq1": addr_mode,
    "company_type_seq": company_type_mode,
    "supply_kind": supply_kind_mode,
    "employee": employee_median,
})

In [None]:
# Re-check the fraction of missing values per column after the fill step.
(merged_recruitment.isnull().sum() / len(merged_recruitment)).sort_values()

## merged_resume 내용 확인

In [None]:
# Print column names, dtypes and non-null counts of the merged resume table.
merged_resume.info()

In [None]:
# Show summary statistics of the merged resume table.
merged_resume.describe()

In [None]:
# List the column names of the merged resume table.
merged_resume.columns

- degree : 범주
- updated_date : 날짜
- hope_salary : 수치
- last_salary : 수치
- career_month : 수치
- hischool_type_seq : 범주?
- hischool_location_seq : 범주
- univ_type_seq1 : 범주
- univ_type_seq2 : 범주
- univ_transfer : ?
- univ_location : 범주
- univ_major_type : 범주
- univ_score : 수치

In [None]:
# Fraction of missing values per column, sorted in ascending order.
(merged_resume.isnull().sum() / len(merged_resume)).sort_values()

- job_code_seq3, univ_sub_major, job_code_seq2, lang_exam_score : drop
- text_keyword, career_job_code, certificate_contents, univ_major : 확률 확인

### merged_resume 결측치 확인 및 처리

In [None]:
# Remove the resume columns that were marked for dropping in the notes above.
drop_col = [
    "job_code_seq3",
    "univ_sub_major",
    "job_code_seq2",
    "lang_exam_score",
]
merged_resume = merged_resume.drop(columns=drop_col)

In [None]:
# Re-check the fraction of missing values per column after the column drop.
(merged_resume.isnull().sum() / len(merged_resume)).sort_values()

In [None]:
# Inspect the text_keyword column.
merged_resume["text_keyword"]

In [None]:
# Inspect the career_job_code column.
merged_resume["career_job_code"]

In [None]:
# Inspect the certificate_contents column (the ';'-joined certificate names).
merged_resume["certificate_contents"]

In [None]:
# Inspect the univ_major column.
merged_resume["univ_major"]

# 학습, 검증 데이터 분리

In [None]:
# Train / validation split: for every resume, the last listed posting is
# held out for validation and all earlier ones go to training.
apply_train_groupby = apply_train_df.groupby('resume_seq')['recruitment_seq'].apply(list)
train, val = [], []
for resume_id, applied in apply_train_groupby.items():
    *history, held_out = applied
    train.extend([resume_id, job_id] for job_id in history)
    val.append([resume_id, held_out])

In [None]:
# The complete application log is the basis for the final predictions.
pred = apply_train_df.copy()
# Turn the collected [resume_seq, recruitment_seq] pairs into frames.
val = pd.DataFrame(data=val, columns=['resume_seq', 'recruitment_seq'])
train = pd.DataFrame(data=train, columns=['resume_seq', 'recruitment_seq'])

In [None]:
# Build resume x posting matrices: rows are resume_seq, columns are
# recruitment_seq, each cell is the number of rows for that pair (0 when the
# pair does not occur). One matrix for the training split, one for the full
# application log.
train_user_item_matrix = train.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
pred_user_item_matrix = pred.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)

In [None]:
# Cosine similarity between rows of the matrix (resume vs. resume) and
# between rows of its transpose (posting vs. posting), for the training
# split ...
train_user_similarity = cosine_similarity(train_user_item_matrix)
train_item_similarity = cosine_similarity(train_user_item_matrix.T)

# ... and for the full application log.
pred_user_similarity = cosine_similarity(pred_user_item_matrix)
pred_item_similarity = cosine_similarity(pred_user_item_matrix.T)

In [None]:
# User-based scores: resume similarity times the resume x posting matrix.
# Item-based scores: resume x posting matrix times the posting similarity.
# Downstream, the user-based result is read by row position (`[idx]`) and
# the item-based result by label (`.loc[user]`), so the operand order of
# each `.dot` call matters.
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)

# Same two score tables for the full application log.
pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

In [None]:
# Top-5 recommendations per resume on the training split.
alpha = 0.98  # weight applied to the item-based score before adding the user-based score
train_recommendations = []
# tqdm wraps the index itself (not the enumerate object) so the progress bar
# knows the total number of resumes.
for idx, user in enumerate(tqdm(train_user_item_matrix.index)):
    user_row = train_user_item_matrix.loc[user]
    # Cells hold pair counts, so "already applied" means count > 0; testing
    # for == 1 would miss a pair that occurs more than once and let an
    # already-applied posting be recommended.
    applied_jobs = set(user_row[user_row > 0].index)

    # Blend both score vectors and rank postings from highest to lowest score.
    blended_scores = train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx]
    sorted_job_indices = blended_scores.argsort()[::-1]
    ranked_jobs = train_user_item_matrix.columns[sorted_job_indices]

    # Keep the five best postings the resume has not applied to yet.
    recommended_jobs = [job for job in ranked_jobs if job not in applied_jobs][:5]
    train_recommendations.extend([user, job] for job in recommended_jobs)

In [None]:
# Put the recommended (resume_seq, recruitment_seq) pairs into a frame with
# the same column names as `val`.
val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

In [None]:
# Recall@5 of the training-split recommendations against the held-out
# validation pairs.
recall5(val,val_prediction)

# Submission

In [None]:
# `pred_recommendations` is not defined anywhere in the visible notebook, so
# this cell used to fail with a NameError. Build it here from the full-data
# (pred_*) matrices, following the same procedure as the validation loop:
# blend item-based and user-based scores with `alpha`, rank the postings, and
# keep the five best ones the resume has not applied to yet.
pred_recommendations = []
for idx, user in enumerate(tqdm(pred_user_item_matrix.index)):
    user_row = pred_user_item_matrix.loc[user]
    # Cells hold pair counts, so any positive count means "already applied".
    applied_jobs = set(user_row[user_row > 0].index)

    blended_scores = pred_item_predicted_scores.loc[user].values * alpha + pred_user_predicted_scores[idx]
    ranked_jobs = pred_user_item_matrix.columns[blended_scores.argsort()[::-1]]

    recommended_jobs = [job for job in ranked_jobs if job not in applied_jobs][:5]
    pred_recommendations.extend([user, job] for job in recommended_jobs)

# Write the recommendations as the submission file (without the index column).
top_recommendations = pd.DataFrame(pred_recommendations, columns=['resume_seq', 'recruitment_seq'])
top_recommendations.to_csv(args.submission_path, index=False)