# Connect to google account

In [149]:
# 구글 드라이브 연결
from google.colab import drive
drive.mount("/content/data")

Drive already mounted at /content/data; to attempt to forcibly remount, call drive.mount("/content/data", force_remount=True).


In [150]:
!pip install category_encoders



In [151]:
# 모듈 Import
import random
import numpy as np # 행렬 계산에 사용하는 모듈
import pandas as pd # 데이터 처리와 분석을 위한 모듈
import matplotlib.pyplot as plt # 데이터 시각화를 위한 모듈. 2D, 3D 그릴 때 사용
import seaborn as sns # 데이터 시각화를 위한 모듈. 두 데이터의 관계를 볼때 사용

import category_encoders as ce

from datetime import datetime, timedelta
from tqdm.auto import tqdm

import warnings
# FutureWarning 경고를 무시하도록 설정
warnings.simplefilter(action='ignore', category=FutureWarning)

In [152]:
# 딕셔너리를 간편하게 사용하는 것을 도와준다. 모델 class 처럼 사용 가능
# dot을 이용해 객체를 불러 사용. JSON 다룰때 유용.
import easydict
args = easydict.EasyDict()

# path
args.default_path = "/content/data/MyDrive/플레이데이터 데이터엔지니어링 28기/Dacon/KMU/data/" # 메인 경로
args.apply_train_path = args.default_path + "apply_train.csv" # train 데이터 경로
args.company_path = args.default_path + "company.csv"
args.recruitment_path = args.default_path + "recruitment.csv"
args.resume_certificate_path = args.default_path + "resume_certificate.csv"
args.resume_education_path = args.default_path + "resume_education.csv"
args.resume_language_path = args.default_path + "resume_language.csv"
args.resume_path = args.default_path + "resume.csv"

args.default_submission_path = args.default_path + "sample_submission.csv" # 예측결과(제출파일) 경로

# 데이터 분석을 위한 변수들
# # 난수 생성 제어 => 같은 코드를 실행해도 동일한 결과를 얻기 위해서 설정
# 데이터 분할 및 모델 초기화 때 유용.
args.random_state = 42
args.results = [] # 결과 저장 리스트

In [153]:
args.submission_path = args.default_path + "result/submission_20231108.csv" # 결과 저장 파일
args.save_results = args.default_path+"result/model_results_20231108.json" # 결과 저장 json

In [154]:
def recall5(answer_df, submission_df):

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # submission의 예측이 각각 5개인지 확인
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # submission의 예측된 값들에 null값이 있는지 확인
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # 예측값에 중복이 있는지 확인
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    # primary_col 즉 resume_seq가 양측에 있는지 확인 후 남김
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    #
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head(5).tolist()).to_dict()

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()


    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함
            individual_recalls.append(individual_recall)


    #recall = np.mean(individual_recalls)
    recall = max(individual_recalls)
    return recall

# File Load

In [155]:
apply_train_df = pd.read_csv(args.apply_train_path) # apply_train -> DataFrame화
company_df = pd.read_csv(args.company_path) # company -> DataFrame화
recruitment_df = pd.read_csv(args.recruitment_path) # recruitment -> DataFrame화
resume_certificate_df = pd.read_csv(args.resume_certificate_path) # resume_certificate -> DataFreame화
resume_education_df = pd.read_csv(args.resume_education_path) # resume_education -> DataFrame화
resume_language_df = pd.read_csv(args.resume_language_path) # resume_language -> DataFrame화
resume_df = pd.read_csv(args.resume_path) # resume -> DataFrame화
submission_df = pd.read_csv(args.default_submission_path)

In [156]:
apply_train = apply_train_df.copy()
company = company_df.copy()
recruitment = recruitment_df.copy()
resume_certificate = resume_certificate_df.copy()
resume_education = resume_education_df.copy()
resume_language = resume_language_df.copy()
resume = resume_df.copy()


### merged_recruitment 생성

In [157]:
# recruitment, company 정렬(recruitment_seq 기준)
company = company.sort_values(by = "recruitment_seq")
recruitment = recruitment.sort_values(by = "recruitment_seq")

In [158]:
# merged_recruitment = recruitment, company merge
merged_recruitment = pd.merge(recruitment, company, on='recruitment_seq', how='left')
merged_recruitment

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,text_keyword,company_type_seq,supply_kind,employee
0,R00001,5.0,,,0,0,2101;2108;2201;2204;2205;2707;2810,2,2,1,,5.0,201.0,631.0
1,R00002,3.0,,,0,0,2507;2703;2707,3,2,1,,2.0,201.0,160.0
2,R00003,3.0,,,0,0,2101;2108;2201;2707,3,2,2,,,,
3,R00004,3.0,,,0,0,2507;2707,3,2,1,,2.0,402.0,500.0
4,R00005,3.0,,,0,0,2507;2707,3,2,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,R06691,3.0,,,0,0,2501;2507;2707,3,2,1,,,,
6691,R06692,3.0,,,0,0,2201;2507,3,2,2,,4.0,402.0,150.0
6692,R06693,5.0,,,0,0,2102;2707,4,2,1,,,,
6693,R06694,3.0,,,0,0,2101;2108;2109;2110;2201;2203;2707,4,2,1,,,,


### merged_resume 생성

In [159]:
# resume 정렬 (resume_seq 기준)
resume = resume.sort_values(by='resume_seq')

In [160]:
# resume_certificate 정렬 (resume_seq 기준)
resume_certificate = resume_certificate.sort_values(by='resume_seq')
# certificate_contents에 결측값이 존재하는 행 제거
resume_certificate = resume_certificate.dropna(subset=['certificate_contents'])
# 이력서번호(resume_seq)기준으로 groupby
# 이력서번호(resume_seq)의 해당하는 자격증들을 세미콜론(;)으로 join해서 적용(apply)
resume_certificate = resume_certificate.groupby('resume_seq')['certificate_contents'].apply(';'.join).reset_index()

In [161]:
# resume_education 정렬 (resume_seq 기준)
resume_education = resume_education.sort_values(by='resume_seq')

In [162]:
# resume_language 정렬 (resume_seq 기준)
resume_language = resume_language.sort_values(by='resume_seq')
resume_language = resume_language.drop_duplicates(subset='resume_seq')

In [163]:
# merged_resume = resume, resume_certificate, resume_education, resume_language
merged_resume = pd.merge(resume, resume_certificate, on='resume_seq', how='left')
merged_resume = pd.merge(merged_resume, resume_education, on='resume_seq', how='left')
merged_resume = pd.merge(merged_resume, resume_language, on='resume_seq', how='left')
merged_resume

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword,job_code_seq1,job_code_seq2,...,univ_type_seq2,univ_transfer,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score,language,exam_name,score
0,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,5,0,17,,,9,20.0,,,
1,U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,...,5,0,3,,,8,90.0,,,
2,U00003,2018-02-14,2020-07-08,4,2004,0.0,0.0,남성복디자이너;TD캐주얼,재료·화학·섬유·의복,,...,5,0,17,,,4,90.0,,,
3,U00004,2017-10-26,2020-04-27,4,2011,0.0,0.0,상품기획;영업기획,재료·화학·섬유·의복,,...,5,0,5,,,3,70.0,2.0,11.0,661.62
4,U00005,2020-03-18,2020-04-08,4,2011,0.0,0.0,인사;총무;경영;MD;상품기획,재료·화학·섬유·의복,,...,0,0,0,,,10,50.0,2.0,4.0,873.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U08478,2020-02-05,2020-03-31,4,2003,0.0,2700.0,상품기획,재료·화학·섬유·의복,,...,5,0,5,,,19,0.0,,,
8478,U08479,2016-02-17,2020-07-15,3,0,2500.0,2300.0,디자이너,재료·화학·섬유·의복,,...,6,0,3,가정과,,9,0.0,,,
8479,U08480,2019-11-13,2020-06-24,4,0,0.0,0.0,VMD;텍스타일디자인;섬유디자인;니트디자인;여성복디자인,디자인,,...,5,0,10,미술대학섬유패션코디네이션학과,,9,80.0,,,
8480,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,...,5,0,5,,,9,60.0,,,


### merged_total 생성

In [164]:
_merged_total = pd.merge(apply_train, merged_recruitment, left_on='recruitment_seq', right_on='recruitment_seq', how='left')
_merged_total = pd.merge(_merged_total, merged_resume, left_on='resume_seq', right_on='resume_seq', how='left')
merged_total = _merged_total
merged_total

Unnamed: 0,resume_seq,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,...,univ_type_seq2,univ_transfer,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score,language,exam_name,score
0,U05833,R03838,3.0,,,0,0,2110;2203;2204;2299,3,2,...,5,0,3,,,3,60.0,,,
1,U06456,R02144,3.0,,,0,0,2204;2507;2707,3,2,...,5,0,10,국어국문학과,,8,70.0,,,
2,U07807,R01877,3.0,,,0,0,2507;2707,3,2,...,5,0,15,,,4,80.0,,,
3,U04842,R02463,3.0,,,0,0,2507;2707,4,2,...,5,0,3,,,9,80.0,,,
4,U08336,R00112,3.0,,,0,0,2507;2707,3,8,...,5,0,17,,,9,70.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57941,U02270,R03430,3.0,,,0,0,2507;2707,3,2,...,10,0,5,패션디자인과종합,,9,0.0,,,
57942,U02640,R04987,3.0,1.0,,0,0,2707;2507,3,2,...,5,0,3,조소학과,,9,70.0,,,
57943,U08238,R01342,3.0,,,0,0,2201;2204;2205;2707,2,2,...,5,0,12,,,3,70.0,,,
57944,U01296,R06363,3.0,,,0,0,2201;2204;2205;2707,3,2,...,5,0,18,의상디자인학,,9,80.0,,,


In [165]:
merged_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57946 entries, 0 to 57945
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   resume_seq             57946 non-null  object 
 1   recruitment_seq        57946 non-null  object 
 2   address_seq1           57940 non-null  float64
 3   address_seq2           877 non-null    float64
 4   address_seq3           73 non-null     float64
 5   career_end             57946 non-null  int64  
 6   career_start           57946 non-null  int64  
 7   check_box_keyword      57946 non-null  object 
 8   education              57946 non-null  int64  
 9   major_task             57946 non-null  int64  
 10  qualifications         57946 non-null  int64  
 11  text_keyword_x         5609 non-null   object 
 12  company_type_seq       20036 non-null  float64
 13  supply_kind            20036 non-null  float64
 14  employee               20036 non-null  float64
 15  re

# New_apply_train 조건 탐색

## 조건 1. Education / Degree

In [166]:
merged_total.shape

(57946, 44)

In [167]:
merged_total.groupby('education')['degree'].value_counts(ascending=False)

education  degree
2          4         12633
           3          1021
           5           525
           2           516
           6            17
3          4         20912
           3          1602
           5           844
           2           727
           6            19
4          4         16914
           3           913
           5           832
           2           445
           6            15
6          4             9
           3             1
           5             1
Name: degree, dtype: int64

In [168]:
merged_total.groupby('degree')['education'].value_counts(ascending=False)

degree  education
2       3              727
        2              516
        4              445
3       3             1602
        2             1021
        4              913
        6                1
4       3            20912
        4            16914
        2            12633
        6                9
5       3              844
        4              832
        2              525
        6                1
6       3               19
        2               17
        4               15
Name: education, dtype: int64

In [169]:
#요구학위보다 최종학위가 높은 경우
merged_total.loc[merged_total['education'] > merged_total['degree']].shape

(2096, 44)

In [170]:
#학위를 두 수준이나 낮춰서 하향지원한 경우
##요구학위 > 최종학위가 높은 경우를 먼저 drop 하기 때문에 요구학위 < 최종학위인 경우만 생각하면 됨
###degree = 6인 지원자는 반드시 하향지원을 할 수 밖에없어서 예외처리
merged_total.loc[
    (merged_total['degree'] != 6)
    &
    (merged_total['degree'] - merged_total['education'] == 2)
    ].shape

(13477, 44)

In [171]:
def drop_degree(df):
    # 'education' 컬럼 값이 'degree' 컬럼 값보다 큰 조건을 만족하는 행 삭제
    df.drop(df[df['education'] > df['degree']].index, inplace=True)
    #df.drop(df[(merged_total['degree'] != 6)
                #&
                #(merged_total['degree'] - merged_total['education'] == 2)
                #].index, inplace=True)
    return df

## 조건 2. text_kwyword 경력 신입 조건

In [172]:
merged_total['text_keyword_x'].value_counts()

기획MD                             225
영업MD                             165
영업관리                             129
영업                               114
VMD                               75
                                ... 
MD;브랜드;캐쥬얼                         3
디자인실;여성복                           3
캐주얼디자이너;디자이너;캐쥬얼디자이너;유니섹스디자이너      2
피팅;디자이너;인턴;여성복                     2
커리어;로드샵;여성복                        1
Name: text_keyword_x, Length: 523, dtype: int64

In [173]:
def calculate_career_level(keyword):
    if pd.isna(keyword):
        return 0
    if '팀장' in keyword:
        return 3
    if '경력' in keyword:
        return 2
    if '인턴' in keyword or '신입' in keyword:
        return 1
    return 0

# 'career_level' 컬럼을 생성
merged_total['career_level'] = merged_total['text_keyword_x'].apply(calculate_career_level)

In [174]:
merged_total['career_level'].value_counts()

0    57522
1      232
2      154
3       38
Name: career_level, dtype: int64

In [175]:
career_or_new = merged_total[['resume_seq','recruitment_seq','career_level','career_month','last_salary']]
career_or_new

Unnamed: 0,resume_seq,recruitment_seq,career_level,career_month,last_salary
0,U05833,R03838,0,175,4300.0
1,U06456,R02144,0,86,3100.0
2,U07807,R01877,0,108,4300.0
3,U04842,R02463,0,4,0.0
4,U08336,R00112,0,0,1500.0
...,...,...,...,...,...
57941,U02270,R03430,0,0,0.0
57942,U02640,R04987,0,27,1900.0
57943,U08238,R01342,0,37,3500.0
57944,U01296,R06363,0,252,7500.0


In [176]:
career_or_new_2 = career_or_new[career_or_new['career_level'] ==1]

In [177]:
career_or_new_2

Unnamed: 0,resume_seq,recruitment_seq,career_level,career_month,last_salary
93,U00528,R03679,1,0,0.0
519,U04821,R01970,1,16,0.0
550,U06029,R05632,1,60,3100.0
580,U01676,R03495,1,0,0.0
639,U03518,R02829,1,61,2700.0
...,...,...,...,...,...
56244,U07956,R05632,1,62,3500.0
56756,U05673,R06126,1,0,0.0
56809,U05674,R03791,1,65,10000.0
57038,U03316,R02613,1,7,1900.0


In [178]:
# 경력

In [179]:
career_or_new_3 = career_or_new[career_or_new['career_level'] ==2]
career_or_new_3

Unnamed: 0,resume_seq,recruitment_seq,career_level,career_month,last_salary
495,U06159,R06619,2,28,0.0
526,U01152,R06572,2,12,0.0
811,U01044,R06006,2,2,2300.0
1247,U07710,R05403,2,228,4500.0
2547,U05782,R06572,2,0,0.0
...,...,...,...,...,...
55469,U02416,R02666,2,120,4900.0
55707,U02986,R01636,2,95,0.0
56904,U08393,R00924,2,96,4300.0
57629,U02194,R00719,2,0,0.0


In [180]:
career_or_new_3 = career_or_new_3[career_or_new_3['career_month'] == career_or_new_3['last_salary']]
career_or_new_3

Unnamed: 0,resume_seq,recruitment_seq,career_level,career_month,last_salary
2547,U05782,R06572,2,0,0.0
5836,U07604,R04658,2,0,0.0
9292,U01207,R06572,2,0,0.0
12584,U07202,R01636,2,0,0.0
22591,U00593,R06572,2,0,0.0
25792,U03821,R06572,2,0,0.0
33495,U07508,R06572,2,0,0.0
36662,U06161,R01636,2,0,0.0
40710,U01012,R01636,2,0,0.0
41427,U06481,R01993,2,0,0.0


In [181]:
career_or_new_4 = career_or_new[career_or_new['career_level'] ==3]

career_or_new_4

# 팀장직을 뽑는 회사에서 지원한 이력서의 경력은 모두다 경력직

Unnamed: 0,resume_seq,recruitment_seq,career_level,career_month,last_salary
694,U05554,R00096,3,171,4700.0
950,U02043,R00193,3,231,5250.0
3946,U07547,R03667,3,240,6750.0
5936,U06667,R00096,3,216,7500.0
6531,U04374,R00096,3,142,5250.0
8012,U03076,R03667,3,147,0.0
10962,U04682,R00096,3,219,5750.0
13034,U07045,R05483,3,190,6250.0
13748,U02515,R05483,3,225,5750.0
15932,U02143,R03477,3,202,6250.0


In [182]:
career_or_new_4 = career_or_new_4[career_or_new_4['career_month'] == career_or_new_4['last_salary']]
career_or_new_4

Unnamed: 0,resume_seq,recruitment_seq,career_level,career_month,last_salary


In [183]:
career_new_drop = merged_total[((merged_total['career_level'] == 2) & (merged_total['career_month'] == merged_total['last_salary']))]

In [184]:
career_new_drop = career_new_drop[['resume_seq','recruitment_seq','career_level','career_month','last_salary']]

In [185]:
career_new_drop

Unnamed: 0,resume_seq,recruitment_seq,career_level,career_month,last_salary
2547,U05782,R06572,2,0,0.0
5836,U07604,R04658,2,0,0.0
9292,U01207,R06572,2,0,0.0
12584,U07202,R01636,2,0,0.0
22591,U00593,R06572,2,0,0.0
25792,U03821,R06572,2,0,0.0
33495,U07508,R06572,2,0,0.0
36662,U06161,R01636,2,0,0.0
40710,U01012,R01636,2,0,0.0
41427,U06481,R01993,2,0,0.0


## 조건 3 자격요건난이도 및 자격증 개수

In [None]:
quali_cerit = merged_total[['resume_seq','recruitment_seq','qualifications','certificate_contents']]
quali_cerit

# New_apply_train 생성 및 검증

In [186]:
new_merged_total = merged_total.copy()

In [187]:
new_merged_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57946 entries, 0 to 57945
Data columns (total 45 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   resume_seq             57946 non-null  object 
 1   recruitment_seq        57946 non-null  object 
 2   address_seq1           57940 non-null  float64
 3   address_seq2           877 non-null    float64
 4   address_seq3           73 non-null     float64
 5   career_end             57946 non-null  int64  
 6   career_start           57946 non-null  int64  
 7   check_box_keyword      57946 non-null  object 
 8   education              57946 non-null  int64  
 9   major_task             57946 non-null  int64  
 10  qualifications         57946 non-null  int64  
 11  text_keyword_x         5609 non-null   object 
 12  company_type_seq       20036 non-null  float64
 13  supply_kind            20036 non-null  float64
 14  employee               20036 non-null  float64
 15  re

조건 1만 적용해서 new_apply_train을 생성한다

In [188]:
new_merged_total = drop_degree(merged_total)

In [189]:
new_merged_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55850 entries, 0 to 57945
Data columns (total 45 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   resume_seq             55850 non-null  object 
 1   recruitment_seq        55850 non-null  object 
 2   address_seq1           55844 non-null  float64
 3   address_seq2           852 non-null    float64
 4   address_seq3           72 non-null     float64
 5   career_end             55850 non-null  int64  
 6   career_start           55850 non-null  int64  
 7   check_box_keyword      55850 non-null  object 
 8   education              55850 non-null  int64  
 9   major_task             55850 non-null  int64  
 10  qualifications         55850 non-null  int64  
 11  text_keyword_x         5419 non-null   object 
 12  company_type_seq       19237 non-null  float64
 13  supply_kind            19237 non-null  float64
 14  employee               19237 non-null  float64
 15  re

조건 2 적용

In [190]:
new_merged_total = new_merged_total[~((new_merged_total['career_level'] == 2) & (new_merged_total['career_month'] == new_merged_total['last_salary']))]

In [191]:
new_merged_total

Unnamed: 0,resume_seq,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,...,univ_transfer,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score,language,exam_name,score,career_level
0,U05833,R03838,3.0,,,0,0,2110;2203;2204;2299,3,2,...,0,3,,,3,60.0,,,,0
1,U06456,R02144,3.0,,,0,0,2204;2507;2707,3,2,...,0,10,국어국문학과,,8,70.0,,,,0
2,U07807,R01877,3.0,,,0,0,2507;2707,3,2,...,0,15,,,4,80.0,,,,0
3,U04842,R02463,3.0,,,0,0,2507;2707,4,2,...,0,3,,,9,80.0,,,,0
4,U08336,R00112,3.0,,,0,0,2507;2707,3,8,...,0,17,,,9,70.0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57941,U02270,R03430,3.0,,,0,0,2507;2707,3,2,...,0,5,패션디자인과종합,,9,0.0,,,,0
57942,U02640,R04987,3.0,1.0,,0,0,2707;2507,3,2,...,0,3,조소학과,,9,70.0,,,,0
57943,U08238,R01342,3.0,,,0,0,2201;2204;2205;2707,2,2,...,0,12,,,3,70.0,,,,0
57944,U01296,R06363,3.0,,,0,0,2201;2204;2205;2707,3,2,...,0,18,의상디자인학,,9,80.0,,,,0


조건 3 적용

In [192]:
#new_merged_total = new_merged_total.drop(new_merged_total[(new_merged_total['qualifications']==2) & (new_merged_total['certificate_contents']== 0)])

In [193]:
#new_merged_total

In [194]:
new_apply_train = new_merged_total[['resume_seq', 'recruitment_seq']]
new_apply_train

Unnamed: 0,resume_seq,recruitment_seq
0,U05833,R03838
1,U06456,R02144
2,U07807,R01877
3,U04842,R02463
4,U08336,R00112
...,...,...
57941,U02270,R03430
57942,U02640,R04987
57943,U08238,R01342
57944,U01296,R06363


In [195]:
#before
apply_train.shape

(57946, 2)

In [196]:
apply_train['resume_seq'].nunique(), apply_train['recruitment_seq'].nunique()

(8482, 6695)

In [197]:
apply_train['resume_seq'].value_counts()

U06543    77
U07490    69
U06166    65
U06206    63
U06125    63
          ..
U05406     2
U00727     2
U04660     2
U08300     2
U05310     2
Name: resume_seq, Length: 8482, dtype: int64

In [198]:
#after
new_apply_train.shape

(55838, 2)

In [199]:
new_apply_train['resume_seq'].nunique(), new_apply_train['recruitment_seq'].nunique()

(8420, 6694)

In [200]:
#missing_resume

missing_resume = list(set(apply_train['resume_seq']) - set(new_apply_train['resume_seq']))
len(missing_resume)

62

In [201]:
#just_one_resume
unique_resume_seq = new_apply_train['resume_seq'].value_counts()
just_one_resume = unique_resume_seq[unique_resume_seq == 1]
just_one_resume = just_one_resume.index.tolist()

In [202]:
#resume_seq가 전부 다 날라간 애들이 적어도 두개는 new_apply_train에 포함되도록 랜덤하게 추출하여 add
new_rows = pd.DataFrame()

for resume in missing_resume:
    temp = apply_train[apply_train['resume_seq'] == resume]
    random_rows = temp.sample(n=2, random_state=args.random_state)
    new_rows = pd.concat([new_rows, random_rows], ignore_index=True)

# new_apply_train에 새로운 행 추가
new_apply_train = pd.concat([new_apply_train, new_rows], ignore_index=True)

In [203]:
new_apply_train['resume_seq'].nunique(), new_apply_train['recruitment_seq'].nunique()

(8482, 6694)

In [204]:
for resume in tqdm(just_one_resume):
    temp = apply_train[apply_train['resume_seq'] == resume]

    # new_apply_train에서 resume_seq가 resume인 항목들만 필터링
    new_apply_subset = new_apply_train[new_apply_train['resume_seq'] == resume]

    # temp에서 랜덤으로 1개의 행을 추출하고, new_apply_subset에 없는 행을 찾을 때까지 반복
    while True:
        random_row = temp.sample(n=1, random_state=args.random_state)

        # new_apply_subset에 random_row가 존재하지 않는 경우에만 추가하고 반복 중단
        if not random_row.index.isin(new_apply_subset.index).any():
            new_apply_train = pd.concat([new_apply_train, random_row])
            break

  0%|          | 0/97 [00:00<?, ?it/s]

In [205]:
#모든 resume가 다  채워졌나?

new_apply_train['resume_seq'].nunique(), new_apply_train['recruitment_seq'].nunique()

(8482, 6694)

In [206]:
#missing_recruitment

missing_recruitment = list(set(apply_train['recruitment_seq']) - set(new_apply_train['recruitment_seq']))
len(missing_recruitment)

1

In [207]:
#new_apply_train에 최소 한개의 recruitment는 들어가도록 추가(train/test)

for recruitment in missing_recruitment:
    temp = apply_train[apply_train['recruitment_seq'] == recruitment]
    if not temp.empty:  # temp가 비어 있지 않은 경우에만 실행
        random_row = temp.sample(n=1, random_state=args.random_state)
        new_apply_train = pd.concat([new_apply_train, random_row], ignore_index=True)

In [208]:
#모든 recruitment가 다  채워졌나?

new_apply_train['resume_seq'].nunique(), new_apply_train['recruitment_seq'].nunique()

(8482, 6695)

In [209]:
#결과 비교
print(f"before shape : {apply_train.shape}, after shape : {new_apply_train.shape}")

before shape : (57946, 2), after shape : (56060, 2)


# Apply_Matrix 생성

In [210]:
#학습, 검증 분리
train, test = [], []
new_apply_train_groupby = new_apply_train.groupby('resume_seq')['recruitment_seq'].apply(list)
for uid, iids in zip(new_apply_train_groupby.index.tolist(), new_apply_train_groupby.values.tolist()):
    for iid in iids[:-1]:
        train.append([uid,iid])
    test.append([uid, iids[-1]])

In [211]:
train = pd.DataFrame(train, columns=['resume_seq', 'recruitment_seq'])
test = pd.DataFrame(test, columns=['resume_seq', 'recruitment_seq'])

In [212]:
train.shape, test.shape

((47578, 2), (8482, 2))

In [213]:
pred = apply_train.copy()

In [214]:
from sklearn.metrics.pairwise import cosine_similarity

In [215]:
train_user_item_matrix = train.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
pred_user_item_matrix  = pred.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)

In [216]:
train_user_similarity  = cosine_similarity(train_user_item_matrix) # 이력서 간의 유사성 계산
pred_user_similarity  = cosine_similarity(pred_user_item_matrix) # apply 이력서 간의 유사성 계산

In [217]:
train_item_similarity = cosine_similarity(train_user_item_matrix.T) # 공고 간의 유사성 계산
pred_item_similarity = cosine_similarity(pred_user_item_matrix.T) # apply 공고 간의 유사성 계산

In [218]:
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
# 이력서간의 유사도 X 이력서-공고 매트릭스 행렬곱
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)
# 이력서-공고 매트릭스 X 공고간의 유사도 행렬곱

pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

# train/test 결과 도출

In [219]:
alpha = 0.5
beta = 0.5
train_recommendations = []

for idx, user in tqdm(enumerate(train_user_item_matrix.index)):
    applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)

    # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
    sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx] * beta).argsort()[::-1]

    # 이미 지원하지 않은 공고를 선택
    recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

    for job in recommended_jobs:
        train_recommendations.append([user, job])

0it [00:00, ?it/s]

# Recall

In [220]:
def recall5(answer_df, submission_df):

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # submission의 예측이 각각 5개인지 확인
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # submission의 예측된 값들에 null값이 있는지 확인
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # 예측값에 중복이 있는지 확인
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    # primary_col 즉 resume_seq가 양측에 있는지 확인 후 남김
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    #
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head().tolist()).to_dict()

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()


    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함
            individual_recalls.append(individual_recall)


    recall = np.mean(individual_recalls)
    return recall

In [221]:
train_recommendations = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

In [222]:
recall5(test,train_recommendations)

0.12155152086771988

# 최적의 Alpha, Beta 값 탐색

In [223]:
def calculate_recommendations(alpha, train_user_item_matrix, train_item_predicted_scores, train_user_predicted_scores):
    train_recommendations = []

    for idx, user in enumerate(train_user_item_matrix.index):
        applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)
        sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx] * (1 - alpha)).argsort()[::-1]
        recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

        for job in recommended_jobs:
            train_recommendations.append([user, job])

    return pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

In [224]:
# 초기화
n_iter = 0
max_iter = 10
pbar = tqdm(total=max_iter)
best_score = 0
best_alpha = 0.0
best_beta = 0.0

while n_iter < max_iter:
    alpha = round(random.uniform(0, 1), 3)
    train_recommendations = calculate_recommendations(alpha, train_user_item_matrix, train_item_predicted_scores, train_user_predicted_scores)
    score = recall5(test, train_recommendations)

    if score > best_score:
        best_score = score
        best_alpha = alpha
        best_beta = 1 - alpha

    n_iter += 1
    pbar.update(1)

pbar.close()
print(f"Best score: {best_score}, Best alpha: {best_alpha}, Best beta: {best_beta}")


  0%|          | 0/10 [00:00<?, ?it/s]

Best score: 0.12225890120254657, Best alpha: 0.383, Best beta: 0.617


# Submission

In [225]:
top_recommendations = []

for idx, user in tqdm(enumerate(pred_user_item_matrix.index)):
    applied_jobs = set(pred_user_item_matrix.loc[user][pred_user_item_matrix.loc[user] == 1].index)

    # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
    sorted_job_indices = (pred_item_predicted_scores.loc[user].values * best_alpha + pred_user_predicted_scores[idx] * best_beta).argsort()[::-1]

    # 이미 지원하지 않은 공고를 선택
    recommended_jobs = [job for job in pred_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]


    for job in recommended_jobs:
        top_recommendations.append([user, job])

0it [00:00, ?it/s]

In [226]:
pred_recommendations = pd.DataFrame(top_recommendations, columns=['resume_seq', 'recruitment_seq'])

In [227]:
now = datetime.now() + timedelta(hours=9)
formatted_time = now.strftime("%Y%m%d_%H%M")
formatted_time

'20231112_1851'

In [228]:
pred_recommendations.to_csv(args.default_path+ '/result/submission_'+formatted_time+'.csv', index=False)

OSError: ignored