# Connect to google account

In [2]:
# 구글 드라이브 연결
from google.colab import drive
drive.mount("/content/data")

Mounted at /content/data


In [3]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m41.0/81.9 kB[0m [31m955.9 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [4]:
# 모듈 Import
import random
import numpy as np # 행렬 계산에 사용하는 모듈
import pandas as pd # 데이터 처리와 분석을 위한 모듈
import matplotlib.pyplot as plt # 데이터 시각화를 위한 모듈. 2D, 3D 그릴 때 사용
import seaborn as sns # 데이터 시각화를 위한 모듈. 두 데이터의 관계를 볼때 사용

import category_encoders as ce

from datetime import datetime, timedelta
from tqdm.auto import tqdm

import warnings
# FutureWarning 경고를 무시하도록 설정
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# 딕셔너리를 간편하게 사용하는 것을 도와준다. 모델 class 처럼 사용 가능
# dot을 이용해 객체를 불러 사용. JSON 다룰때 유용.
import easydict
args = easydict.EasyDict()

# path
args.default_path = "/content/data/MyDrive/Playdata/Competitions/ML/Dacon/" # 메인 경로
args.apply_train_path = args.default_path + "apply_train.csv" # train 데이터 경로
args.company_path = args.default_path + "company.csv"
args.recruitment_path = args.default_path + "recruitment.csv"
args.resume_certificate_path = args.default_path + "resume_certificate.csv"
args.resume_education_path = args.default_path + "resume_education.csv"
args.resume_language_path = args.default_path + "resume_language.csv"
args.resume_path = args.default_path + "resume.csv"

args.default_submission_path = args.default_path + "sample_submission.csv" # 예측결과(제출파일) 경로

# 데이터 분석을 위한 변수들
# # 난수 생성 제어 => 같은 코드를 실행해도 동일한 결과를 얻기 위해서 설정
# 데이터 분할 및 모델 초기화 때 유용.
args.random_state = 42
args.results = [] # 결과 저장 리스트

In [6]:
args.submission_path = args.default_path + "result/submission_20231112.csv" # 결과 저장 파일
args.save_results = args.default_path+"result/model_results_20231112.json" # 결과 저장 json

In [7]:
def recall5(answer_df, submission_df):

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # submission의 예측이 각각 5개인지 확인
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # submission의 예측된 값들에 null값이 있는지 확인
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # 예측값에 중복이 있는지 확인
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    # primary_col 즉 resume_seq가 양측에 있는지 확인 후 남김
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    #
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head(5).tolist()).to_dict()

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()


    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함
            individual_recalls.append(individual_recall)


    #recall = np.mean(individual_recalls)
    recall = max(individual_recalls)
    return recall

# File Load

In [8]:
apply_train_df = pd.read_csv(args.apply_train_path) # apply_train -> DataFrame화
company_df = pd.read_csv(args.company_path) # company -> DataFrame화
recruitment_df = pd.read_csv(args.recruitment_path) # recruitment -> DataFrame화
resume_certificate_df = pd.read_csv(args.resume_certificate_path) # resume_certificate -> DataFreame화
resume_education_df = pd.read_csv(args.resume_education_path) # resume_education -> DataFrame화
resume_language_df = pd.read_csv(args.resume_language_path) # resume_language -> DataFrame화
resume_df = pd.read_csv(args.resume_path) # resume -> DataFrame화
submission_df = pd.read_csv(args.default_submission_path)

In [9]:
apply_train = apply_train_df.copy()
company = company_df.copy()
recruitment = recruitment_df.copy()
resume_certificate = resume_certificate_df.copy()
resume_education = resume_education_df.copy()
resume_language = resume_language_df.copy()
resume = resume_df.copy()


### merged_recruitment 생성

In [10]:
# recruitment, company 정렬(recruitment_seq 기준)
company = company.sort_values(by = "recruitment_seq")
recruitment = recruitment.sort_values(by = "recruitment_seq")

In [11]:
# merged_recruitment = recruitment, company merge
merged_recruitment = pd.merge(recruitment, company, on='recruitment_seq', how='left')
merged_recruitment

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,text_keyword,company_type_seq,supply_kind,employee
0,R00001,5.0,,,0,0,2101;2108;2201;2204;2205;2707;2810,2,2,1,,5.0,201.0,631.0
1,R00002,3.0,,,0,0,2507;2703;2707,3,2,1,,2.0,201.0,160.0
2,R00003,3.0,,,0,0,2101;2108;2201;2707,3,2,2,,,,
3,R00004,3.0,,,0,0,2507;2707,3,2,1,,2.0,402.0,500.0
4,R00005,3.0,,,0,0,2507;2707,3,2,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,R06691,3.0,,,0,0,2501;2507;2707,3,2,1,,,,
6691,R06692,3.0,,,0,0,2201;2507,3,2,2,,4.0,402.0,150.0
6692,R06693,5.0,,,0,0,2102;2707,4,2,1,,,,
6693,R06694,3.0,,,0,0,2101;2108;2109;2110;2201;2203;2707,4,2,1,,,,


### merged_resume 생성

In [12]:
# resume 정렬 (resume_seq 기준)
resume = resume.sort_values(by='resume_seq')

In [13]:
# resume_certificate 정렬 (resume_seq 기준)
resume_certificate = resume_certificate.sort_values(by='resume_seq')
# certificate_contents에 결측값이 존재하는 행 제거
resume_certificate = resume_certificate.dropna(subset=['certificate_contents'])
# 이력서번호(resume_seq)기준으로 groupby
# 이력서번호(resume_seq)의 해당하는 자격증들을 세미콜론(;)으로 join해서 적용(apply)
resume_certificate = resume_certificate.groupby('resume_seq')['certificate_contents'].apply(';'.join).reset_index()

In [14]:
# resume_education 정렬 (resume_seq 기준)
resume_education = resume_education.sort_values(by='resume_seq')

In [15]:
# resume_language 정렬 (resume_seq 기준)
resume_language = resume_language.sort_values(by='resume_seq')

In [16]:
# merged_resume = resume, resume_certificate, resume_education, resume_language
merged_resume = pd.merge(resume, resume_certificate, on='resume_seq', how='left')
merged_resume = pd.merge(merged_resume, resume_education, on='resume_seq', how='left')
merged_resume = pd.merge(merged_resume, resume_language, on='resume_seq', how='left')
merged_resume

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword,job_code_seq1,job_code_seq2,...,univ_type_seq2,univ_transfer,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score,language,exam_name,score
0,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,5,0,17,,,9,20.0,,,
1,U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,...,5,0,3,,,8,90.0,,,
2,U00003,2018-02-14,2020-07-08,4,2004,0.0,0.0,남성복디자이너;TD캐주얼,재료·화학·섬유·의복,,...,5,0,17,,,4,90.0,,,
3,U00004,2017-10-26,2020-04-27,4,2011,0.0,0.0,상품기획;영업기획,재료·화학·섬유·의복,,...,5,0,5,,,3,70.0,2.0,11.0,661.62
4,U00005,2020-03-18,2020-04-08,4,2011,0.0,0.0,인사;총무;경영;MD;상품기획,재료·화학·섬유·의복,,...,0,0,0,,,10,50.0,2.0,4.0,873.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8526,U08478,2020-02-05,2020-03-31,4,2003,0.0,2700.0,상품기획,재료·화학·섬유·의복,,...,5,0,5,,,19,0.0,,,
8527,U08479,2016-02-17,2020-07-15,3,0,2500.0,2300.0,디자이너,재료·화학·섬유·의복,,...,6,0,3,가정과,,9,0.0,,,
8528,U08480,2019-11-13,2020-06-24,4,0,0.0,0.0,VMD;텍스타일디자인;섬유디자인;니트디자인;여성복디자인,디자인,,...,5,0,10,미술대학섬유패션코디네이션학과,,9,80.0,,,
8529,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,...,5,0,5,,,9,60.0,,,


### merged_total 생성

In [17]:
_merged_total = pd.merge(apply_train, merged_recruitment, left_on='recruitment_seq', right_on='recruitment_seq', how='left')
_merged_total = pd.merge(_merged_total, merged_resume, left_on='resume_seq', right_on='resume_seq', how='left')
merged_total = _merged_total
merged_total.head()

Unnamed: 0,resume_seq,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,...,univ_type_seq2,univ_transfer,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score,language,exam_name,score
0,U05833,R03838,3.0,,,0,0,2110;2203;2204;2299,3,2,...,5,0,3,,,3,60.0,,,
1,U06456,R02144,3.0,,,0,0,2204;2507;2707,3,2,...,5,0,10,국어국문학과,,8,70.0,,,
2,U07807,R01877,3.0,,,0,0,2507;2707,3,2,...,5,0,15,,,4,80.0,,,
3,U04842,R02463,3.0,,,0,0,2507;2707,4,2,...,5,0,3,,,9,80.0,,,
4,U08336,R00112,3.0,,,0,0,2507;2707,3,8,...,5,0,17,,,9,70.0,,,


In [18]:
merged_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58220 entries, 0 to 58219
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   resume_seq             58220 non-null  object 
 1   recruitment_seq        58220 non-null  object 
 2   address_seq1           58214 non-null  float64
 3   address_seq2           880 non-null    float64
 4   address_seq3           73 non-null     float64
 5   career_end             58220 non-null  int64  
 6   career_start           58220 non-null  int64  
 7   check_box_keyword      58220 non-null  object 
 8   education              58220 non-null  int64  
 9   major_task             58220 non-null  int64  
 10  qualifications         58220 non-null  int64  
 11  text_keyword_x         5650 non-null   object 
 12  company_type_seq       20127 non-null  float64
 13  supply_kind            20127 non-null  float64
 14  employee               20127 non-null  float64
 15  re

# New_apply_train 조건 탐색

## 조건 1. Education / Degree

In [19]:
merged_total.shape

(58220, 44)

In [20]:
merged_total.groupby('education')['degree'].value_counts(ascending=False)

education  degree
2          4         12682
           3          1021
           5           530
           2           516
           6            17
3          4         21016
           3          1602
           5           847
           2           728
           6            19
4          4         17018
           3           913
           5           838
           2           447
           6            15
6          4             9
           3             1
           5             1
Name: degree, dtype: int64

In [21]:
merged_total.groupby('degree')['education'].value_counts(ascending=False)

degree  education
2       3              728
        2              516
        4              447
3       3             1602
        2             1021
        4              913
        6                1
4       3            21016
        4            17018
        2            12682
        6                9
5       3              847
        4              838
        2              530
        6                1
6       3               19
        2               17
        4               15
Name: education, dtype: int64

In [22]:
#요구학위보다 최종학위가 높은 경우
merged_total.loc[merged_total['education'] > merged_total['degree']].shape

(2099, 44)

In [23]:
#학위를 두 수준이나 낮춰서 하향지원한 경우
##요구학위 > 최종학위가 높은 경우를 먼저 drop 하기 때문에 요구학위 < 최종학위인 경우만 생각하면 됨
###degree = 6인 지원자는 반드시 하향지원을 할 수 밖에없어서 예외처리
merged_total.loc[
    (merged_total['degree'] != 6)
    &
    (merged_total['degree'] - merged_total['education'] == 2)
    ].shape

(13529, 44)

In [24]:
def drop_degree(df):
    # 'education' 컬럼 값이 'degree' 컬럼 값보다 큰 조건을 만족하는 행 삭제
    df.drop(df[df['education'] > df['degree']].index, inplace=True)
    #df.drop(df[(merged_total['degree'] != 6)
                #&
                #(merged_total['degree'] - merged_total['education'] == 2)
                #].index, inplace=True)
    return df

## 조건 2. 신입 / 경력

- 신입은 0, 경력은 1

In [25]:
# 공고 기준으로 groupby해서 최종월급이 0인 지원자들(즉, 신입)이 많이 지원했다면 그 공고는 신입 포지션이었을 것이다.
# 신입 포지션이면 0, 경력 포지션이면 1
mode_career_month = merged_total.groupby('recruitment_seq')['last_salary'].agg(lambda x: x.mode().iloc[0]).reset_index()

# position 컬럼 생성
mode_career_month['position'] = mode_career_month['last_salary'].apply(lambda x: 1 if x != 0 else 0)

# 결과 확인
mode_career_month

Unnamed: 0,recruitment_seq,last_salary,position
0,R00001,4100.0,1
1,R00002,0.0,0
2,R00003,2300.0,1
3,R00004,0.0,0
4,R00005,2900.0,1
...,...,...,...
6690,R06691,0.0,0
6691,R06692,3300.0,1
6692,R06693,0.0,0
6693,R06694,0.0,0


In [26]:
mode_career_month.drop('last_salary', axis=1, inplace=True)
merged_total = pd.merge(merged_total, mode_career_month, on='recruitment_seq', how='left')
merged_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58220 entries, 0 to 58219
Data columns (total 45 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   resume_seq             58220 non-null  object 
 1   recruitment_seq        58220 non-null  object 
 2   address_seq1           58214 non-null  float64
 3   address_seq2           880 non-null    float64
 4   address_seq3           73 non-null     float64
 5   career_end             58220 non-null  int64  
 6   career_start           58220 non-null  int64  
 7   check_box_keyword      58220 non-null  object 
 8   education              58220 non-null  int64  
 9   major_task             58220 non-null  int64  
 10  qualifications         58220 non-null  int64  
 11  text_keyword_x         5650 non-null   object 
 12  company_type_seq       20127 non-null  float64
 13  supply_kind            20127 non-null  float64
 14  employee               20127 non-null  float64
 15  re

In [27]:
#경력 월이 12개월 미만이면(즉, 신입이면) 0, 아니면 1
merged_total['is_newbie'] = merged_total['career_month'].apply(lambda x: 0 if x < 12 else 1)
merged_total.head()

Unnamed: 0,resume_seq,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,...,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score,language,exam_name,score,position,is_newbie
0,U05833,R03838,3.0,,,0,0,2110;2203;2204;2299,3,2,...,3,,,3,60.0,,,,0,1
1,U06456,R02144,3.0,,,0,0,2204;2507;2707,3,2,...,10,국어국문학과,,8,70.0,,,,1,1
2,U07807,R01877,3.0,,,0,0,2507;2707,3,2,...,15,,,4,80.0,,,,0,1
3,U04842,R02463,3.0,,,0,0,2507;2707,4,2,...,3,,,9,80.0,,,,0,0
4,U08336,R00112,3.0,,,0,0,2507;2707,3,8,...,17,,,9,70.0,,,,0,0


In [28]:
#신입인데 경력포지션에 지원한 경우
merged_total[(merged_total['position'] == 1) & (merged_total['is_newbie'] == 0)].shape

(681, 46)

In [29]:
#경력인데 신입포지션에 지원한 경우
merged_total[(merged_total['position'] == 0) & (merged_total['is_newbie'] == 1)].shape

(33055, 46)

왤캐 많지???????????????

In [None]:
#경력인데 신입포지션에 지원한 경우가 많을 수 있다. 직무 전환일 경우..

In [30]:
def drop_position(df):
    #신입인데 경력포지션에 지원한 경우 삭제
    df.drop(df[(df['position'] == 1) & (df['is_newbie'] == 0)].index, inplace=True)
    return df

## 조건 3. 고등학교지역 / 회사근무지


In [33]:
merged_total["hischool_location_seq"].unique()

array([ 3, 15, 13,  0,  7, 10, 18, 11,  8,  6,  9, 12,  5, 17, 14,  4, 16])

In [34]:
merged_total["address_seq1"].unique()

array([ 3.,  5., 20., 11.,  9.,  1., nan, 13.,  2.])

In [31]:
merged_total.groupby("hischool_location_seq")["address_seq1"].value_counts(ascending = False)

hischool_location_seq  address_seq1
0                      3.0             9915
                       5.0              288
                       20.0              25
                       1.0                5
                       13.0               4
                                       ... 
17                     1.0                1
18                     3.0              904
                       5.0               20
                       20.0               2
                       1.0                1
Name: address_seq1, Length: 78, dtype: int64

In [32]:
merged_total.groupby("address_seq1")["hischool_location_seq"].value_counts(ascending = False)

address_seq1  hischool_location_seq
1.0           3                        14
              0                         5
              5                         4
              9                         1
              13                        1
                                       ..
20.0          10                        2
              12                        2
              15                        2
              18                        2
              8                         1
Name: hischool_location_seq, Length: 78, dtype: int64

## 조건 4. 신입경력에 따른 자격요건난이도

In [35]:
merged_total["qualifications"].unique()

array([1, 2, 0])

In [36]:
merged_total[["qualifications","position", "is_newbie"]]

Unnamed: 0,qualifications,position,is_newbie
0,1,0,1
1,1,1,1
2,1,0,1
3,1,0,0
4,1,0,0
...,...,...,...
58215,1,0,0
58216,1,0,1
58217,1,1,1
58218,1,1,1


In [37]:
# 경력이 난이도 0에 지원한 경우
merged_total[(merged_total['qualifications'] == 0) & (merged_total['is_newbie'] == 1)].shape

(383, 46)

In [39]:
# 경력이 난이도 1에 지원한 경우
merged_total[(merged_total['qualifications'] == 1) & (merged_total['is_newbie'] == 1)].shape

(34368, 46)

In [40]:
# 경력이 난이도 2에 지원한 경우
merged_total[(merged_total['qualifications'] == 2) & (merged_total['is_newbie'] == 1)].shape

(15932, 46)

In [41]:
# 신입이 난이도 0에 지원한 경우
merged_total[(merged_total['qualifications'] == 0) & (merged_total['is_newbie'] == 0)].shape

(32, 46)

In [42]:
# 신입이 난이도 1에 지원한 경우
merged_total[(merged_total['qualifications'] == 1) & (merged_total['is_newbie'] == 0)].shape

(3578, 46)

In [43]:
# 신입이 난이도 2에 지원한 경우
merged_total[(merged_total['qualifications'] == 2) & (merged_total['is_newbie'] == 0)].shape

(3927, 46)

# New_apply_train 생성 및 검증

조건 1만 적용해서 new_apply_train을 생성한다

In [None]:
new_merged_total = drop_degree(merged_total)

In [None]:
new_apply_train = new_merged_total[['resume_seq', 'recruitment_seq']]
new_apply_train.head()

Unnamed: 0,resume_seq,recruitment_seq
0,U05833,R03838
1,U06456,R02144
2,U07807,R01877
3,U04842,R02463
4,U08336,R00112


In [None]:
#before
apply_train.shape

(57946, 2)

In [None]:
apply_train['resume_seq'].nunique(), apply_train['recruitment_seq'].nunique()

(8482, 6695)

In [None]:
apply_train['resume_seq'].value_counts()

U06543    77
U07490    69
U06166    65
U06206    63
U06125    63
          ..
U05406     2
U00727     2
U04660     2
U08300     2
U05310     2
Name: resume_seq, Length: 8482, dtype: int64

In [None]:
#after
new_apply_train.shape

(56121, 2)

In [None]:
new_apply_train['resume_seq'].nunique(), new_apply_train['recruitment_seq'].nunique()

(8420, 6694)

In [None]:
#missing_resume

missing_resume = list(set(apply_train['resume_seq']) - set(new_apply_train['resume_seq']))
len(missing_resume)

62

In [None]:
#just_one_resume
unique_resume_seq = new_apply_train['resume_seq'].value_counts()
just_one_resume = unique_resume_seq[unique_resume_seq == 1]
just_one_resume = just_one_resume.index.tolist()

In [None]:
#resume_seq가 전부 다 날라간 애들이 적어도 두개는 new_apply_train에 포함되도록 랜덤하게 추출하여 add
new_rows = pd.DataFrame()

for resume in missing_resume:
    temp = apply_train[apply_train['resume_seq'] == resume]
    random_rows = temp.sample(n=2, random_state=args.random_state)
    new_rows = pd.concat([new_rows, random_rows], ignore_index=True)

# new_apply_train에 새로운 행 추가
new_apply_train = pd.concat([new_apply_train, new_rows], ignore_index=True)

In [None]:
for resume in tqdm(just_one_resume):
    temp = apply_train[apply_train['resume_seq'] == resume]

    # new_apply_train에서 resume_seq가 resume인 항목들만 필터링
    new_apply_subset = new_apply_train[new_apply_train['resume_seq'] == resume]

    # temp에서 랜덤으로 1개의 행을 추출하고, new_apply_subset에 없는 행을 찾을 때까지 반복
    while True:
        random_row = temp.sample(n=1, random_state=args.random_state)

        # new_apply_subset에 random_row가 존재하지 않는 경우에만 추가하고 반복 중단
        if not random_row.index.isin(new_apply_subset.index).any():
            new_apply_train = pd.concat([new_apply_train, random_row])
            break

  0%|          | 0/95 [00:00<?, ?it/s]

In [None]:
#모든 resume가 다  채워졌나?

new_apply_train['resume_seq'].nunique(), new_apply_train['recruitment_seq'].nunique()

(8482, 6694)

In [None]:
#missing_recruitment

missing_recruitment = list(set(apply_train['recruitment_seq']) - set(new_apply_train['recruitment_seq']))
len(missing_recruitment)

1

In [None]:
#new_apply_train에 최소 한개의 recruitment는 들어가도록 추가(train/test)

for recruitment in missing_recruitment:
    temp = apply_train[apply_train['recruitment_seq'] == recruitment]
    if not temp.empty:  # temp가 비어 있지 않은 경우에만 실행
        random_row = temp.sample(n=1, random_state=args.random_state)
        new_apply_train = pd.concat([new_apply_train, random_row], ignore_index=True)

In [None]:
#모든 recruitment가 다  채워졌나?

new_apply_train['resume_seq'].nunique(), new_apply_train['recruitment_seq'].nunique()

(8482, 6695)

In [None]:
#결과 비교
print(f"before shape : {apply_train.shape}, after shape : {new_apply_train.shape}")

before shape : (57946, 2), after shape : (56341, 2)


# Apply_Matrix 생성

In [None]:
#학습, 검증 분리
train, test = [], []
new_apply_train_groupby = new_apply_train.groupby('resume_seq')['recruitment_seq'].apply(list)
for uid, iids in zip(new_apply_train_groupby.index.tolist(), new_apply_train_groupby.values.tolist()):
    for iid in iids[:-1]:
        train.append([uid,iid])
    test.append([uid, iids[-1]])

In [None]:
train = pd.DataFrame(train, columns=['resume_seq', 'recruitment_seq'])
test = pd.DataFrame(test, columns=['resume_seq', 'recruitment_seq'])

In [None]:
train.shape, test.shape

((47859, 2), (8482, 2))

In [None]:
pred = apply_train.copy()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
train_user_item_matrix = train.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
pred_user_item_matrix  = pred.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)

In [None]:
train_user_similarity  = cosine_similarity(train_user_item_matrix) # 이력서 간의 유사성 계산
pred_user_similarity  = cosine_similarity(pred_user_item_matrix) # apply 이력서 간의 유사성 계산

In [None]:
train_item_similarity = cosine_similarity(train_user_item_matrix.T) # 공고 간의 유사성 계산
pred_item_similarity = cosine_similarity(pred_user_item_matrix.T) # apply 공고 간의 유사성 계산

In [None]:
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
# 이력서간의 유사도 X 이력서-공고 매트릭스 행렬곱
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)
# 이력서-공고 매트릭스 X 공고간의 유사도 행렬곱

pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

# train/test 결과 도출

In [None]:
alpha = 0.5
beta = 0.5
train_recommendations = []

for idx, user in tqdm(enumerate(train_user_item_matrix.index)):
    applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)

    # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
    sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx] * beta).argsort()[::-1]

    # 이미 지원하지 않은 공고를 선택
    recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

    for job in recommended_jobs:
        train_recommendations.append([user, job])

0it [00:00, ?it/s]

# Recall

In [None]:
def recall5(answer_df, submission_df):

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # submission의 예측이 각각 5개인지 확인
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # submission의 예측된 값들에 null값이 있는지 확인
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # 예측값에 중복이 있는지 확인
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    # primary_col 즉 resume_seq가 양측에 있는지 확인 후 남김
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    #
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head().tolist()).to_dict()

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()


    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함
            individual_recalls.append(individual_recall)


    recall = np.mean(individual_recalls)
    return recall

In [None]:
train_recommendations = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

In [None]:
recall5(test,train_recommendations)

0.12131572742277764

# 최적의 Alpha, Beta 값 탐색

In [None]:
def calculate_recommendations(alpha, train_user_item_matrix, train_item_predicted_scores, train_user_predicted_scores):
    train_recommendations = []

    for idx, user in enumerate(train_user_item_matrix.index):
        applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)
        sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx] * (1 - alpha)).argsort()[::-1]
        recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

        for job in recommended_jobs:
            train_recommendations.append([user, job])

    return pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

In [None]:
# 초기화
n_iter = 0
max_iter = 10
pbar = tqdm(total=max_iter)
best_score = 0
best_alpha = 0.0
best_beta = 0.0

while n_iter < max_iter:
    alpha = round(random.uniform(0, 1), 3)
    train_recommendations = calculate_recommendations(alpha, train_user_item_matrix, train_item_predicted_scores, train_user_predicted_scores)
    score = recall5(test, train_recommendations)

    if score > best_score:
        best_score = score
        best_alpha = alpha
        best_beta = 1 - alpha

    n_iter += 1
    pbar.update(1)

pbar.close()
print(f"Best score: {best_score}, Best alpha: {best_alpha}, Best beta: {best_beta}")


  0%|          | 0/10 [00:00<?, ?it/s]

Best score: 0.12214100448007545, Best alpha: 0.343, Best beta: 0.657


# Submission

In [None]:
top_recommendations = []

for idx, user in tqdm(enumerate(pred_user_item_matrix.index)):
    applied_jobs = set(pred_user_item_matrix.loc[user][pred_user_item_matrix.loc[user] == 1].index)

    # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
    sorted_job_indices = (pred_item_predicted_scores.loc[user].values * best_alpha + pred_user_predicted_scores[idx] * best_beta).argsort()[::-1]

    # 이미 지원하지 않은 공고를 선택
    recommended_jobs = [job for job in pred_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]


    for job in recommended_jobs:
        top_recommendations.append([user, job])

0it [00:00, ?it/s]

In [None]:
pred_recommendations = pd.DataFrame(top_recommendations, columns=['resume_seq', 'recruitment_seq'])

In [None]:
now = datetime.now() + timedelta(hours=9)
formatted_time = now.strftime("%Y%m%d_%H%M")
formatted_time

'20231112_1640'

In [None]:
pred_recommendations.to_csv(args.default_path+ '/result/submission_'+formatted_time+'.csv', index=False)