In [1]:
from collections import Counter
import pandas as pd
import numpy as np

def get_illegal_ids_by_inter_num(df, field, max_num=None, min_num=None):
    if field is None:
        return set()
    if max_num is None and min_num is None:
        return set()

    max_num = max_num or np.inf
    min_num = min_num or -1

    ids = df[field].values
    inter_num = Counter(ids)
    ids = {id_ for id_ in inter_num if inter_num[id_] < min_num or inter_num[id_] > max_num}
    return ids

def filter_by_k_core(df, learner_id, course_id, min_u_num, min_i_num):
    while True:
        ban_users = get_illegal_ids_by_inter_num(df, field=learner_id, max_num=None, min_num=min_u_num)
        ban_items = get_illegal_ids_by_inter_num(df, field=course_id, max_num=None, min_num=min_i_num)
        if len(ban_users) == 0 and len(ban_items) == 0:
            return df

        dropped_inter = pd.Series(False, index=df.index)
        if learner_id:
            dropped_inter |= df[learner_id].isin(ban_users)
        if course_id:
            dropped_inter |= df[course_id].isin(ban_items)
        df = df[~dropped_inter]
    return df


In [20]:
df = pd.read_csv('../train_pre.csv')
df

Unnamed: 0,user_id,item_id,rating,x_label
0,114341,9124,5.0,0
1,114341,32109,4.0,0
2,114341,44195,5.0,0
3,114341,24427,5.0,0
4,114341,10994,5.0,1
...,...,...,...,...
1254436,173685,2496,5.0,0
1254437,173685,6056,5.0,1
1254438,43567,42968,5.0,2
1254439,43567,6056,5.0,1


In [21]:
# 필터링 조건 설정
learner_id = 'user_id'
course_id = 'item_id'
min_u_num = 5
min_i_num = 5

# 데이터프레임 필터링 및 가공
filtered_df = filter_by_k_core(df, learner_id, course_id, min_u_num, min_i_num)

# 결과 출력
filtered_df

Unnamed: 0,user_id,item_id,rating,x_label
8,85622,26388,5.0,1
10,85622,8209,4.0,0
11,85622,26617,5.0,0
12,85622,45984,5.0,0
13,85622,32830,5.0,0
...,...,...,...,...
1254413,97639,26952,5.0,0
1254414,97639,14737,5.0,0
1254415,97639,23532,4.0,0
1254416,97639,3966,5.0,0


In [22]:
filtered_df['item_id'].value_counts()

item_id
11643    2615
26001    2182
9350     1843
38026    1703
10501    1465
         ... 
16555       5
50780       5
20337       5
58514       5
19781       5
Name: count, Length: 38653, dtype: int64

In [23]:
filtered_df['user_id'].value_counts()

user_id
19883     325
88922     297
98284     258
85327     258
108539    254
         ... 
132436      5
52225       5
25213       5
53083       5
97639       5
Name: count, Length: 87113, dtype: int64

In [7]:
#filtered_df = filtered_df.sort_values(by='user_id', ascending=True)
filtered_df

Unnamed: 0,user_id,item_id,rating
968758,1,7960,5.0
968762,1,10454,5.0
968761,1,51383,5.0
968760,1,52670,5.0
968759,1,34715,5.0
...,...,...,...
640394,192402,47205,5.0
640393,192402,40733,5.0
640391,192402,45041,5.0
640395,192402,39050,5.0


In [24]:
filtered_df['x_label'] = 1
filtered_df.reset_index(drop=True, inplace=True)
filtered_df

Unnamed: 0,user_id,item_id,rating,x_label
0,85622,26388,5.0,1
1,85622,8209,4.0,1
2,85622,26617,5.0,1
3,85622,45984,5.0,1
4,85622,32830,5.0,1
...,...,...,...,...
821403,97639,26952,5.0,1
821404,97639,14737,5.0,1
821405,97639,23532,4.0,1
821406,97639,3966,5.0,1


In [25]:
filtered_df.to_csv('../kcore5_to_UI_new2.inter', index = False)

# 데이터 증강해서 최소 아이템수 5-> 10로 늘리기

In [12]:
df = filtered_df

item_counts = df['item_id'].value_counts()
valid_item_ids = item_counts[item_counts >= 100].index
valid_items_df = df[df['item_id'].isin(valid_item_ids)]
average_ratings = valid_items_df.groupby('item_id')['rating'].mean()
sorted_ratings = average_ratings.sort_values(ascending=False)

high_rated_items = sorted_ratings[sorted_ratings >= 3]

random_selected_items = np.random.choice(high_rated_items.index, size=5, replace=False)
print('랜덤 5',random_selected_items)

new_rows = []
for user_id in df['user_id'].unique():
    for item_id in random_selected_items:
        new_row = {
            'user_id': user_id,
            'item_id': item_id,
            'rating': average_ratings[item_id],  # 해당 아이템의 평균 평점으로 설정
            'x_label': 0
        }
        new_rows.append(new_row)

# 새로운 행을 담은 데이터프레임 생성
new_data_df = pd.DataFrame(new_rows)

# 결과 출력
print(new_data_df)

랜덤 5 [54969 56534   860 54412 28507]
        user_id  item_id    rating  x_label
0             1    54969  4.679688        0
1             1    56534  3.956897        0
2             1      860  4.360000        0
3             1    54412  4.155738        0
4             1    28507  4.597855        0
...         ...      ...       ...      ...
435560   192402    54969  4.679688        0
435561   192402    56534  3.956897        0
435562   192402      860  4.360000        0
435563   192402    54412  4.155738        0
435564   192402    28507  4.597855        0

[435565 rows x 4 columns]


In [14]:
auged_df = pd.concat([new_data_df, filtered_df], ignore_index=True)

# 중복 확인
duplicates = auged_df.duplicated()
duplicated_rows = auged_df[duplicates]

# 결과 출력
print("Combined DataFrame:")
print(auged_df)
print("\nDuplicated Rows:")
print(duplicated_rows)
print(auged_df['x_label'].value_counts())

Combined DataFrame:
         user_id  item_id    rating  x_label
0              1    54969  4.679688        0
1              1    56534  3.956897        0
2              1      860  4.360000        0
3              1    54412  4.155738        0
4              1    28507  4.597855        0
...          ...      ...       ...      ...
1256968   192402    47205  5.000000        1
1256969   192402    40733  5.000000        1
1256970   192402    45041  5.000000        1
1256971   192402    39050  5.000000        1
1256972   192402    16014  4.000000        1

[1256973 rows x 4 columns]

Duplicated Rows:
Empty DataFrame
Columns: [user_id, item_id, rating, x_label]
Index: []
x_label
1    821408
0    435565
Name: count, dtype: int64


In [15]:
auged_df['item_id'].value_counts()

item_id
28507    87859
56534    87345
54969    87241
54412    87235
860      87213
         ...  
35446        5
11688        5
62351        5
15659        5
10686        5
Name: count, Length: 38653, dtype: int64

In [16]:
auged_df['user_id'].value_counts()

user_id
19883     330
88922     302
85327     263
98284     263
108539    259
         ... 
55237      10
55248      10
139718     10
55250      10
96280      10
Name: count, Length: 87113, dtype: int64

In [17]:
auged_df.reset_index(drop=True, inplace=True)
auged_df

Unnamed: 0,user_id,item_id,rating,x_label
0,1,54969,4.679688,0
1,1,56534,3.956897,0
2,1,860,4.360000,0
3,1,54412,4.155738,0
4,1,28507,4.597855,0
...,...,...,...,...
1256968,192402,47205,5.000000,1
1256969,192402,40733,5.000000,1
1256970,192402,45041,5.000000,1
1256971,192402,39050,5.000000,1


In [18]:
auged_df.to_csv('../kcore5_auged.inter', index = False)