In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import os
import tqdm
import json
import polars as pl
import numpy as np
import seaborn as sns

In [19]:
n_core_interactions = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/n_core_user_course.csv')

In [None]:
n_core_interactions

user,course,enroll_time
str,str,str
"""U_69""","""C_679390""","""2019-09-23 14:57:00"""
"""U_69""","""C_696994""","""2019-10-12 03:32:49"""
"""U_69""","""C_697791""","""2019-10-14 15:39:05"""
"""U_69""","""C_696911""","""2019-10-21 12:54:12"""
"""U_69""","""C_875624""","""2019-11-05 11:51:59"""
…,…,…
"""U_34712050""","""C_879054""","""2020-10-14 17:52:03"""
"""U_34712050""","""C_696956""","""2020-10-14 19:00:11"""
"""U_34712050""","""C_797404""","""2020-10-14 21:30:56"""
"""U_34712050""","""C_681422""","""2020-12-01 19:43:47"""


Tạo các quan hệ giữa course và các đối tượng khác

## Knowledge Graph filtering

### course-teacher

In [5]:
course_teacher_df = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/course_teacher_filtered.txt',
                                separator='\t', has_header=False, new_columns=['course', 'teacher'])
course_teacher_df

course,teacher
str,str
"""C_584313""","""T_20"""
"""C_584329""","""T_21"""
"""C_584329""","""T_22"""
"""C_584381""","""T_23"""
"""C_597208""","""T_28"""
…,…
"""C_2344479""","""T_16368"""
"""C_2344479""","""T_16369"""
"""C_2344479""","""T_16370"""
"""C_2344479""","""T_16371"""


In [6]:
course_teacher_df = course_teacher_df.rename({'course': 'h', 'teacher': 't'}) \
                                    .with_columns(pl.lit('course.teacher').alias('r')) \
                                    .select(['h', 'r', 't'])

course_teacher_df

h,r,t
str,str,str
"""C_584313""","""course.teacher""","""T_20"""
"""C_584329""","""course.teacher""","""T_21"""
"""C_584329""","""course.teacher""","""T_22"""
"""C_584381""","""course.teacher""","""T_23"""
"""C_597208""","""course.teacher""","""T_28"""
…,…,…
"""C_2344479""","""course.teacher""","""T_16368"""
"""C_2344479""","""course.teacher""","""T_16369"""
"""C_2344479""","""course.teacher""","""T_16370"""
"""C_2344479""","""course.teacher""","""T_16371"""


### Course-school

In [7]:
course_school_df = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/course_school_filtered.txt',
                                separator='\t', has_header=False, new_columns=['course', 'school'])
course_school_df

course,school
str,str
"""C_584313""","""S_1"""
"""C_584329""","""S_1"""
"""C_584381""","""S_1"""
"""C_597208""","""S_1"""
"""C_597225""","""S_6"""
…,…
"""C_2343056""","""S_97"""
"""C_2343067""","""S_33"""
"""C_2343133""","""S_63"""
"""C_2343522""","""S_12"""


In [8]:
course_school_df = course_school_df.rename({'course': 'h', 'school': 't'}) \
                                    .with_columns(pl.lit('course.school').alias('r')) \
                                    .select(['h', 'r', 't'])

course_school_df

h,r,t
str,str,str
"""C_584313""","""course.school""","""S_1"""
"""C_584329""","""course.school""","""S_1"""
"""C_584381""","""course.school""","""S_1"""
"""C_597208""","""course.school""","""S_1"""
"""C_597225""","""course.school""","""S_6"""
…,…,…
"""C_2343056""","""course.school""","""S_97"""
"""C_2343067""","""course.school""","""S_33"""
"""C_2343133""","""course.school""","""S_63"""
"""C_2343522""","""course.school""","""S_12"""


## Course-field

In [9]:
course_field_df = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/course_field_filtered.txt',
                                separator='\t', has_header=False, new_columns=['course', 'field'])
course_field_df

course,field
str,str
"""C_584313""","""历史学"""
"""C_584313""","""中国语言文学"""
"""C_584329""","""应用经济学"""
"""C_584329""","""数学"""
"""C_584329""","""物理学"""
…,…
"""C_697791""","""计算机科学与技术"""
"""C_682189""","""计算机科学与技术"""
"""C_735157""","""计算机科学与技术"""
"""C_677218""","""计算机科学与技术"""


In [10]:
course_field_df = course_field_df.rename({'course': 'h', 'field': 't'}) \
                                    .with_columns(pl.lit('course.field').alias('r')) \
                                    .select(['h', 'r', 't'])

In [11]:
course_field_df

h,r,t
str,str,str
"""C_584313""","""course.field""","""历史学"""
"""C_584313""","""course.field""","""中国语言文学"""
"""C_584329""","""course.field""","""应用经济学"""
"""C_584329""","""course.field""","""数学"""
"""C_584329""","""course.field""","""物理学"""
…,…,…
"""C_697791""","""course.field""","""计算机科学与技术"""
"""C_682189""","""course.field""","""计算机科学与技术"""
"""C_735157""","""course.field""","""计算机科学与技术"""
"""C_677218""","""course.field""","""计算机科学与技术"""


## Course-concept

In [12]:
course_concept_df = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/course_concept_filtered.txt',
                                separator='\t', has_header=False, new_columns=['concept', 'course'])
course_concept_df

concept,course
str,str
"""K_n号计数器_控制科学与工程""","""C_681460"""
"""K_串行链接_控制科学与工程""","""C_681460"""
"""K_热继电器按钮_控制科学与工程""","""C_681460"""
"""K_谐波污染_控制科学与工程""","""C_681460"""
"""K_偶校验转换_控制科学与工程""","""C_681460"""
…,…
"""K_稳态人均资本_应用经济学""","""C_696675"""
"""K_稳态的人均资本_应用经济学""","""C_696675"""
"""K_资本的黄金律水平_应用经济学""","""C_696675"""
"""K_折旧率_应用经济学""","""C_696675"""


In [13]:
course_concept_df = course_concept_df.select(['course', 'concept'])

In [14]:
course_concept_df = course_concept_df.rename({'course': 'h', 'concept': 't'}) \
                                    .with_columns(pl.lit('course.concept').alias('r')) \
                                    .select(['h', 'r', 't'])

course_concept_df

h,r,t
str,str,str
"""C_681460""","""course.concept""","""K_n号计数器_控制科学与工程"""
"""C_681460""","""course.concept""","""K_串行链接_控制科学与工程"""
"""C_681460""","""course.concept""","""K_热继电器按钮_控制科学与工程"""
"""C_681460""","""course.concept""","""K_谐波污染_控制科学与工程"""
"""C_681460""","""course.concept""","""K_偶校验转换_控制科学与工程"""
…,…,…
"""C_696675""","""course.concept""","""K_稳态人均资本_应用经济学"""
"""C_696675""","""course.concept""","""K_稳态的人均资本_应用经济学"""
"""C_696675""","""course.concept""","""K_资本的黄金律水平_应用经济学"""
"""C_696675""","""course.concept""","""K_折旧率_应用经济学"""


In [15]:
course_topic_df = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/course_topic.txt',
                                separator='\t', has_header=False, new_columns=['course', 'topic'])
course_topic_df

course,topic
str,i64
"""C_584313""",3
"""C_584329""",56
"""C_584381""",46
"""C_597208""",19
"""C_597225""",55
…,…
"""C_2338005""",111
"""C_2338076""",79
"""C_2341259""",15
"""C_2337996""",110


In [16]:
course_topic_df = course_topic_df.with_columns(course_topic_df["topic"].cast(pl.Utf8))
course_topic_df

course,topic
str,str
"""C_584313""","""3"""
"""C_584329""","""56"""
"""C_584381""","""46"""
"""C_597208""","""19"""
"""C_597225""","""55"""
…,…
"""C_2338005""","""111"""
"""C_2338076""","""79"""
"""C_2341259""","""15"""
"""C_2337996""","""110"""


In [17]:
course_topic_df = course_topic_df.rename({'course': 'h', 'topic': 't'}) \
                                    .with_columns(pl.lit('course.topic').alias('r')) \
                                    .select(['h', 'r', 't'])

### Filter invalid courses (not in interactions_n_core.txt) and filter KG

In [20]:
n_core_interactions

user,course,enroll_time
str,str,str
"""U_69""","""C_679390""","""2019-09-23 14:57:00"""
"""U_69""","""C_696994""","""2019-10-12 03:32:49"""
"""U_69""","""C_697791""","""2019-10-14 15:39:05"""
"""U_69""","""C_696911""","""2019-10-21 12:54:12"""
"""U_69""","""C_875624""","""2019-11-05 11:51:59"""
…,…,…
"""U_34712050""","""C_879054""","""2020-10-14 17:52:03"""
"""U_34712050""","""C_696956""","""2020-10-14 19:00:11"""
"""U_34712050""","""C_797404""","""2020-10-14 21:30:56"""
"""U_34712050""","""C_681422""","""2020-12-01 19:43:47"""


In [21]:
valid_courses = set(n_core_interactions['course'])

triplets = pl.concat([course_teacher_df, course_school_df, course_field_df, course_concept_df, course_topic_df]) \
                .filter(pl.col('h').is_in(valid_courses)) \
                .unique()

triplets

h,r,t
str,str,str
"""C_1774978""","""course.concept""","""K_战略的实施_管理科学与工程"""
"""C_682586""","""course.concept""","""K_文案设计师_艺术学"""
"""C_682303""","""course.concept""","""K_劳务外派_海洋科学"""
"""C_677095""","""course.concept""","""K_有限_哲学"""
"""C_680762""","""course.concept""","""K_程序_计算机科学与技术"""
…,…,…
"""C_696976""","""course.concept""","""K_秩序_管理科学与工程"""
"""C_682742""","""course.concept""","""K_会计估计_工商管理"""
"""C_697073""","""course.concept""","""K_购物方式_应用经济学"""
"""C_696674""","""course.concept""","""K_事件处理程序_计算机科学与技术"""


In [22]:
def filter_invalid_relations_and_entities(triplets, min_entities=5, min_rel=25):
    old_size = -1
    while (old_size != triplets.shape[0]):
        old_size = triplets.shape[0]

        # Filter entites
        inter_counts_of_entity = triplets.group_by('t').count()
        valid_entities = inter_counts_of_entity.filter(pl.col('count') >= min_entities)['t']
        triplets = triplets.filter(pl.col('t').is_in(valid_entities))

        # Filter interactions
        inter_counts_of_rel = triplets.group_by('r').count()
        valid_rels = inter_counts_of_rel.filter(pl.col('count') >= min_rel)['r']
        triplets = triplets.filter(pl.col('r').is_in(valid_rels))

        print(f'New size: {triplets.shape[0]}')

    print('================ Valid relations ===============')
    print(triplets['r'].value_counts())

    print('===== # of attribute type in each relation =====')
    rels = set(triplets['r'])

    for rel in rels:
        n_uni_attr = triplets.filter(pl.col('r') == rel)['t'].n_unique()
        print(f'+ {rel}: {n_uni_attr}')
    return triplets

In [23]:
fil_triplets = filter_invalid_relations_and_entities(triplets)

  inter_counts_of_entity = triplets.group_by('t').count()
  inter_counts_of_rel = triplets.group_by('r').count()


New size: 68777
New size: 68777
shape: (5, 2)
┌────────────────┬───────┐
│ r              ┆ count │
│ ---            ┆ ---   │
│ str            ┆ u32   │
╞════════════════╪═══════╡
│ course.topic   ┆ 2410  │
│ course.school  ┆ 2319  │
│ course.field   ┆ 472   │
│ course.teacher ┆ 273   │
│ course.concept ┆ 63303 │
└────────────────┴───────┘
===== # of attribute type in each relation =====
+ course.concept: 7113
+ course.topic: 128
+ course.field: 41
+ course.teacher: 41
+ course.school: 145


In [24]:
fil_triplets

h,r,t
str,str,str
"""C_677095""","""course.concept""","""K_有限_哲学"""
"""C_680762""","""course.concept""","""K_程序_计算机科学与技术"""
"""C_697791""","""course.concept""","""K_对象类型_计算机科学与技术"""
"""C_696651""","""course.concept""","""K_一阶电路_电气工程"""
"""C_1771156""","""course.teacher""","""T_1423"""
…,…,…
"""C_696700""","""course.concept""","""K_怒发冲冠_中国语言文学"""
"""C_696877""","""course.concept""","""K_量子_物理学"""
"""C_735441""","""course.concept""","""K_switch语句_计算机科学与技术"""
"""C_696729""","""course.concept""","""K_正弦量的_电气工程"""


# Mapping

In [None]:
MODEL_DATA_DIR = '/content/drive/MyDrive/Mạng xã hội/FM'
ENTITY_PATH = MODEL_DATA_DIR + '/entity_list.txt'
ITEM_PATH = MODEL_DATA_DIR + '/item_list.txt'
RELATION_PATH = MODEL_DATA_DIR + '/relation_list.txt'
KG_PATH = MODEL_DATA_DIR + '/kg_final.txt'
USER_PATH = MODEL_DATA_DIR + '/user_list.txt'

### Write to entity list, item list, relation list

### items

In [None]:
item_df = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/course_map.csv', has_header=False) \
          .rename({"column_1": "org_id", "column_2": "remap_id"})

In [None]:
item_df.head()

org_id,remap_id
str,i64
"""C_679390""",0
"""C_696994""",1
"""C_697791""",2
"""C_696911""",3
"""C_875624""",4


In [None]:
item_df.write_csv(ITEM_PATH, separator=' ', include_header=True)

### entities

In [None]:
item_df = item_df.select('org_id')

entity_df = fil_triplets.select('t') \
                        .unique() \
                        .rename({'t': 'org_id'})

entity_df = pl.concat([item_df, entity_df]) \
                .with_columns(pl.Series(range(item_df.shape[0] + entity_df.shape[0])).alias('remap_id'))

entity_df

org_id,remap_id
str,i64
"""C_679390""",0
"""C_696994""",1
"""C_697791""",2
"""C_696911""",3
"""C_875624""",4
…,…
"""K_干密度_地质学""",10291
"""S_113""",10292
"""K_RS触发器_电子科学与技术""",10293
"""K_取代_化学""",10294


In [None]:
entity_df.filter(pl.col("org_id") == 'C_1925117')

org_id,remap_id
str,i64
"""C_1925117""",2824


In [None]:
entity_df.write_csv(ENTITY_PATH, separator=' ', include_header=True)

### relations

In [None]:
rel_df = fil_triplets.select('r').unique()
rel_df = rel_df.with_columns(pl.Series(range(rel_df.shape[0])).alias('remap_id')) \
                .rename({'r': 'org_id'})

rel_df

org_id,remap_id
str,i64
"""course.concept""",0
"""course.school""",1
"""course.teacher""",2
"""course.field""",3
"""course.topic""",4


In [None]:
rel_df.write_csv(RELATION_PATH, separator=' ', include_header=True)

### KG

In [None]:
entity_mapping = dict(zip(entity_df['org_id'], entity_df['remap_id']))
rel_mapping = dict(zip(rel_df['org_id'], rel_df['remap_id']))

In [None]:
enc_triplets = fil_triplets.with_columns(
    pl.col('h').replace(entity_mapping).cast(pl.Int32),
    pl.col('r').replace(rel_mapping).cast(pl.Int32),
    pl.col('t').replace(entity_mapping).cast(pl.Int32)
)

enc_triplets

h,r,t
i32,i32,i32
575,0,9706
51,0,4451
92,0,8641
2263,0,3426
331,0,7517
…,…,…
2233,4,8303
1459,0,7437
453,0,8729
484,0,5358


In [None]:
enc_triplets.write_csv(KG_PATH, separator=' ', include_header=False)

### user

In [None]:
user_df = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/user_map.csv', has_header=False) \
          .rename({"column_1": "org_id", "column_2": "remap_id"})

user_df

org_id,remap_id
str,i64
"""U_69""",0
"""U_90""",1
"""U_105""",2
"""U_112""",3
"""U_172""",4
…,…
"""U_34711787""",99965
"""U_34711839""",99966
"""U_34711954""",99967
"""U_34711985""",99968


In [None]:
user_df.write_csv(USER_PATH, separator=' ', include_header=True)

### Interactions

In [None]:
entity_mapping = dict(zip(entity_df['org_id'], entity_df['remap_id']))
user_mapping = dict(zip(user_df['org_id'], user_df['remap_id']))

In [None]:
sorted_interactions = n_core_interactions \
            .with_columns(
                pl.col('user').replace(user_mapping).cast(pl.Int32),
                pl.col('course').replace(entity_mapping).cast(pl.Int32)
            ) \
            .sort('enroll_time') \
            .select(['user', 'course']) \
            .group_by('user') \
            .agg(pl.col('course')) \
            .rename({'course': 'course_order'}) \
            .sort('user')

sorted_interactions

user,course_order
i32,list[i32]
0,"[0, 1, … 4]"
1,"[5, 6, … 8]"
2,"[9, 10, … 31]"
3,"[1, 32, … 76]"
4,"[65, 73, … 90]"
…,…
99965,"[137, 435, … 628]"
99966,"[887, 355, … 137]"
99967,"[1927, 202, … 1867]"
99968,"[2115, 1269, … 847]"


# Statistics of knowledge graph

In [None]:
rel_df = pl.read_csv(os.path.join(MODEL_DATA_DIR, 'relation_list.txt'), separator=' ')
rel_mapping = dict(zip(rel_df['remap_id'], rel_df['org_id']))
rel_mapping

{0: 'course.concept',
 1: 'course.school',
 2: 'course.teacher',
 3: 'course.field'}

In [None]:
kg_data = pl.read_csv(os.path.join(MODEL_DATA_DIR, 'kg_final.txt'), separator=' ',
                      has_header=False, new_columns=['h', 'r', 't'])
kg_data.select(['r', 't']).group_by('r').n_unique().with_columns(pl.col('r').replace(rel_mapping))

r,t
str,u32
"""course.field""",41
"""course.concept…",7162
"""course.school""",144
"""course.teacher…",40


# Add user gender info to user_list.txt

In [None]:
import shutil
shutil.copyfile(os.path.join(MODEL_DATA_DIR, 'user_list.txt'),
                os.path.join(MODEL_DATA_DIR, 'user_list_without_gender.txt'))

In [None]:
user_df = pl.read_ndjson('/content/drive/MyDrive/Mạng xã hội/FM/user_filtered.json') \
            .select(['id', 'gender'])
gender_mapping = dict(zip(user_df['id'], user_df['gender']))
user_list = pl.read_csv(os.path.join(MODEL_DATA_DIR, 'user_list_without_gender.txt'),
                      separator=' ')

In [None]:
user_list = user_list.rename({'org_id': 'id'})
user_list = user_list.join(user_df, how='inner', on='id')

user_list

id,remap_id,gender
str,i64,f64
"""U_69""",0,1.0
"""U_90""",1,0.0
"""U_105""",2,1.0
"""U_112""",3,1.0
"""U_172""",4,1.0
…,…,…
"""U_34711787""",99965,2.0
"""U_34711839""",99966,1.0
"""U_34711954""",99967,2.0
"""U_34711985""",99968,0.0


In [None]:
user_list.head()

id,remap_id,gender
str,i64,i64
"""U_69""",0,1
"""U_90""",1,0
"""U_105""",2,1
"""U_112""",3,1
"""U_172""",4,1


In [None]:
user_list = user_list.rename({'org_id': 'id'})
user_list = user_list.join(user_df, how='inner', on='id')

In [None]:
user_list.shape

(99970, 3)

In [None]:
user_list = user_list.with_columns(
    pl.col("gender").cast(pl.Int64)
)

In [None]:
user_list.write_csv(os.path.join(MODEL_DATA_DIR, 'user_list.txt'), separator=' ')

In [None]:
user_list = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/user_list_with_gender.txt', separator=' ')
user_list_with_community = pl.read_csv('/content/drive/MyDrive/Mạng xã hội/FM/user_communities.csv')

In [None]:
user_list = user_list.join(user_list_with_community, how='inner', left_on='remap_id',
                          right_on='id')

In [None]:
user_list = user_list.drop("")

In [None]:
user_list.write_csv('/content/drive/MyDrive/Mạng xã hội/FM/user_list.txt',  separator=' ')