In [55]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [56]:
def merge_modules(df, module_sets, new_code):
    all_modules_to_merge = set().union(*module_sets)  # 合并所有的模块集合
    df['Module Code'] = df['Module Code'].apply(lambda x: new_code if x in all_modules_to_merge else x)
    df = df.drop_duplicates()  # 去除重复行
    return df

In [57]:
# 课程数据筛选
df_template = pd.read_excel("Merged_Input.xlsx", sheet_name = "Sheet1")
df_template.head()

# 只关注['Department', 'Activity Type', 'Module Code', 'Activity Counts']
df_template = df_template[['Department', 'Module Code', 'Activity Type', 'Activity Counts', 'Module Size', 'Duration']]
df_template.drop_duplicates(inplace=True)

df_template['Duration'] = df_template['Duration'].str.rstrip('h').astype(float) * 60

df_template.to_excel('temp.xlsx', index=False)

# 剔除所有PED、CEC、MED、MUS开头的课程
df_template = df_template[~df_template['Module Code'].str.startswith(('PED', 'CEC', 'MED', 'MUS'))]
# df_template = df_template[~(df_template['Module Code'].str.startswith('GF') & (df_template['Activity Type'] == 'LEC'))]

# 只关注Activity Type为非tut的（除GE/GF/CHI课程）
df_template_not_tut = df_template[
    (df_template['Activity Type'] != 'TUT') |
    df_template['Module Code'].str.startswith(('GE', 'GF', 'CHI'))
]

max_duration = df_template_not_tut.groupby(['Module Code', 'Activity Type'])['Duration'].transform('max')
df_template_not_tut.loc[:, 'Duration'] = max_duration

df_template_not_tut = df_template_not_tut.groupby(['Module Code', 'Activity Type'], as_index=False).agg({
    'Department': 'first',
    'Activity Type': 'first',
    'Activity Counts': 'first',
    'Module Size': 'first',
    'Duration': 'first',
})

# 所有tut（里面包括了白天的tut）
df_template_tut = df_template[df_template["Activity Type"] == "TUT"] 

# 创建一个副本用于直接对表进行处理
df_template_not_tut_c = df_template_not_tut.copy()


In [58]:
df_template_not_tut

Unnamed: 0,Module Code,Department,Activity Type,Activity Counts,Module Size,Duration
0,ACT2111,SME,LEC,1,120,90.0
1,ACT3011,SME,LEC,3,300,90.0
2,ACT3121,SME,LEC,3,300,90.0
3,ACT3131,SME,LEC,2,160,90.0
4,ACT3141,SME,LEC,2,160,90.0
...,...,...,...,...,...,...
282,TRA4030,SHSS,LEC,1,50,180.0
283,URM2110,SHSS,LEC,1,40,90.0
284,URM3010,SHSS,LEC,1,40,120.0
285,URM3110,SHSS,LEC,1,40,90.0


In [59]:
"""# 根据Module分组，并打上Mixed标签
for module, group in df_template_not_tut_c.groupby('Module Code'):
    # 标记Lecture与Tutorial的索引
    is_lecture = group['Activity Type'] == 'LEC'
    is_tutorial = group['Activity Type'] == 'TUT'
    
    lectures = group[is_lecture]
    tutorials = group[is_tutorial]

    if not tutorials.empty:
        
        # 对于带有TUT的Lec：
        if not module.startswith('GF'):
            # 将Lecture改为Mixed
            df_template_not_tut_c.loc[group.index[is_lecture], 'Activity Type'] = 'Mixed'
            
            # 获取Lecture行的Activity Counts
            lecture_counts = lectures['Activity Counts'].iloc[0]
            
            # 将对应Tutorial所在行的Activity Counts中减去Lecture行中的Activity Counts
            df_template_not_tut_c.loc[group.index[is_tutorial], 'Activity Counts'] -= lecture_counts
        
        # 将剩余Tutorial两两捆绑
        tutorial_counts = df_template_not_tut_c[(df_template_not_tut_c['Module Code'] == module) & is_tutorial]['Activity Counts']
        paired_tutorials = (tutorial_counts + 1) // 2
        
        # 更新Tutorial行的Activity Type为Mixed
        df_template_not_tut_c.loc[(df_template_not_tut_c['Module Code'] == module) & is_tutorial, 'Activity Type'] = 'Mixed'
        
        # 更新Tutorial行的Activity Counts
        df_template_not_tut_c.loc[(df_template_not_tut_c['Module Code'] == module) & is_tutorial, 'Activity Counts'] = paired_tutorials.values
        
        # 移除减去后Counts变为0的行
        df_template_not_tut_c = df_template_not_tut_c[~((df_template_not_tut_c['Module Code'] == module) & (df_template_not_tut_c['Activity Counts'] == 0))]"""

# 将Module Size进行归一化处理，然后按照Activity Counts对权重进行均分
scaler = MinMaxScaler(feature_range=(0.00001, 1))
df_template_not_tut_c['Normalized Weight'] = scaler.fit_transform(df_template_not_tut_c['Module Size'].values.reshape(-1, 1))
df_template_not_tut_c['Normalized Weight'] = df_template_not_tut_c.apply(lambda r: r['Normalized Weight'] / r['Activity Counts'], axis = 1)

In [60]:
# 对Activity Counts合并
df_template_not_tut_c = df_template_not_tut_c.groupby(['Module Code', 'Activity Type'], as_index=False).agg({
    'Department': 'first',
    'Activity Type': 'first',
    'Normalized Weight': 'first',
    'Duration': 'first',
    'Activity Counts': 'sum',   
})

df_template_not_tut = df_template_not_tut_c.copy()

# 制表
df_template_not_tut.to_excel("全部非TUT Course(from teample).xlsx", index=False)
df_template_tut.to_excel("全部TUT Course.xlsx", index=False)

df_template_not_tut.head()

Unnamed: 0,Module Code,Department,Activity Type,Normalized Weight,Duration,Activity Counts
0,ACT2111,SME,LEC,0.072617,90.0,1
1,ACT3011,SME,LEC,0.063809,90.0,3
2,ACT3121,SME,LEC,0.063809,90.0,3
3,ACT3131,SME,LEC,0.049509,90.0,2
4,ACT3141,SME,LEC,0.049509,90.0,2


In [61]:
#目标：把非TUT的课程按照Activity Counts分割

extend_df_not_tut = pd.DataFrame()

# 遍历activities数量大于一的course,保存分裂后的数据到extend_df

for index, row in df_template_not_tut.iterrows():
    #Department Activity Type Module Code  Activity Counts
    department = row['Department'] #不变
    activity_type = row['Activity Type'] #不变
    module_code = row['Module Code']
    activity_counts = row['Activity Counts']
    module_weight = row['Normalized Weight']
    duration = row['Duration']
    
# 根据Activity Counts的数值将Module Code分裂成若干门独立的课程
    
    for i in range(activity_counts):
        new_module_code = f"{module_code}_{i+1:02d}"  # 使用02d格式化为两位数
        new_row = {'Department': department, 'Activity Type': activity_type, 'Module Code': new_module_code, 'Activity Counts': 1, 'Normalized Weight': module_weight, 'Duration': duration}
        extend_df_not_tut = pd.concat([extend_df_not_tut, pd.DataFrame([new_row])], ignore_index=True)
    
extend_df_not_tut.to_excel("全部非TUT Activity.xlsx", index=False)
extend_df_not_tut.head()

Unnamed: 0,Department,Activity Type,Module Code,Activity Counts,Normalized Weight,Duration
0,SME,LEC,ACT2111_01,1,0.072617,90.0
1,SME,LEC,ACT3011_01,1,0.063809,90.0
2,SME,LEC,ACT3011_02,1,0.063809,90.0
3,SME,LEC,ACT3011_03,1,0.063809,90.0
4,SME,LEC,ACT3121_01,1,0.063809,90.0


In [62]:
# 合并df_template_not_tut 和 df_POS, 根据 'Module Code' 进行匹配；将ENG，ITE，CHI课程类别调整为Optional。

# 剔除POS中PED、CEC开头的课程
df_temp = pd.read_excel('POS-2420.xlsx', sheet_name = "POS-Confirmed") 
df_POS = df_temp[~df_temp['Module Code'].str.startswith(('PED', 'CEC', 'MED', 'MUS'))]
df_POS = df_POS[['Department', 'POS', 'Module Code', 'Type']]
# df_POS = merge_modules(df_POS, modules_to_merge, 'LNG1000')

"""# 先将POS中的GE课程聚合成GEBCD POS里没有GEBCE聚合个啥
prefixes = ['GEB', 'GEC', 'GED']

aggregated_rows = []

for prefix in prefixes:
    prefix_rows = df_POS[df_POS['Module Code'].str.startswith(prefix)]
    print(prefix_rows)
    if not prefix_rows.empty:
        aggregated_row = pd.DataFrame({
            'Department': 'HSS',
            'POS': [prefix],
            'Type': 'Mandatory',
            'Module Code': [prefix],
        })
        aggregated_rows.append(aggregated_row)

aggregated_df = pd.concat(aggregated_rows, ignore_index=True)

df_remaining = df_POS[~df_POS['Module Code'].str.startswith(tuple(prefixes))]

columns_to_keep = df_remaining.columns

df_POS = pd.concat([df_remaining, aggregated_df], ignore_index=True)

df_POS = df_POS.reindex(columns=columns_to_keep)

df_POS = df_POS.sort_values(by='Module Code')

df_POS = df_POS.reset_index(drop=True)

df_POS.to_excel("POS-Confirmed.xlsx", index=False)"""

# 按照 'Module Code' 合并df_template_not_tut以及df_POS
df_combined = pd.merge(
    df_POS, 
    df_template_not_tut[['Module Code', 'Activity Type', 'Activity Counts', 'Normalized Weight', 'Duration']], 
    on='Module Code', 
    how='outer',
)

df_combined.to_excel("temp.xlsx",index=False)

rows_with_nulls = df_combined[['Type', 'Department', 'POS']].isna().any(axis=1)
df_combined.loc[rows_with_nulls, 'Type'] = 'Mandatory'
df_combined.loc[rows_with_nulls, 'Department'] = 'SPE'
df_combined.loc[rows_with_nulls, 'POS'] = df_combined['Module Code']

rows_with_null_type = df_combined['Activity Type'].isna()
df_combined.loc[rows_with_null_type, 'Activity Type'] = 'Mixed'

rows_with_na = df_combined[df_combined.isna().any(axis=1)]
df_combined = df_combined.dropna()
df_filtered = df_combined[df_combined['Type'] != 'Optional']

df_filtered.to_excel("POS-Course总表.xlsx", index=False)
df_filtered.head()


Unnamed: 0,Department,POS,Module Code,Type,Activity Type,Activity Counts,Normalized Weight,Duration
6,SDS,FE-Year01,ACT2111,Mandatory,LEC,1.0,0.072617,90.0
11,SME,ACT_ADA-Year02,ACT3011,Mandatory,LEC,3.0,0.063809,90.0
14,SME,ACT_AFR-Year02,ACT3011,Mandatory,LEC,3.0,0.063809,90.0
19,SME,ACT_ADA-Year02,ACT3121,Mandatory,LEC,3.0,0.063809,90.0
22,SME,ACT_AFR-Year02,ACT3121,Mandatory,LEC,3.0,0.063809,90.0


In [63]:
students_df = pd.read_excel('分裂后的学生mandatory数据.xlsx')
pos_course_df = pd.read_excel('POS-Course总表.xlsx')

students_df.rename(columns={'Modules': 'Module Code'}, inplace=True)
merged_df = pd.merge(students_df, pos_course_df, on='Module Code', how='left')

print(merged_df.columns)

final_df = merged_df[['Department', 'POS_x', 'Module Code', 'Type', 'Activity Type', 'Activity Counts', 'Normalized Weight', 'Duration', 'Count']]
final_df.rename(columns={'POS_x': 'POS'}, inplace=True)
final_df.drop_duplicates(inplace=True)
final_df.dropna(inplace=True)

final_df.to_excel('POS-Course总表(SubPOS).xlsx', index=False)

Index(['POS_x', 'Module Code', 'Count', 'Department', 'POS_y', 'Type',
       'Activity Type', 'Activity Counts', 'Normalized Weight', 'Duration'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={'POS_x': 'POS'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)


In [64]:
# df_POS_course_mandatory = pd.concat([df_POS_course_mandatory_temp, df_POS_course_special_temp], ignore_index=True)
df_POS_course_mandatory = pd.read_excel('POS-Course总表(SubPOS).xlsx')

# none_optional_module_codes = df_POS_course_mandatory['Module Code'].unique()


df_POS_course_mandatory = df_POS_course_mandatory.dropna()
# df_POS_course_optional = df_POS_course_optional.dropna()

#目标：把非TUT的课程按照Activity Counts分割

def extend_dataframe(df_POS_course):
    # 初始化一个新的DataFrame来存储扩展后的数据
    extend_df_POS_course = pd.DataFrame(columns=df_POS_course.columns)
    
    empty_frame = False
    
    for Index, Row in df_POS_course.iterrows():
        Department = Row['Department']
        POS = Row['POS']
        Type = Row['Type']
        Module_code = Row['Module Code']
        Activity_type = Row['Activity Type']
        Activity_counts = Row['Activity Counts']
        Module_weight = Row['Normalized Weight']
        Duration = Row['Duration']
        
        # 根据Activity Counts的数值将Module Code分裂成若干门独立的课程
        if Activity_counts > 1:
            for I in range(Activity_counts):
                New_module_code = f"{Module_code}_{I+1:02d}"  # 使用02d格式化为两位数
                New_row = {
                    'Department': Department, 
                    'POS': POS, 
                    'Type': Type, 
                    'Module Code': New_module_code, 
                    'Activity Type': Activity_type,  
                    'Activity Counts': 1,
                    'Normalized Weight': Module_weight,
                    'Duration': Duration
                }
                """
                if not empty_frame:
                    extend_df_POS_course = pd.DataFrame([New_row])
                    empty_frame = True
                else:"""
                extend_df_POS_course = pd.concat([extend_df_POS_course, pd.DataFrame([New_row])], ignore_index=True)
        else:
            extend_df_POS_course = pd.concat([extend_df_POS_course, pd.DataFrame([Row])], ignore_index=True)
    
    return extend_df_POS_course

extend_df_POS_course_mandatory = extend_dataframe(df_POS_course_mandatory)
"""extend_df_POS_course_optional = extend_dataframe(df_POS_course_optional)
extend_df_POS_course_total = pd.concat([extend_df_POS_course_mandatory, extend_df_POS_course_optional], ignore_index=True)"""

extend_df_POS_course_mandatory.to_excel('POS-Activity总表.xlsx', index=False)


  extend_df_POS_course = pd.concat([extend_df_POS_course, pd.DataFrame([New_row])], ignore_index=True)


In [65]:
extend_df_POS_course_mandatory.head()

Unnamed: 0,Department,POS,Module Code,Type,Activity Type,Activity Counts,Normalized Weight,Duration,Count
0,SME,ACT_ADA-Year02_01,ACT3011_01,Mandatory,LEC,1,0.063809,90,
1,SME,ACT_ADA-Year02_01,ACT3011_02,Mandatory,LEC,1,0.063809,90,
2,SME,ACT_ADA-Year02_01,ACT3011_03,Mandatory,LEC,1,0.063809,90,
3,SME,ACT_ADA-Year02_01,ACT3121_01,Mandatory,LEC,1,0.063809,90,
4,SME,ACT_ADA-Year02_01,ACT3121_02,Mandatory,LEC,1,0.063809,90,


In [66]:
# extend_df_POS_course_optional.head()