In [40]:
import pandas as pd

# Module-code prefixes that are excluded from both inputs before any matching.
EXCLUDED_PREFIXES = ('PED', 'CEC', 'MED', 'MUS', 'ITE')

# Load the programme-of-study module list and the per-student module records.
df_POS = pd.read_excel("POS-2420_02.xlsx", sheet_name="POS-Confirmed")
df_student = pd.read_excel("Student&Mandatory Courses.xlsx", sheet_name="Sheet1")

# Drop student rows that have a missing value in any column, then remove the
# "InternationalStudent" marker rows and the modules with an excluded prefix.
df_student = df_student.dropna()
df_student = df_student[df_student["Modules"] != "InternationalStudent"]
# na=False: for a blank or non-string cell .str.startswith yields NaN, and a
# mask containing NaN cannot be negated / used for filtering. Such cells are
# treated as "prefix does not match", so the row is kept.
df_student = df_student[~df_student["Modules"].str.startswith(EXCLUDED_PREFIXES, na=False)]

# df_POS is not passed through dropna(), so blank "Module Code" cells can reach
# this filter; na=False keeps it from raising on them.
df_POS = df_POS[~df_POS["Module Code"].str.startswith(EXCLUDED_PREFIXES, na=False)]

# Keep only the modules whose Type is "Mandatory".
df_POS_Mandatory = df_POS[df_POS["Type"] == "Mandatory"]

# Programme -> list of its mandatory module codes, plus the list of programmes.
modules_by_pos = df_POS_Mandatory.groupby('POS')['Module Code'].apply(list).to_dict()
POS = list(modules_by_pos.keys())

# Flatten the mapping into (programme, module) pairs so every student row is
# checked with one set lookup instead of a row-wise DataFrame.apply that scans
# a list per row.
mandatory_pairs = {
    (programme, module)
    for programme, modules in modules_by_pos.items()
    for module in modules
}
is_mandatory = pd.Series(
    [
        pair in mandatory_pairs
        for pair in zip(df_student['Programmes of Study'], df_student['Modules'])
    ],
    index=df_student.index,
    dtype=bool,  # explicit dtype keeps this a valid boolean mask even when empty
)

# Student rows whose module is mandatory for that student's own programme.
df_student_filtered = df_student[is_mandatory]

# Intermediate dump of the filtered rows.
df_student_filtered.to_excel("test.xlsx", index=False)

def _join_unique(values):
    """Return the distinct values of a Series as one sorted, ', '-joined string."""
    return ', '.join(sorted(values.astype(str).unique()))


# One row per student (Name): the programme(s) and the mandatory modules found
# for that student, each collapsed into a sorted comma-separated string.
# Named aggregation is used instead of groupby.apply returning dicts, so the
# cell does not depend on the `include_groups` argument and does not need a
# dict -> DataFrame round trip.
df_modules_by_name = (
    df_student_filtered
    .groupby('Name')
    .agg(
        POS=('Programmes of Study', _join_unique),
        Modules=('Modules', _join_unique),
    )
    .reset_index()
)

# Same {Name: {'POS': ..., 'Modules': ...}} mapping the cell produced before,
# kept in case other cells read it.
modules_by_name = df_modules_by_name.set_index('Name').to_dict(orient='index')

# Save as an Excel file.
df_modules_by_name.to_excel("modules_by_name_with_POS.xlsx", index=False)

In [41]:
# Display the per-student summary frame (columns: Name, POS, Modules).
df_modules_by_name

Unnamed: 0,Name,POS,Modules
0,117010240,ENER_NEE-Year04,"ECE3001, ENE4007, ENE4009, STA2002"
1,117010271,BIFC-Year04,"BIM3009, BIM3011, BIM3019, BIO2004, CHM2004"
2,117020456,GBM-Year04,"MGT3780, MGT4030"
3,118010020,CSE_AI-Year04,"CSC4010, CSC4120, CSC4303, DDA4220"
4,118010318,ORG_PSY-Year04,"ENG2002H, HSS1004, PSY3030, PSY3110, PSY3140, ..."
...,...,...,...
6449,124090993,BENG_COM,"CSC1002, CSC1004, CSC1006, ENG1002, MAT1002, M..."
6450,124090994,BSSDS_COM,"CSC1002, CSC1004, DDA2001, ENG1002, MAT1002, M..."
6451,124090995,BENG_COM,"CSC1002, CSC1004, CSC1006, ENG1002, MAT1002, M..."
6452,124090996,BENG_COM,"CSC1002, CSC1004, CSC1006, ENG1002, MAT1002, M..."


In [42]:
# Only rows whose POS string is exactly one of the programmes in `POS` are
# aggregated -- the same rows the former per-programme loop selected.
in_scope = df_modules_by_name[df_modules_by_name["POS"].isin(POS)]

# Count the students that share each (POS, Modules) combination in one pass,
# instead of concatenating one small frame per programme inside a loop.
# Within a programme the combinations are ordered by descending Count; ties
# are broken by the Modules string so the numbering below is reproducible.
df_result = (
    in_scope
    .groupby(['POS', 'Modules'], as_index=False)
    .agg(Count=('Name', 'count'))
    .sort_values(by=['POS', 'Count', 'Modules'], ascending=[True, False, True])
    .reset_index(drop=True)
)

# sub-POS = "<POS>_<rank>", rank starting at 01 for the most common combination.
rank_in_pos = df_result.groupby('POS').cumcount() + 1
df_result['sub-POS'] = (
    df_result['POS'].astype(str) + '_' + rank_in_pos.astype(str).str.zfill(2)
)

print(df_result.columns)

# Fix the output column order.
# Rows are already ordered by POS and then by numeric rank; sorting on the
# sub-POS text alone would place e.g. "_100" before "_11".
df_result = df_result[['POS', 'Modules', 'Count', 'sub-POS']]

# Save the result as an Excel file.
df_result.to_excel("学生mandatory数据.xlsx", index=False)

Index(['POS', 'Modules', 'Count', 'sub-POS'], dtype='object')


In [43]:
# Load the aggregated sheet "学生mandatory数据.xlsx" from disk into `df`.
df = pd.read_excel("学生mandatory数据.xlsx")

# Language courses that are either collapsed into one code or removed.
language_courses = {"SPN1002", "FRN1002", "JAP1002", "KOR1002"}


def process_modules(modules):
    """Normalise the language courses inside one comma-separated module cell.

    Parameters
    ----------
    modules : str or missing value
        Module codes separated by commas. Whitespace around each code is
        ignored, so "A, B", "A,B" and "A ,  B" are read the same way.

    Returns
    -------
    str or missing value
        A missing input is returned unchanged. Otherwise the distinct codes,
        sorted and joined with ", ", where:
        * if all four codes in ``language_courses`` are present they are
          replaced by the single code "LNG1000";
        * if only some of them are present they are removed.
        The result is "" when no code remains.
    """
    if pd.isnull(modules):
        return modules  # empty cell: nothing to process

    # Split on the bare comma and strip each token: splitting on the exact
    # ", " leaves codes glued together (or padded) when spacing is irregular,
    # and such tokens would never match a language course code.
    courses = {code.strip() for code in modules.split(",")}
    courses.discard("")  # ignore blank tokens such as a trailing comma

    if language_courses <= courses:
        # All four language courses present: replace them with 'LNG1000'.
        courses = (courses - language_courses) | {"LNG1000"}
    else:
        # Only some (or none) present: drop whichever ones are there.
        courses = courses - language_courses

    # Back to a deterministic, sorted string.
    return ", ".join(sorted(courses))

# 应用函数处理 'Modules' 列
# Rewrite the 'Modules' column through process_modules, or report that the
# column is missing.
if 'Modules' not in df.columns:
    print("数据中不存在 'Modules' 列")
else:
    processed_modules = df['Modules'].apply(process_modules)
    df['Modules'] = processed_modules

In [44]:
def _count_courses(modules):
    """Return how many distinct, non-blank course codes a comma-separated cell holds.

    A missing cell counts as 0. Blank tokens are ignored, so an empty string
    also counts as 0 (``"".split(',')`` is ``['']``, which would otherwise be
    counted as one course).
    """
    if pd.isnull(modules):
        return 0
    return len({code.strip() for code in modules.split(',')} - {''})


if 'Modules' in df.columns:
    # Course codes in the 'Modules' column are separated by commas.
    df['course counts'] = df['Modules'].apply(_count_courses)
else:
    print("数据中不存在 'Modules' 列")

# Drop rows that have a missing value in any column.
df = df.dropna()

# Save the updated file.
df.to_excel('更新后的学生mandatory数据.xlsx', index=False)
print("已生成包含课程统计的文件 '更新后的学生mandatory数据.xlsx'")

已生成包含课程统计的文件 '更新后的学生mandatory数据.xlsx'


In [45]:
# NOTE(review): df_split is not read anywhere in this cell; kept only in case
# another cell uses it -- confirm and remove if not.
df_split = df.copy()

# One output record per (row, module): the row's POS / sub-POS / Count are
# repeated for every module code in its 'Modules' string.
# Columns are walked in parallel instead of DataFrame.iterrows(), and blank
# tokens are skipped so a row whose module string is empty does not produce a
# record with an empty module.
split_list = [
    {'POS': pos, 'sub_POS': sub_pos, 'Modules': module, 'Count': count}
    for pos, sub_pos, modules, count in zip(
        df['POS'], df['sub-POS'], df['Modules'], df['Count']
    )
    for module in modules.split(', ')
    if module.strip()
]

# Explicit columns keep the header row in the output even when there are no records.
result_df = pd.DataFrame(split_list, columns=['POS', 'sub_POS', 'Modules', 'Count'])
result_df.to_excel('分裂后的学生mandatory数据.xlsx', index=False)