<a href="https://colab.research.google.com/github/FadQode/skripsi-modelling/blob/main/dataset-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
##### Data Loading & Understanding ########
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')


# Data Loading and Preparation

## Data Loading

In [2]:
# Directory that holds the four input workbooks; change it here to retarget every path.
DATASET_DIR = '../../dataset'

# Workbooks that the loading cell reads and concatenates into a single frame.
DATASET_PATH_1 = f'{DATASET_DIR}/data2_fad.xlsx'
DATASET_PATH_2 = f'{DATASET_DIR}/data2_shafa.xlsx'
DATASET_PATH_3 = f'{DATASET_DIR}/data_subcpmk_fadhil.xlsx'
DATASET_PATH_4 = f'{DATASET_DIR}/data_subcpmk_shafa.xlsx'

In [3]:
# The first two workbooks are read with their first row skipped; the last two are read as-is.
df_1, df_2 = (
    pd.read_excel(workbook, skiprows=1)
    for workbook in (DATASET_PATH_1, DATASET_PATH_2)
)
df_3, df_4 = (
    pd.read_excel(workbook)
    for workbook in (DATASET_PATH_3, DATASET_PATH_4)
)

# Stack the four sheets under a fresh 0..n-1 index, then remove the columns
# named "Unnamed: 5" through "Unnamed: 10".
unnamed_columns = [f"Unnamed: {position}" for position in range(5, 11)]
df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True).drop(columns=unnamed_columns)

df.head()

Unnamed: 0,nomor,level_cpmk,cpmk,level_subcpmk,subcpmk
0,5,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...
1,6,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C2,Menyelesaikan logic functions and gates
2,7,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C5,Mengevaluasi boolean algebra dan combinational...
3,40,C6,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk
4,41,C6,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk


In [4]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1288 entries, 0 to 1287
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   nomor          1288 non-null   int64 
 1   level_cpmk     1288 non-null   object
 2   cpmk           1288 non-null   object
 3   level_subcpmk  1288 non-null   object
 4   subcpmk        1288 non-null   object
dtypes: int64(1), object(4)
memory usage: 50.4+ KB


## Cleaning Dataset

In [5]:
# NOTE(review): this expression is neither assigned nor the last one in the cell,
# so the rows containing missing values are computed but never displayed.
df[df.isna().any(axis=1)]

# Drop every row that has at least one missing value, modifying df in place.
df.dropna(inplace=True)

In [6]:
# Text columns that are normalised before any labelling is derived from them.
text_columns = ["cpmk", "level_cpmk", "subcpmk", "level_subcpmk"]

# Cast to str, remove backticks, then trim surrounding whitespace.
# A single pass is enough: stripping an already stripped string changes nothing.
df[text_columns] = df[text_columns].apply(
    lambda column: column.astype(str).str.replace('`', '', regex=False).str.strip()
)
df[text_columns].head()


Unnamed: 0,cpmk,level_cpmk,subcpmk,level_subcpmk
0,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...,C6
1,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Menyelesaikan logic functions and gates,C2
2,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Mengevaluasi boolean algebra dan combinational...,C5
3,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk,C6
4,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk,C6


In [7]:
# Number of distinct CPMK texts and distinct Sub-CPMK texts, shown as a tuple.
df["cpmk"].nunique(), df["subcpmk"].nunique()

(657, 1271)

In [8]:
# First character of every CPMK level code.
df["level_cpmk"].str[0]

0       C
1       C
2       C
3       C
4       C
       ..
1283    P
1284    C
1285    C
1286    A
1287    A
Name: level_cpmk, Length: 1288, dtype: object

In [9]:
# Rows whose CPMK level code starts with 'C'.
df.loc[df["level_cpmk"].str[0] == 'C']

Unnamed: 0,nomor,level_cpmk,cpmk,level_subcpmk,subcpmk
0,5,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...
1,6,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C2,Menyelesaikan logic functions and gates
2,7,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C5,Mengevaluasi boolean algebra dan combinational...
3,40,C6,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk
4,41,C6,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk
...,...,...,...,...,...
1265,5971,C3,Mampu melakukan pencarian data di database sec...,C3,Mampu melakukan pencarian data di database online
1274,6076,C2,Mahasiswa mampu memahami konsep dasar komputer...,C2,Mahasiswa mampu mendeskripsikan konsep dasar k...
1280,646,C4,"Mampu menganalisis masalah kontekstual PKn, me...",A5,Mampu menunjukan pelaksanaan pendidikan anti k...
1284,2063,C4,"Mampu menganalisis masalah kontekstual PKn, me...",A5,Mampu menunjukan pelaksanaan penegakan hukum &...


## Alignment Labeling

In [10]:
def alignment_check(row):
    """Tell whether a Sub-CPMK level is aligned with its parent CPMK level.

    Each level code is read as a one-character domain followed by a one-digit
    rank (for example "C6"). A pair counts as aligned when both codes share the
    same domain character and the CPMK rank is not lower than the Sub-CPMK rank.

    Args:
        row: Mapping-like object exposing "level_cpmk" and "level_subcpmk".

    Returns:
        True when the pair is aligned, False otherwise.
    """
    cpmk_code = row["level_cpmk"]
    cpmk_domain, cpmk_rank = cpmk_code[0], int(cpmk_code[1])

    sub_code = row["level_subcpmk"]
    sub_domain, sub_rank = sub_code[0], int(sub_code[1])

    # Both ranks are parsed before the comparison, so a malformed code raises
    # even when the domains already differ.
    return cpmk_domain == sub_domain and cpmk_rank >= sub_rank

# Run alignment_check on every row and store the boolean result in "keselarasan".
df["keselarasan"] = df.apply(alignment_check, axis=1)

df.head()

Unnamed: 0,nomor,level_cpmk,cpmk,level_subcpmk,subcpmk,keselarasan
0,5,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...,True
1,6,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C2,Menyelesaikan logic functions and gates,True
2,7,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C5,Mengevaluasi boolean algebra dan combinational...,True
3,40,C6,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk,True
4,41,C6,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk,True


In [11]:
# How many rows carry each value of the "keselarasan" label.
df["keselarasan"].value_counts()


keselarasan
True     878
False    410
Name: count, dtype: int64

In [12]:
# Frequency of each level code on the Sub-CPMK side and on the CPMK side.
vc_sub = df["level_subcpmk"].value_counts()
vc_cpmk = df["level_cpmk"].value_counts()

# Combined frequency per level code; a code missing on one side counts as 0 there.
hasil = vc_sub.add(vc_cpmk, fill_value=0)
hasil


A2      3.0
A3      5.0
A4     78.0
A5     77.0
C1     87.0
C2    491.0
C3    292.0
C4    488.0
C5    211.0
C6    466.0
P2     29.0
P3    224.0
P4    103.0
P5     22.0
Name: count, dtype: float64

## Modelling Dataset 1 - Learning Outcome Text & Bloom Label

In [13]:
# Pecah CPMK menjadi data terpisah
df_cpmk = df[['cpmk']].copy()
df_cpmk['Jenis'] = 'CPMK'
df_cpmk['Level'] = df['level_cpmk']

df_cpmk.rename(columns={'cpmk':'Learning_Outcome'}, inplace=True)

# Pecah Sub-CPMK
df_sub = df[['subcpmk']].copy()
df_sub['Jenis'] = 'Sub-CPMK'
df_sub['Level'] = df['level_subcpmk']

df_sub.rename(columns={'subcpmk':'Learning_Outcome',
                       'level_subcpmk':'level_cpmk'}, inplace=True)

# Gabungkan
df_learning_outcome = pd.concat([df_cpmk, df_sub], ignore_index=True)
df_learning_outcome




Unnamed: 0,Learning_Outcome,Jenis,Level
0,"Mampu merencanakan, menyelesaikan, dan mengeva...",CPMK,C6
1,"Mampu merencanakan, menyelesaikan, dan mengeva...",CPMK,C6
2,"Mampu merencanakan, menyelesaikan, dan mengeva...",CPMK,C6
3,Mampu merancang gardu induk,CPMK,C6
4,Mampu merancang gardu induk,CPMK,C6
...,...,...,...
2571,Mahasiswa dapat memahami dan mempraktekkan iba...,Sub-CPMK,P3
2572,Mampu menunjukan pelaksanaan penegakan hukum &...,Sub-CPMK,A5
2573,Mahasiswa mampu menunjukkan implementasi nilai...,Sub-CPMK,A5
2574,Mampu bertindak mengimplementasikan makna Sila...,Sub-CPMK,A5


In [14]:
# Write the learning-outcome frame to CSV without the index column.
df_learning_outcome.to_csv('../../dataset/learning_outcomes.csv', index=False)


In [15]:
# Number of distinct learning-outcome texts.
df_learning_outcome['Learning_Outcome'].nunique()

1905

In [16]:
# Number of learning-outcome rows per Bloom domain, where the domain is the
# first character of the level code. Printed in the order C, P, A.
domain_letters = df_learning_outcome["Level"].str[0]
for domain_letter in ("C", "P", "A"):
    # Summing the boolean mask counts the matching rows directly.
    print((domain_letters == domain_letter).sum())



2035
378
163


In [26]:
# Rows per Bloom domain (first character of the level code: C / P / A),
# forced into a fixed C, P, A order with 0 for any domain that is absent.
domain_order = ["C", "P", "A"]
domain_totals = df_learning_outcome["Level"].str[0].value_counts()
counts = domain_totals.reindex(domain_order, fill_value=0).reset_index()
counts.columns = ["Domain", "Jumlah"]

# Donut chart showing the share of each domain, labelled with value and name.
fig = px.pie(
    counts,
    names="Domain",
    values="Jumlah",
    title="Proporsi Domain Taksonomi Bloom pada Learning Outcome",
    hole=0.4,
)
fig.update_traces(textinfo="value+label")
fig.show()


In [18]:
# Per-column distinct-value counts for each Bloom domain, printed in the order C, P, A.
for domain_letter in ("C", "P", "A"):
    in_domain = df_learning_outcome["Level"].str[0] == domain_letter
    print(df_learning_outcome.loc[in_domain].nunique())

Learning_Outcome    1528
Jenis                  2
Level                  6
dtype: int64
Learning_Outcome    280
Jenis                 2
Level                 4
dtype: int64
Learning_Outcome    107
Jenis                 2
Level                 4
dtype: int64


In [29]:
# Distinct learning-outcome texts per Bloom domain.
# The count is taken from the "Learning_Outcome" column explicitly rather than
# from the maximum over all columns, which only matches while no other column
# has more distinct values than the text column.
domain_labels = {"C": "Kognitif", "P": "Psikomotor", "A": "Afektif"}
domain_letters = df_learning_outcome["Level"].str[0]

data = {
    "Domain": list(domain_labels.values()),
    "Jumlah Unik": [
        df_learning_outcome.loc[domain_letters == letter, "Learning_Outcome"].nunique()
        for letter in domain_labels
    ],
}

df_unique = pd.DataFrame(data)

# Donut chart of the distinct counts, with value and label drawn outside each slice.
fig = px.pie(
    df_unique,
    names="Domain",
    values="Jumlah Unik",
    title="Jumlah Nilai Unik Learning Outcome Berdasarkan Domain Taksonomi Bloom",
    hole=0.4
)

fig.update_traces(textposition="outside", textinfo="value+label")


fig.show()


In [30]:
def _take_up_to_two(group):
    """Draw at most two rows from one domain group, using a fixed seed."""
    return group.sample(n=min(2, len(group)), random_state=42)


# Tag every row with its domain letter, then sample within each domain.
labelled_by_domain = df_learning_outcome.assign(
    Domain=df_learning_outcome["Level"].str[0]
)
df_sample = labelled_by_domain.groupby("Domain", group_keys=False).apply(_take_up_to_two)

df_sample

Unnamed: 0,Learning_Outcome,Jenis,Level,Domain
2111,Menggali potensi diri untuk berwirausaha dan m...,Sub-CPMK,A5,A
1608,Mengintegrasikan nilai-nilai etika komunikasi ...,Sub-CPMK,A4,A
729,Mahasiswa mampu mengevaluasi transfer aset tid...,CPMK,C5,C
117,Mahasiswa mampu menelaah kompetensi dan tupoks...,CPMK,C4,C
2148,Mahasiswa mampu membuka dan menutup pelajaran ...,Sub-CPMK,P3,P
2376,Mampu mempraktekkan prinsip makanan oral (bent...,Sub-CPMK,P3,P


In [19]:
# Per-column distinct-value counts for each level code A1 through A5.
for level_code in ("A1", "A2", "A3", "A4", "A5"):
    at_level = df_learning_outcome["Level"] == level_code
    print(df_learning_outcome.loc[at_level].nunique())



Learning_Outcome    0
Jenis               0
Level               0
dtype: int64
Learning_Outcome    3
Jenis               1
Level               1
dtype: int64
Learning_Outcome    5
Jenis               1
Level               1
dtype: int64
Learning_Outcome    55
Jenis                2
Level                1
dtype: int64
Learning_Outcome    46
Jenis                2
Level                1
dtype: int64


In [20]:
# Per-column distinct-value counts for each level code P1 through P5.
for level_code in ("P1", "P2", "P3", "P4", "P5"):
    at_level = df_learning_outcome["Level"] == level_code
    print(df_learning_outcome.loc[at_level].nunique())

Learning_Outcome    0
Jenis               0
Level               0
dtype: int64
Learning_Outcome    20
Jenis                2
Level                1
dtype: int64
Learning_Outcome    169
Jenis                 2
Level                 1
dtype: int64
Learning_Outcome    74
Jenis                2
Level                1
dtype: int64
Learning_Outcome    17
Jenis                2
Level                1
dtype: int64


In [51]:
# Frequency of each level code as a two-column frame for plotting.
# rename_axis + reset_index(name=...) names both columns explicitly instead of
# relying on whatever labels value_counts() produces.
level_counts = (
    df_learning_outcome["Level"]
    .value_counts()
    .rename_axis("Level")
    .reset_index(name="Jumlah")
)

# Horizontal bar chart: one bar per level code, annotated with its count.
fig = px.bar(
    level_counts,
    x="Jumlah",
    y="Level",
    title="Distribusi Learning Outcome berdasarkan Level Taksonomi Bloom",
    text="Jumlah"
)

fig.update_traces(textposition="outside",  texttemplate="%{x}")
fig.show()


## Modelling Dataset 2 - Learning Outcome / Sub-Learning Outcome Pairs & Alignment Label

In [22]:
# Independent copy of the pair columns plus the alignment label.
pair_columns = ['cpmk', 'level_cpmk', 'subcpmk', 'level_subcpmk', 'keselarasan']
df_pair = df.loc[:, pair_columns].copy()
df_pair

Unnamed: 0,cpmk,level_cpmk,subcpmk,level_subcpmk,keselarasan
0,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...,C6,True
1,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Menyelesaikan logic functions and gates,C2,True
2,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Mengevaluasi boolean algebra dan combinational...,C5,True
3,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk,C6,True
4,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk,C6,True
...,...,...,...,...,...
1283,"Mahasiswa mampu mempraktekkan haji, umroh dan ...",P3,Mahasiswa dapat memahami dan mempraktekkan iba...,P3,True
1284,"Mampu menganalisis masalah kontekstual PKn, me...",C4,Mampu menunjukan pelaksanaan penegakan hukum &...,A5,False
1285,Mahasiswa mampu menafsirkan dan menerapkan nil...,C3,Mahasiswa mampu menunjukkan implementasi nilai...,A5,False
1286,"Mampu menunjukan nilai ketuhanan, nilai kemanu...",A5,Mampu bertindak mengimplementasikan makna Sila...,A5,True


In [23]:
# Write the pair frame to CSV without the index column.
df_pair.to_csv('../../dataset/cpmk_subcpmk_pairs.csv', index=False)


In [34]:
# Replace the boolean alignment flag with a readable label.
# Values that are not in the mapping (for example labels written by an earlier
# run of this cell) pass through unchanged, so re-running the cell does not
# turn the column into NaN.
alignment_labels = {True: "Selaras", False: "Tidak Selaras"}
df_pair["keselarasan"] = df_pair["keselarasan"].map(
    lambda value: alignment_labels.get(value, value)
)

# Draw up to four whole rows with a fixed seed. Sampling row-wise keeps each
# pair intact by construction instead of relying on every column being sampled
# with the same seed.
df_pair_sample = df_pair.sample(n=min(4, len(df_pair)), random_state=42)

df_pair_sample

Unnamed: 0,cpmk,level_cpmk,subcpmk,level_subcpmk,keselarasan
1120,Mahasiswa mampu mempraktikan dan analisis terh...,C4,Mahasiswa mampu mengidentifikasi profil tanah...,C1,Selaras
1185,Mahasiswa mampu mempraktikkan proses rekrutmen...,P3,Mahasiswa melakukan evaluasi terhadap proses s...,C5,Tidak Selaras
462,Mampu menyajikan serta mengolah data secara il...,C3,Mampu mengolah data secara ilmiah,C3,Selaras
81,Mahasiswa mampu melakukan analisis gender dan ...,C4,Mahasiswa mampu menjelaskan langkah analisis p...,C2,Selaras
