<a href="https://colab.research.google.com/github/FadQode/skripsi-modelling/blob/main/dataset-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
##### Data Loading & Understanding ########
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')


#### Data Preparation #######
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer


# Data Loading and Preparation

## Data Loading

In [22]:
DATASET_PATH_1 = '../../dataset/data2_fad.xlsx'
DATASET_PATH_2 = '../../dataset/data2_shafa.xlsx'
DATASET_PATH_3 = '../../dataset/data_subcpmk_fadhil.xlsx'
DATASET_PATH_4 = '../../dataset/data_subcpmk_shafa.xlsx'

In [23]:
df_1 = pd.read_excel(DATASET_PATH_1, skiprows=1)
df_2 = pd.read_excel(DATASET_PATH_2, skiprows=1)
df_3 = pd.read_excel(DATASET_PATH_3)
df_4 = pd.read_excel(DATASET_PATH_4)

df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

df.drop(columns=["Unnamed: 5", "Unnamed: 6", "Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10"], inplace=True)

df.head()

Unnamed: 0,nomor,level_cpmk,cpmk,level_subcpmk,subcpmk
0,5,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...
1,6,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C2,Menyelesaikan logic functions and gates
2,7,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C5,Mengevaluasi boolean algebra dan combinational...
3,40,C6,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk
4,41,C6,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1288 entries, 0 to 1287
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   nomor          1288 non-null   int64 
 1   level_cpmk     1288 non-null   object
 2   cpmk           1288 non-null   object
 3   level_subcpmk  1288 non-null   object
 4   subcpmk        1288 non-null   object
dtypes: int64(1), object(4)
memory usage: 50.4+ KB


## Cleaning Dataset

In [25]:
df[df.isna().any(axis=1)]

df.dropna(inplace=True)

In [26]:
df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]] = df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]].apply(lambda x: x.astype(str).str.replace('`','', regex=False).str.strip())
df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]] = df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]].apply(lambda x: x.str.strip())
df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]].head()


Unnamed: 0,cpmk,level_cpmk,subcpmk,level_subcpmk
0,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...,C6
1,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Menyelesaikan logic functions and gates,C2
2,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Mengevaluasi boolean algebra dan combinational...,C5
3,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk,C6
4,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk,C6


In [27]:
df["level_cpmk"].str[0]

0       C
1       C
2       C
3       C
4       C
       ..
1283    P
1284    C
1285    C
1286    A
1287    A
Name: level_cpmk, Length: 1288, dtype: object

## Alignment Labeling

In [28]:
def alignment_check(row):
    cpmk_domain = row["level_cpmk"][0]
    cpmk_level  = int(row["level_cpmk"][1])

    sub_domain = row["level_subcpmk"][0]
    sub_level  = int(row["level_subcpmk"][1])

    if cpmk_domain != sub_domain:
        return False

    if cpmk_level < sub_level:
        return False

    return True

df["keselarasan"] = df.apply(alignment_check, axis=1)

df.head()

Unnamed: 0,nomor,level_cpmk,cpmk,level_subcpmk,subcpmk,keselarasan
0,5,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...,True
1,6,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C2,Menyelesaikan logic functions and gates,True
2,7,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C5,Mengevaluasi boolean algebra dan combinational...,True
3,40,C6,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk,True
4,41,C6,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk,True


In [29]:
df["keselarasan"].value_counts()


keselarasan
True     878
False    410
Name: count, dtype: int64

In [30]:
vc_sub = df["level_subcpmk"].value_counts()
vc_cpmk = df["level_cpmk"].value_counts()

hasil = vc_sub.add(vc_cpmk, fill_value=0)
hasil


A2      3.0
A3      5.0
A4     78.0
A5     77.0
C1     87.0
C2    491.0
C3    292.0
C4    488.0
C5    211.0
C6    466.0
P2     29.0
P3    224.0
P4    103.0
P5     22.0
Name: count, dtype: float64

## Modelling Dataset 1 - Learning Outcome Text & Bloom Label

In [31]:
# Pecah CPMK menjadi data terpisah
df_cpmk = df[['cpmk']].copy()
df_cpmk['Jenis'] = 'CPMK'
df_cpmk['Level'] = df['level_cpmk']

df_cpmk.rename(columns={'cpmk':'Learning_Outcome'}, inplace=True)

# Pecah Sub-CPMK
df_sub = df[['subcpmk']].copy()
df_sub['Jenis'] = 'Sub-CPMK'
df_sub['Level'] = df['level_subcpmk']

df_sub.rename(columns={'subcpmk':'Learning_Outcome',
                       'level_subcpmk':'level_cpmk'}, inplace=True)

# Gabungkan
df_learning_outcome = pd.concat([df_cpmk, df_sub], ignore_index=True)
df_learning_outcome




Unnamed: 0,Learning_Outcome,Jenis,Level
0,"Mampu merencanakan, menyelesaikan, dan mengeva...",CPMK,C6
1,"Mampu merencanakan, menyelesaikan, dan mengeva...",CPMK,C6
2,"Mampu merencanakan, menyelesaikan, dan mengeva...",CPMK,C6
3,Mampu merancang gardu induk,CPMK,C6
4,Mampu merancang gardu induk,CPMK,C6
...,...,...,...
2571,Mahasiswa dapat memahami dan mempraktekkan iba...,Sub-CPMK,P3
2572,Mampu menunjukan pelaksanaan penegakan hukum &...,Sub-CPMK,A5
2573,Mahasiswa mampu menunjukkan implementasi nilai...,Sub-CPMK,A5
2574,Mampu bertindak mengimplementasikan makna Sila...,Sub-CPMK,A5


In [32]:
df_learning_outcome.to_csv('../../dataset/learning_outcomes.csv', index=False)


In [33]:
df_learning_outcome["Level"].value_counts()

Level
C2    491
C4    488
C6    466
C3    292
P3    224
C5    211
P4    103
C1     87
A4     78
A5     77
P2     29
P5     22
A3      5
A2      3
Name: count, dtype: int64

## Modelling Dataset 2 - Learning - Sub-Learning  & Alignment Label