In [2]:
##### Data Loading & Understanding ########
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')


#### Data Preparation #######
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer


## Data Loading and Preparation

In [76]:
DATASET_PATH_1 = '../dataset/data2_fad.xlsx'
DATASET_PATH_2 = '../dataset/data2_shafa.xlsx'
DATASET_PATH_3 = '../dataset/data_subcpmk_fadhil.xlsx'
DATASET_PATH_4 = '../dataset/data_subcpmk_shafa.xlsx'

In [77]:
df_1 = pd.read_excel(DATASET_PATH_1, skiprows=1)
df_2 = pd.read_excel(DATASET_PATH_2, skiprows=1)
df_3 = pd.read_excel(DATASET_PATH_3)
df_4 = pd.read_excel(DATASET_PATH_4)

df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

df.drop(columns=["Unnamed: 5", "Unnamed: 6", "Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10"], inplace=True)

df.head()

Unnamed: 0,nomor,level_cpmk,cpmk,level_subcpmk,subcpmk
0,5,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...
1,6,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C2,Menyelesaikan logic functions and gates
2,7,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C5,Mengevaluasi boolean algebra dan combinational...
3,40,C6,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk
4,41,C6,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk


In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1288 entries, 0 to 1287
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   nomor          1288 non-null   int64 
 1   level_cpmk     1285 non-null   object
 2   cpmk           1288 non-null   object
 3   level_subcpmk  1288 non-null   object
 4   subcpmk        1288 non-null   object
dtypes: int64(1), object(4)
memory usage: 50.4+ KB


In [79]:
df[df.isna().any(axis=1)]

df.dropna(inplace=True)

In [80]:
df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]] = df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]].apply(lambda x: x.astype(str).str.replace('`','', regex=False).str.strip())
df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]] = df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]].apply(lambda x: x.str.strip())
df[["cpmk", "level_cpmk", "subcpmk","level_subcpmk"]].head()


Unnamed: 0,cpmk,level_cpmk,subcpmk,level_subcpmk
0,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...,C6
1,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Menyelesaikan logic functions and gates,C2
2,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Mengevaluasi boolean algebra dan combinational...,C5
3,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk,C6
4,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk,C6


In [81]:
df["level_cpmk"].str[0]

0       C
1       C
2       C
3       C
4       C
       ..
1283    P
1284    C
1285    C
1286    A
1287    A
Name: level_cpmk, Length: 1285, dtype: object

In [82]:
def alignment_check(row):
    cpmk_domain = row["level_cpmk"][0]       
    cpmk_level  = int(row["level_cpmk"][1]) 

    sub_domain = row["level_subcpmk"][0]
    sub_level  = int(row["level_subcpmk"][1])

    if cpmk_domain != sub_domain:
        return False
    
    if cpmk_level < sub_level:
        return False

    return True

df["keselarasan"] = df.apply(alignment_check, axis=1)

df.head()

Unnamed: 0,nomor,level_cpmk,cpmk,level_subcpmk,subcpmk,keselarasan
0,5,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C6,Merencanakan aplikasi menggunakan prinsip dasa...,True
1,6,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C2,Menyelesaikan logic functions and gates,True
2,7,C6,"Mampu merencanakan, menyelesaikan, dan mengeva...",C5,Mengevaluasi boolean algebra dan combinational...,True
3,40,C6,Mampu merancang gardu induk,C6,Mampu merancang instalasi listrik gardu induk,True
4,41,C6,Mampu merancang gardu induk,C6,Mampu merancang sistem pengetanahan gardu induk,True


In [83]:
df["keselarasan"].value_counts()


keselarasan
True     876
False    409
Name: count, dtype: int64

In [84]:
vc_sub = df["level_subcpmk"].value_counts()
vc_cpmk = df["level_cpmk"].value_counts()

hasil = vc_sub.add(vc_cpmk, fill_value=0)
hasil


A2      3.0
A3      5.0
A4     78.0
A5     76.0
A6      1.0
C1     87.0
C2    489.0
C3    292.0
C4    487.0
C5    211.0
C6    465.0
P2     29.0
P3    224.0
P4    101.0
P5     22.0
Name: count, dtype: float64