# **Sistem Rekomendasi Pencarian Magang Merdeka Content Based Filtering**

## **Data Scraping**

In [1]:
import requests
import pandas as pd

In [None]:
url_opportunities = "https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/opportunities"
url_detail = "https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/opportunities/{}"
url_mitra = "https://api.kampusmerdeka.kemdikbud.go.id/mitra/public/id/{}"
limit = 100
offset = 0

all_data = pd.DataFrame()

while True:
    params = {
        "offset": offset,
        "limit": limit,
        "location_key": "",
        "mitra_key": "",
        "keyword": "",
        "sector_id": "",
        "sort_by": "",
        "order": "desc"
    }

    response = requests.get(url_opportunities, params=params)

    offset += limit
    data = response.json()["data"]
    if len(data) == 0:
        break

    # Mengupulkan detail data setiap id entry
    for entry_id in data:
        detail_url = url_detail.format(entry_id["id"])
        mitra_url = url_mitra.format(entry_id["mitra_id"])

        detail_response = requests.get(detail_url)
        mitra_url_response = requests.get(mitra_url)

        mitra_data = pd.json_normalize(mitra_url_response.json())
        detail_data = pd.json_normalize(detail_response.json())

        # Menghilangkan prefix "data." dan "benefits."
        mitra_data.columns = [col.replace("data.", "") for col in mitra_data.columns]
        detail_data.columns = [col.replace("data.", "").replace("benefits.", "") for col in detail_data.columns]

        detail_data['mitra_name'] = mitra_data['name']

        all_data = pd.concat([all_data, detail_data], ignore_index=True)

    print(f"Data collected: {len(all_data)}")

print(f"Total {len(all_data)} data collected")

In [None]:
all_data

In [None]:
all_data.to_csv('magang_opportunities.csv', index=False)

## **Data PreProcessing**

### **Data Reviewing**

In [2]:
import pandas as pd

In [3]:
magang_opportunities = pd.read_csv('magang_opportunities.csv')

magang_opportunities.head()

Unnamed: 0,meta,id,name,description,mitra_id,start_period,months_duration,activity_type,location,fields_of_study,...,show_salary,mobilization,accommodation,is_applied,wishlist,is_external,external_platform_name,external_platform_logo_url,mitra_name,salary
0,,b794e1a1-ecd6-449d-9ed9-0118a2e02626,Website Developer,<p><strong>Kualifikasi :</strong>&nbsp;</p><ul...,c7746ff1-87de-47bb-a43f-6d63f521e07c,2024-03-01T00:00:00+07:00,3,WFH,Kota Surabaya,[],...,False,False,False,False,False,False,,,CV. APPAREL BERKAH SELALU,
1,,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,Software Architect Intern,<p>To help architect team on demonstration of ...,af00bd20-3ac2-44a4-877f-af4307f9b599,2024-03-01T00:00:00+07:00,6,WFO,Kota Jakarta Selatan,['Teknik Informatika dan Ilmu Komputer'],...,False,False,False,False,False,False,,,PT. Idemia Technologies Indonesia,
2,,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,Product Marketing Intern,<p>- Design and develop sharepoint page to sup...,af00bd20-3ac2-44a4-877f-af4307f9b599,2024-03-01T00:00:00+07:00,6,WFO,Kota Jakarta Selatan,['Teknik Informatika dan Ilmu Komputer'],...,False,False,False,False,False,False,,,PT. Idemia Technologies Indonesia,
3,,fc01895c-0d74-4874-a8f2-d1d46956d697,Asisten Finance,<ol><li>Pemahaman Sistem Keuangan Sekolah: And...,0bf5a4d1-9980-48ae-ad65-48e3b239f15c,2024-03-01T00:00:00+07:00,4,WFO,Kota Banda Aceh,[],...,False,False,False,False,False,False,,,SMAN 14 Banda Aceh,
4,,b6062e51-4764-474c-a2b7-617affd56959,Marketing Asuransi Mini Mikro,<p>WFH dari Seluruh Indonesia.</p><p><br></p><...,6bbc729e-c3ee-46a1-93be-2b9f3b5f478d,2024-03-01T00:00:00+07:00,4,WFH,Kota Banda Aceh,[],...,False,False,False,False,False,False,,,Natural Aceh,


In [4]:
data = pd.DataFrame({
  'id': list(magang_opportunities['id'].values),
  'name': list(magang_opportunities['name'].values),
  'description': list(magang_opportunities['description'].values),
  'skills': list(magang_opportunities['skills'].values)
})

data

Unnamed: 0,id,name,description,skills
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,Website Developer,<p><strong>Kualifikasi :</strong>&nbsp;</p><ul...,"['Komunikasi', 'Kreatif', 'Pemahaman Industri'..."
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,Software Architect Intern,<p>To help architect team on demonstration of ...,"['Komunikasi', 'Kreatif', 'Desain Grafis', 'Co..."
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,Product Marketing Intern,<p>- Design and develop sharepoint page to sup...,"['Kreatif', 'Analisis Data', 'Pemahaman Lingku..."
3,fc01895c-0d74-4874-a8f2-d1d46956d697,Asisten Finance,<ol><li>Pemahaman Sistem Keuangan Sekolah: And...,"['Administrasi', 'Operasional', 'Komunikasi', ..."
4,b6062e51-4764-474c-a2b7-617affd56959,Marketing Asuransi Mini Mikro,<p>WFH dari Seluruh Indonesia.</p><p><br></p><...,"['Operasional', 'Kreatif', 'Administrasi', 'Ko..."
...,...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,Marketing Creative Intern,Responsibilities \n\nSupport Marketing and Bra...,
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,Graphic Design (Intern),Kami Idea merupakan sebuah usaha atau bisnis y...,
679,5a704786-6865-40c9-9086-a2bd85dc5098,Influencer Management Intern,- Last semester student from reputable univers...,
680,780902fd-e51a-463c-a012-e7a0941da47e,New Business Development Internship (Jakarta U...,- Final year student or fresh graduate \n- Pas...,


### **Combine Text**

In [5]:
import ast

In [6]:
# Convert the "skills" column from string to list for non-null values
data['skills'] = data['skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)

# Join the skills into a single string
data['skills'] = data['skills'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Combine "name," "description," and "skills" into a new column
data['combined_text'] = data['name'] + ' ' + data['description'] + ' ' + data['skills'].astype(str)

data['combined_text']

0      Website Developer <p><strong>Kualifikasi :</st...
1      Software Architect Intern <p>To help architect...
2      Product Marketing Intern <p>- Design and devel...
3      Asisten Finance <ol><li>Pemahaman Sistem Keuan...
4      Marketing Asuransi Mini Mikro  <p>WFH dari Sel...
                             ...                        
677    Marketing Creative Intern Responsibilities \n\...
678    Graphic Design (Intern) Kami Idea merupakan se...
679    Influencer Management Intern - Last semester s...
680    New Business Development Internship (Jakarta U...
681    Finance & Accounting Intern Kualifikasi :\n- M...
Name: combined_text, Length: 682, dtype: object

### **Case Folding**

In [8]:
data['result_case_folding_data'] = data['combined_text'].apply(lambda x: x.casefold())
data[['id', 'combined_text','result_case_folding_data']]

Unnamed: 0,id,combined_text,result_case_folding_data
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,Website Developer <p><strong>Kualifikasi :</st...,website developer <p><strong>kualifikasi :</st...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,Software Architect Intern <p>To help architect...,software architect intern <p>to help architect...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,Product Marketing Intern <p>- Design and devel...,product marketing intern <p>- design and devel...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,Asisten Finance <ol><li>Pemahaman Sistem Keuan...,asisten finance <ol><li>pemahaman sistem keuan...
4,b6062e51-4764-474c-a2b7-617affd56959,Marketing Asuransi Mini Mikro <p>WFH dari Sel...,marketing asuransi mini mikro <p>wfh dari sel...
...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,Marketing Creative Intern Responsibilities \n\...,marketing creative intern responsibilities \n\...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,Graphic Design (Intern) Kami Idea merupakan se...,graphic design (intern) kami idea merupakan se...
679,5a704786-6865-40c9-9086-a2bd85dc5098,Influencer Management Intern - Last semester s...,influencer management intern - last semester s...
680,780902fd-e51a-463c-a012-e7a0941da47e,New Business Development Internship (Jakarta U...,new business development internship (jakarta u...


### **Remove Html Tags**

In [9]:
from lxml import etree

In [10]:
def remove_html_tags(text):
    parser = etree.HTMLParser()
    tree = etree.fromstring(text, parser)
    return etree.tostring(tree, encoding='unicode', method='text')

data['result_remove_html_tags'] = data['result_case_folding_data'].apply(remove_html_tags)
data[['id', 'result_case_folding_data' ,'result_remove_html_tags']]

Unnamed: 0,id,result_case_folding_data,result_remove_html_tags
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,website developer <p><strong>kualifikasi :</st...,website developer kualifikasi : wajib memiliki...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,software architect intern <p>to help architect...,software architect intern to help architect te...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,product marketing intern <p>- design and devel...,product marketing intern - design and develop ...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,asisten finance <ol><li>pemahaman sistem keuan...,asisten finance pemahaman sistem keuangan seko...
4,b6062e51-4764-474c-a2b7-617affd56959,marketing asuransi mini mikro <p>wfh dari sel...,marketing asuransi mini mikro wfh dari seluru...
...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,marketing creative intern responsibilities \n\...,marketing creative intern responsibilities \n\...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,graphic design (intern) kami idea merupakan se...,graphic design (intern) kami idea merupakan se...
679,5a704786-6865-40c9-9086-a2bd85dc5098,influencer management intern - last semester s...,influencer management intern - last semester s...
680,780902fd-e51a-463c-a012-e7a0941da47e,new business development internship (jakarta u...,new business development internship (jakarta u...


### **Remove Unicode Character**

In [11]:
import re

In [13]:
data['result_remove_unicode_character'] = data['result_remove_html_tags'].apply(lambda x: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))
data[['id', 'result_remove_html_tags', 'result_remove_unicode_character']]

Unnamed: 0,id,result_remove_html_tags,result_remove_unicode_character
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,website developer kualifikasi : wajib memiliki...,website developer kualifikasi wajib memiliki l...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,software architect intern to help architect te...,software architect intern to help architect te...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,product marketing intern - design and develop ...,product marketing intern design and develop s...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,asisten finance pemahaman sistem keuangan seko...,asisten finance pemahaman sistem keuangan seko...
4,b6062e51-4764-474c-a2b7-617affd56959,marketing asuransi mini mikro wfh dari seluru...,marketing asuransi mini mikro wfh dari seluru...
...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,marketing creative intern responsibilities \n\...,marketing creative intern responsibilities sup...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,graphic design (intern) kami idea merupakan se...,graphic design intern kami idea merupakan sebu...
679,5a704786-6865-40c9-9086-a2bd85dc5098,influencer management intern - last semester s...,influencer management intern last semester st...
680,780902fd-e51a-463c-a012-e7a0941da47e,new business development internship (jakarta u...,new business development internship jakarta ut...


### **Tokenization**

In [14]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aszay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
data['result_tokenization'] = data['result_remove_unicode_character'].apply(lambda x: word_tokenize(x))
data[['id', 'result_remove_unicode_character', 'result_tokenization']]

Unnamed: 0,id,result_remove_unicode_character,result_tokenization
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,website developer kualifikasi wajib memiliki l...,"[website, developer, kualifikasi, wajib, memil..."
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,software architect intern to help architect te...,"[software, architect, intern, to, help, archit..."
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,product marketing intern design and develop s...,"[product, marketing, intern, design, and, deve..."
3,fc01895c-0d74-4874-a8f2-d1d46956d697,asisten finance pemahaman sistem keuangan seko...,"[asisten, finance, pemahaman, sistem, keuangan..."
4,b6062e51-4764-474c-a2b7-617affd56959,marketing asuransi mini mikro wfh dari seluru...,"[marketing, asuransi, mini, mikro, wfh, dari, ..."
...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,marketing creative intern responsibilities sup...,"[marketing, creative, intern, responsibilities..."
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,graphic design intern kami idea merupakan sebu...,"[graphic, design, intern, kami, idea, merupaka..."
679,5a704786-6865-40c9-9086-a2bd85dc5098,influencer management intern last semester st...,"[influencer, management, intern, last, semeste..."
680,780902fd-e51a-463c-a012-e7a0941da47e,new business development internship jakarta ut...,"[new, business, development, internship, jakar..."


### **Remove Stopwords**

In [16]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aszay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
indo_stopwords = stopwords.words('indonesian')
eng_stopwords = stopwords.words('english')

list_stopwords = indo_stopwords + eng_stopwords

In [18]:
def remove_stopwords(text):
    filtered_words = [word for word in text if word not in list_stopwords]
    return filtered_words

# Apply remove_stopwords to the 'result_tokenization' column
data['result_remove_stopwords'] = data['result_tokenization'].apply(remove_stopwords)

# Optionally, join the list of words back into a string if needed
data['result_remove_stopwords'] = data['result_remove_stopwords'].apply(lambda x: ' '.join(x))

data[['id', 'result_tokenization', 'result_remove_stopwords']]

Unnamed: 0,id,result_tokenization,result_remove_stopwords
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,"[website, developer, kualifikasi, wajib, memil...",website developer kualifikasi wajib memiliki l...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,"[software, architect, intern, to, help, archit...",software architect intern help architect team ...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,"[product, marketing, intern, design, and, deve...",product marketing intern design develop sharep...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,"[asisten, finance, pemahaman, sistem, keuangan...",asisten finance pemahaman sistem keuangan seko...
4,b6062e51-4764-474c-a2b7-617affd56959,"[marketing, asuransi, mini, mikro, wfh, dari, ...",marketing asuransi mini mikro wfh indonesiapen...
...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,"[marketing, creative, intern, responsibilities...",marketing creative intern responsibilities sup...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,"[graphic, design, intern, kami, idea, merupaka...",graphic design intern idea usaha bisnis berger...
679,5a704786-6865-40c9-9086-a2bd85dc5098,"[influencer, management, intern, last, semeste...",influencer management intern last semester stu...
680,780902fd-e51a-463c-a012-e7a0941da47e,"[new, business, development, internship, jakar...",new business development internship jakarta ut...


### **Stemming**

In [21]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [22]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_word(text):
    return stemmer.stem(text)

data['result_stemming'] = data['result_remove_stopwords'].apply(stem_word)

cleaned_data = data[['id', 'result_stemming']].rename(columns={'result_stemming': 'text'})

cleaned_data

Unnamed: 0,id,text
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,website developer kualifikasi wajib milik lapt...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,software architect intern help architect team ...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,product marketing intern design develop sharep...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,asisten finance paham sistem uang sekolah ajak...
4,b6062e51-4764-474c-a2b7-617affd56959,marketing asuransi mini mikro wfh indonesiapen...
...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,marketing creative intern responsibilities sup...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,graphic design intern idea usaha bisnis gerak ...
679,5a704786-6865-40c9-9086-a2bd85dc5098,influencer management intern last semester stu...
680,780902fd-e51a-463c-a012-e7a0941da47e,new business development internship jakarta ut...


## **Data Weighing**

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
tfidf = TfidfVectorizer(stop_words=list_stopwords)

tfidf_matrix = tfidf.fit_transform(cleaned_data['text'])

tfidf_matrix.shape

tfidf_matrix



<682x6316 sparse matrix of type '<class 'numpy.float64'>'
	with 37150 stored elements in Compressed Sparse Row format>

## **Data Modelling**

### **Setup Model**

In [25]:
from sklearn.metrics.pairwise import linear_kernel

In [30]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

indices = pd.Series(cleaned_data.index, index=cleaned_data['text']).drop_duplicates()

### **Setup Function**

In [61]:
def recommend_magang(search_value, cosine_sim=cosine_sim):
    recommendation = pd.DataFrame(columns=['id', 'name', 'score'])

    # Check if the search_value is in the indices
    if search_value not in indices:
        print(f"'{search_value}' not found in the indices.")
        return recommendation

    # Get the index of the search_value
    idx = indices[search_value]

    # Get the pairwise similarity scores of all items with the search_value
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the items based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar items
    sim_scores = sim_scores[1:11]

    # Get the item indices and names
    magang_indices = [i[0] for i in sim_scores]
    magang_names = magang_opportunities.loc[magang_indices, 'name']

    # Create the recommendation DataFrame
    recommendation['id'] = magang_opportunities.loc[magang_indices, 'id'].tolist()
    recommendation['name'] = magang_names.tolist()
    recommendation['score'] = [score[1] for score in sim_scores]

    return recommendation

## **Evaluation**

In [72]:
# Assuming your recommendation function is already defined
result = recommend_magang('website developer kualifikasi')

if not result.empty:
    print(result)
else:
    print("No recommendations found.")

                                     id  \
0  fe263d29-532a-4860-9d91-6ec91e304b50   
1  fccec0c2-85e4-4d27-ac3b-01b455dfcbfe   
2  fc000acf-2901-495e-bcc7-427eef1a39ca   
3  83681309-8513-44ba-8f0d-15fa0ccef9a9   
4  a8e3a588-a4a9-4995-bb74-9023b17efb3f   
5  aedf0eb6-a060-4984-8d14-8da0862681cb   
6  ee681cbf-582a-4dd8-88e3-41079bc5d150   
7  7a963c8f-9a6c-4d23-accd-d245bc6b1493   
8  f54f86ef-fa76-44a8-bc20-3415772513ca   
9  1fe9c603-c8a2-4d8e-8362-846e8a2ce82f   

                                      name     score  
0                      Manajemen Kemitraan  0.220185  
1  Internship Content Creator & Copywriter  0.177466  
2                          Content Creator  0.175047  
3                         Content Creator   0.169300  
4                          Web Specialist   0.132920  
5                   Content Maker (Intern)  0.122996  
6          Social Media Manager Internship  0.117652  
7              Content Planner/ Strategist  0.117409  
8                Jr Content Cre

In [65]:
print(indices.keys())

Index(['website developer kualifikasi wajib milik laptop sendirimampu website landing page sesuai butuh usaha optimasi seomasa magang minimum 3 jam kerja normalharus serta portofoliobenefit sertifikat magangkonversi nilaisistem wfh komunikasi kreatif paham industri manajemen proyek kreativitas kerjasama tim ahli teknologi pasar produk',
       'software architect intern help architect team demonstration improvement idea work focused determining whether idea turned realitythe improvement might include test optimization test automation framework usage eg test framework design komunikasi kreatif desain grafis coding',
       'product marketing intern design develop sharepoint page support business operation collaborate team get data sharepoint page support documentation products willing learn new things kreatif analisis data paham lingkung bisnis komunikasi',
       'asisten finance paham sistem uang sekolah ajak paham sistem uang sekolah proses bayar lapor uang kelola anggaranpencatatan 