# **Sistem Rekomendasi Pencarian Magang Merdeka Content Based Filtering**

## **Data Scraping**

In [30]:
import requests
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time

In [31]:
url_opportunities = "https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/opportunities"
url_detail = "https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/position/{}"
limit = 100
offset = 0

all_data = pd.DataFrame()

# Create a session and configure retries
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

# This function is used to fetch data from the API with retries
def fetch_with_retries(url, params=None):
    for attempt in range(3):
        try:
            response = session.get(url, params=params, timeout=10)
            response.raise_for_status()
            return response
        except (requests.exceptions.RequestException, ConnectionError) as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            time.sleep(2 ** attempt)  # Exponential backoff
    raise Exception(f"Failed to fetch data from {url} after multiple retries")

while True:
    params = {
        "offset": offset,
        "limit": limit,
        "location_key": "",
        "mitra_key": "",
        "keyword": "",
        "sector_id": "",
        "sort_by": "",
        "order": "desc"
    }

    response = fetch_with_retries(url_opportunities, params=params)

    offset += limit
    data = response.json()["data"]
    if len(data) == 0:
        break

    # Filtering data based on opportunity type
    for entry in data:
        if entry.get("opportunity_type") != "MSIB":
            continue

        detail_url = url_detail.format(entry["id"])

        # Try to fetch the detail data for the current entry
        try:
            detail_response = fetch_with_retries(detail_url)
        except Exception as e:
            print(f"URL failed for ID {entry['id']} with error: {e}")
            continue  # Skip to the next entry if the detail URL fails

        # Converting data to DataFrame
        entry_data = pd.json_normalize(entry)
        detail_data = pd.json_normalize(detail_response.json())

        # Adding prefixes to avoid column name collisions
        detail_data.columns = [f"detail_{col.replace('data.', '').replace('activity_id.', '')}" for col in detail_data.columns]

        # Merging entry data with detail data
        combined_data = pd.concat([entry_data, detail_data], axis=1)

        # Adding combined data to all_data
        all_data = pd.concat([all_data, combined_data], ignore_index=True)

    print(f"Data collected: {len(all_data)}")

print(f"Total {len(all_data)} data collected")

Attempt 1 failed: 400 Client Error: Bad Request for url: https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/position/1bd3ee90-0487-11ef-9e17-0a54edb93563
Attempt 2 failed: 400 Client Error: Bad Request for url: https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/position/1bd3ee90-0487-11ef-9e17-0a54edb93563
Attempt 3 failed: 400 Client Error: Bad Request for url: https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/position/1bd3ee90-0487-11ef-9e17-0a54edb93563
URL failed for ID 1bd3ee90-0487-11ef-9e17-0a54edb93563 with error: Failed to fetch data from https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/position/1bd3ee90-0487-11ef-9e17-0a54edb93563 after multiple retries
Attempt 1 failed: 400 Client Error: Bad Request for url: https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/position/dd50bfbe-0439-11ef-866a-ca0bbc909a4f
Attempt 2 failed: 400 Client Error: Bad Request for url: https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/position/dd50bfbe-0439-11ef-866a-

In [32]:
all_data

Unnamed: 0,id,opportunity_type,name,activity_type,location,months_duration,start_registration,end_registration,mitra_id,mitra_name,...,detail_credits_count,detail_activity_type,detail_location,detail_location_kotakab_code,detail_mitra_id,detail_certified,detail_skills,detail_available_to_apply,detail_is_quota_full,detail_activity_active_id
0,190c81c5-048a-11ef-9e17-0a54edb93563,MSIB,Layanan Publik,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,046000,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': '57ddce5c-048b-11ef-991a-ae222c5b6357'...,True,False,0
1,1b162247-048d-11ef-bb03-ceeddaa1b367,MSIB,Pengelola Arsip,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,046000,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': '6c0e2f08-048e-11ef-bb03-ceeddaa1b367'...,True,False,0
2,f2b8c752-04a0-11ef-8394-36f764739585,MSIB,Library Quality Analysis Staff and Integrated ...,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,046000,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': 'ee5cb0e2-04a5-11ef-8394-36f764739585'...,True,False,0
3,90665e71-0490-11ef-bb7f-6ac06bc5928b,MSIB,Staf Manajemen,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,046000,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': '9075a8e1-0492-11ef-9b3e-022ae5f04a49'...,True,False,0
4,60636725-0494-11ef-9e17-0a54edb93563,MSIB,Full-stack Developer,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,046000,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': '393df447-0495-11ef-bb03-ceeddaa1b367'...,True,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5815,79eaa62e-0363-11ef-98a1-c2fa75b94ea6,MSIB,Sales Relationship,WFO,Kota Manado,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,9521e331-1014-4f9f-b783-4b0008083ec1,Perusahaan Umum Pembangunan Perumahan Nasional,...,20,OFFLINE,Kota Manado,176000,9521e331-1014-4f9f-b783-4b0008083ec1,True,[{'id': '70bc4742-0364-11ef-9b3e-022ae5f04a49'...,True,False,0
5816,3e83727f-005c-11ef-8f19-ae84a611b5f1,MSIB,Site Engineer,WFO,Kab. Purwakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,9521e331-1014-4f9f-b783-4b0008083ec1,Perusahaan Umum Pembangunan Perumahan Nasional,...,20,OFFLINE,Kab. Purwakarta,022000,9521e331-1014-4f9f-b783-4b0008083ec1,True,[{'id': '9f9fbfd0-005c-11ef-ba24-e26d212b611d'...,True,False,0
5817,310433fc-0073-11ef-9733-ca7bef2c0196,MSIB,Marketing Officer,WFO,Kota Palembang,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,9521e331-1014-4f9f-b783-4b0008083ec1,Perusahaan Umum Pembangunan Perumahan Nasional,...,20,OFFLINE,Kota Palembang,116000,9521e331-1014-4f9f-b783-4b0008083ec1,True,[{'id': 'c5860081-0074-11ef-ac63-7e456e52b7d0'...,True,False,0
5818,686efb0a-0079-11ef-839c-1e2e912d270c,MSIB,Marketing Analysis Officer,WFO,Kota Jakarta Timur,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,9521e331-1014-4f9f-b783-4b0008083ec1,Perusahaan Umum Pembangunan Perumahan Nasional,...,20,OFFLINE,Kota Jakarta Timur,016400,9521e331-1014-4f9f-b783-4b0008083ec1,True,[{'id': 'af3eafe4-007a-11ef-9733-ca7bef2c0196'...,True,False,0


In [34]:
all_data.to_csv('../data/magang_opportunities.csv', index=False)

## **Data PreProcessing**

### **Data Reviewing**

In [1]:
import pandas as pd

In [5]:
magang_opportunities = pd.read_csv('../data/magang_opportunities.csv')

magang_opportunities

Unnamed: 0,id,opportunity_type,name,activity_type,location,months_duration,start_registration,end_registration,mitra_id,mitra_name,...,detail_credits_count,detail_activity_type,detail_location,detail_location_kotakab_code,detail_mitra_id,detail_certified,detail_skills,detail_available_to_apply,detail_is_quota_full,detail_activity_active_id
0,190c81c5-048a-11ef-9e17-0a54edb93563,MSIB,Layanan Publik,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,46000.0,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': '57ddce5c-048b-11ef-991a-ae222c5b6357'...,True,False,0
1,1b162247-048d-11ef-bb03-ceeddaa1b367,MSIB,Pengelola Arsip,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,46000.0,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': '6c0e2f08-048e-11ef-bb03-ceeddaa1b367'...,True,False,0
2,f2b8c752-04a0-11ef-8394-36f764739585,MSIB,Library Quality Analysis Staff and Integrated ...,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,46000.0,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': 'ee5cb0e2-04a5-11ef-8394-36f764739585'...,True,False,0
3,90665e71-0490-11ef-bb7f-6ac06bc5928b,MSIB,Staf Manajemen,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,46000.0,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': '9075a8e1-0492-11ef-9b3e-022ae5f04a49'...,True,False,0
4,60636725-0494-11ef-9e17-0a54edb93563,MSIB,Full-stack Developer,WFO,Kota Yogyakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,4afed716-68b7-48d4-bddb-6fa73a3433bc,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,...,20,OFFLINE,Kota Yogyakarta,46000.0,4afed716-68b7-48d4-bddb-6fa73a3433bc,True,[{'id': '393df447-0495-11ef-bb03-ceeddaa1b367'...,True,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5815,79eaa62e-0363-11ef-98a1-c2fa75b94ea6,MSIB,Sales Relationship,WFO,Kota Manado,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,9521e331-1014-4f9f-b783-4b0008083ec1,Perusahaan Umum Pembangunan Perumahan Nasional,...,20,OFFLINE,Kota Manado,176000.0,9521e331-1014-4f9f-b783-4b0008083ec1,True,[{'id': '70bc4742-0364-11ef-9b3e-022ae5f04a49'...,True,False,0
5816,3e83727f-005c-11ef-8f19-ae84a611b5f1,MSIB,Site Engineer,WFO,Kab. Purwakarta,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,9521e331-1014-4f9f-b783-4b0008083ec1,Perusahaan Umum Pembangunan Perumahan Nasional,...,20,OFFLINE,Kab. Purwakarta,22000.0,9521e331-1014-4f9f-b783-4b0008083ec1,True,[{'id': '9f9fbfd0-005c-11ef-ba24-e26d212b611d'...,True,False,0
5817,310433fc-0073-11ef-9733-ca7bef2c0196,MSIB,Marketing Officer,WFO,Kota Palembang,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,9521e331-1014-4f9f-b783-4b0008083ec1,Perusahaan Umum Pembangunan Perumahan Nasional,...,20,OFFLINE,Kota Palembang,116000.0,9521e331-1014-4f9f-b783-4b0008083ec1,True,[{'id': 'c5860081-0074-11ef-ac63-7e456e52b7d0'...,True,False,0
5818,686efb0a-0079-11ef-839c-1e2e912d270c,MSIB,Marketing Analysis Officer,WFO,Kota Jakarta Timur,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,9521e331-1014-4f9f-b783-4b0008083ec1,Perusahaan Umum Pembangunan Perumahan Nasional,...,20,OFFLINE,Kota Jakarta Timur,16400.0,9521e331-1014-4f9f-b783-4b0008083ec1,True,[{'id': 'af3eafe4-007a-11ef-9733-ca7bef2c0196'...,True,False,0


In [112]:
data = pd.DataFrame({
    'id': magang_opportunities['id'].values,
    'mitra_name': magang_opportunities['mitra_name'].values,
    'name': magang_opportunities['name'].values,
    'additional_title': magang_opportunities['detail_additional_title'].values,
    'skills': magang_opportunities['detail_skills'].values
})

data

Unnamed: 0,id,mitra_name,name,additional_title,skills
0,190c81c5-048a-11ef-9e17-0a54edb93563,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,Layanan Publik,Public Relations Services Management-Subbagian...,[{'id': '57ddce5c-048b-11ef-991a-ae222c5b6357'...
1,1b162247-048d-11ef-bb03-ceeddaa1b367,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,Pengelola Arsip,Layanan Pengelolaan Kearsipan,[{'id': '6c0e2f08-048e-11ef-bb03-ceeddaa1b367'...
2,f2b8c752-04a0-11ef-8394-36f764739585,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,Library Quality Analysis Staff and Integrated ...,Learning Management Improvement Of Public Serv...,[{'id': 'ee5cb0e2-04a5-11ef-8394-36f764739585'...
3,90665e71-0490-11ef-bb7f-6ac06bc5928b,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,Staf Manajemen,Licensing And Management Administration- Balai...,[{'id': '9075a8e1-0492-11ef-9b3e-022ae5f04a49'...
4,60636725-0494-11ef-9e17-0a54edb93563,Dinas Lingkungan Hidup Dan Kehutanan Daerah Is...,Full-stack Developer,Information Technology Systems,[{'id': '393df447-0495-11ef-bb03-ceeddaa1b367'...
...,...,...,...,...,...
5815,79eaa62e-0363-11ef-98a1-c2fa75b94ea6,Perusahaan Umum Pembangunan Perumahan Nasional,Sales Relationship,Pemahaman & Pengembangan Strategi Bisnis Perum...,[{'id': '70bc4742-0364-11ef-9b3e-022ae5f04a49'...
5816,3e83727f-005c-11ef-8f19-ae84a611b5f1,Perusahaan Umum Pembangunan Perumahan Nasional,Site Engineer,Pengawasan Serta Monitoring Pembangunan Kawasa...,[{'id': '9f9fbfd0-005c-11ef-ba24-e26d212b611d'...
5817,310433fc-0073-11ef-9733-ca7bef2c0196,Perusahaan Umum Pembangunan Perumahan Nasional,Marketing Officer,Strategi Pemasaran Efektif : Pemetaan Segmenti...,[{'id': 'c5860081-0074-11ef-ac63-7e456e52b7d0'...
5818,686efb0a-0079-11ef-839c-1e2e912d270c,Perusahaan Umum Pembangunan Perumahan Nasional,Marketing Analysis Officer,Pengembangan Properti Komersial Berdasarkan An...,[{'id': 'af3eafe4-007a-11ef-9733-ca7bef2c0196'...


### **Combine Text**

In [113]:
import ast

In [114]:
# Convert the "skills" column from string to list for non-null values
data['skills'] = data['skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)

# Took the skill name from "skills" column
data['skills'] = data['skills'].apply(lambda x: [skill['name'] for skill in x] if isinstance(x, list) else x)

# Join the skills into a single string
data['skills'] = data['skills'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [115]:
# Combine "name", "additional_title", and "skills" into a new column
data['combined_text'] = data['name']+ ' ' + data['additional_title'] + ' ' + data['skills'].astype(str)

data['combined_text']

0       Layanan Publik Public Relations Services Manag...
1       Pengelola Arsip Layanan Pengelolaan Kearsipan ...
2       Library Quality Analysis Staff and Integrated ...
3       Staf Manajemen Licensing And Management Admini...
4       Full-stack Developer Information Technology Sy...
                              ...                        
5815    Sales Relationship Pemahaman & Pengembangan St...
5816    Site Engineer Pengawasan Serta Monitoring Pemb...
5817    Marketing Officer Strategi Pemasaran Efektif :...
5818    Marketing Analysis Officer Pengembangan Proper...
5819    Full Stack Web Developer INDI System Developme...
Name: combined_text, Length: 5820, dtype: object

### **Case Folding**

In [116]:
data['result_casefold_mitra_name'] = data['mitra_name'].apply(lambda x: x.casefold())
data['result_casefold_data'] = data['combined_text'].apply(lambda x: x.casefold())
data[['id', 'combined_text', 'result_casefold_mitra_name','result_casefold_data']]

Unnamed: 0,id,combined_text,result_casefold_mitra_name,result_casefold_data
0,190c81c5-048a-11ef-9e17-0a54edb93563,Layanan Publik Public Relations Services Manag...,dinas lingkungan hidup dan kehutanan daerah is...,layanan publik public relations services manag...
1,1b162247-048d-11ef-bb03-ceeddaa1b367,Pengelola Arsip Layanan Pengelolaan Kearsipan ...,dinas lingkungan hidup dan kehutanan daerah is...,pengelola arsip layanan pengelolaan kearsipan ...
2,f2b8c752-04a0-11ef-8394-36f764739585,Library Quality Analysis Staff and Integrated ...,dinas lingkungan hidup dan kehutanan daerah is...,library quality analysis staff and integrated ...
3,90665e71-0490-11ef-bb7f-6ac06bc5928b,Staf Manajemen Licensing And Management Admini...,dinas lingkungan hidup dan kehutanan daerah is...,staf manajemen licensing and management admini...
4,60636725-0494-11ef-9e17-0a54edb93563,Full-stack Developer Information Technology Sy...,dinas lingkungan hidup dan kehutanan daerah is...,full-stack developer information technology sy...
...,...,...,...,...
5815,79eaa62e-0363-11ef-98a1-c2fa75b94ea6,Sales Relationship Pemahaman & Pengembangan St...,perusahaan umum pembangunan perumahan nasional,sales relationship pemahaman & pengembangan st...
5816,3e83727f-005c-11ef-8f19-ae84a611b5f1,Site Engineer Pengawasan Serta Monitoring Pemb...,perusahaan umum pembangunan perumahan nasional,site engineer pengawasan serta monitoring pemb...
5817,310433fc-0073-11ef-9733-ca7bef2c0196,Marketing Officer Strategi Pemasaran Efektif :...,perusahaan umum pembangunan perumahan nasional,marketing officer strategi pemasaran efektif :...
5818,686efb0a-0079-11ef-839c-1e2e912d270c,Marketing Analysis Officer Pengembangan Proper...,perusahaan umum pembangunan perumahan nasional,marketing analysis officer pengembangan proper...


### **Remove Html Tags**

In [111]:
from lxml import etree

In [120]:
def remove_html_tags(text):
    parser = etree.HTMLParser()
    tree = etree.fromstring(text, parser)
    return etree.tostring(tree, encoding='unicode', method='text')

data['result_remove_html_mitra_name'] = data['result_casefold_mitra_name'].apply(remove_html_tags)
data['result_remove_html_tags'] = data['result_casefold_data'].apply(remove_html_tags)
data[['id', 'result_casefold_data', 'result_remove_html_mitra_name', 'result_remove_html_tags']]

Unnamed: 0,id,result_casefold_data,result_remove_html_mitra_name,result_remove_html_tags
0,190c81c5-048a-11ef-9e17-0a54edb93563,layanan publik public relations services manag...,dinas lingkungan hidup dan kehutanan daerah is...,layanan publik public relations services manag...
1,1b162247-048d-11ef-bb03-ceeddaa1b367,pengelola arsip layanan pengelolaan kearsipan ...,dinas lingkungan hidup dan kehutanan daerah is...,pengelola arsip layanan pengelolaan kearsipan ...
2,f2b8c752-04a0-11ef-8394-36f764739585,library quality analysis staff and integrated ...,dinas lingkungan hidup dan kehutanan daerah is...,library quality analysis staff and integrated ...
3,90665e71-0490-11ef-bb7f-6ac06bc5928b,staf manajemen licensing and management admini...,dinas lingkungan hidup dan kehutanan daerah is...,staf manajemen licensing and management admini...
4,60636725-0494-11ef-9e17-0a54edb93563,full-stack developer information technology sy...,dinas lingkungan hidup dan kehutanan daerah is...,full-stack developer information technology sy...
...,...,...,...,...
5815,79eaa62e-0363-11ef-98a1-c2fa75b94ea6,sales relationship pemahaman & pengembangan st...,perusahaan umum pembangunan perumahan nasional,sales relationship pemahaman & pengembangan st...
5816,3e83727f-005c-11ef-8f19-ae84a611b5f1,site engineer pengawasan serta monitoring pemb...,perusahaan umum pembangunan perumahan nasional,site engineer pengawasan serta monitoring pemb...
5817,310433fc-0073-11ef-9733-ca7bef2c0196,marketing officer strategi pemasaran efektif :...,perusahaan umum pembangunan perumahan nasional,marketing officer strategi pemasaran efektif :...
5818,686efb0a-0079-11ef-839c-1e2e912d270c,marketing analysis officer pengembangan proper...,perusahaan umum pembangunan perumahan nasional,marketing analysis officer pengembangan proper...


### **Remove Non-Alphanumeric Character**

In [121]:
import re

In [123]:
data['result_remove_non_alphanumeric_character_mitra_name'] = data['result_remove_html_mitra_name'].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))
data['result_remove_non_alphanumeric_character'] = data['result_remove_html_tags'].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))
data[['id', 'result_remove_html_tags', 'result_remove_non_alphanumeric_character_mitra_name', 'result_remove_non_alphanumeric_character']]

Unnamed: 0,id,result_remove_html_tags,result_remove_non_alphanumeric_character_mitra_name,result_remove_non_alphanumeric_character
0,190c81c5-048a-11ef-9e17-0a54edb93563,layanan publik public relations services manag...,dinas lingkungan hidup dan kehutanan daerah is...,layanan publik public relations services manag...
1,1b162247-048d-11ef-bb03-ceeddaa1b367,pengelola arsip layanan pengelolaan kearsipan ...,dinas lingkungan hidup dan kehutanan daerah is...,pengelola arsip layanan pengelolaan kearsipan ...
2,f2b8c752-04a0-11ef-8394-36f764739585,library quality analysis staff and integrated ...,dinas lingkungan hidup dan kehutanan daerah is...,library quality analysis staff and integrated ...
3,90665e71-0490-11ef-bb7f-6ac06bc5928b,staf manajemen licensing and management admini...,dinas lingkungan hidup dan kehutanan daerah is...,staf manajemen licensing and management admini...
4,60636725-0494-11ef-9e17-0a54edb93563,full-stack developer information technology sy...,dinas lingkungan hidup dan kehutanan daerah is...,fullstack developer information technology sys...
...,...,...,...,...
5815,79eaa62e-0363-11ef-98a1-c2fa75b94ea6,sales relationship pemahaman & pengembangan st...,perusahaan umum pembangunan perumahan nasional,sales relationship pemahaman pengembangan str...
5816,3e83727f-005c-11ef-8f19-ae84a611b5f1,site engineer pengawasan serta monitoring pemb...,perusahaan umum pembangunan perumahan nasional,site engineer pengawasan serta monitoring pemb...
5817,310433fc-0073-11ef-9733-ca7bef2c0196,marketing officer strategi pemasaran efektif :...,perusahaan umum pembangunan perumahan nasional,marketing officer strategi pemasaran efektif ...
5818,686efb0a-0079-11ef-839c-1e2e912d270c,marketing analysis officer pengembangan proper...,perusahaan umum pembangunan perumahan nasional,marketing analysis officer pengembangan proper...


### **Tokenization**

In [124]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aszay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [125]:
data['result_tokenization'] = data['result_remove_non_alphanumeric_character'].apply(lambda x: word_tokenize(x))
data[['id', 'result_remove_non_alphanumeric_character', 'result_tokenization']]

Unnamed: 0,id,result_remove_non_alphanumeric_character,result_tokenization
0,190c81c5-048a-11ef-9e17-0a54edb93563,layanan publik public relations services manag...,"[layanan, publik, public, relations, services,..."
1,1b162247-048d-11ef-bb03-ceeddaa1b367,pengelola arsip layanan pengelolaan kearsipan ...,"[pengelola, arsip, layanan, pengelolaan, kears..."
2,f2b8c752-04a0-11ef-8394-36f764739585,library quality analysis staff and integrated ...,"[library, quality, analysis, staff, and, integ..."
3,90665e71-0490-11ef-bb7f-6ac06bc5928b,staf manajemen licensing and management admini...,"[staf, manajemen, licensing, and, management, ..."
4,60636725-0494-11ef-9e17-0a54edb93563,fullstack developer information technology sys...,"[fullstack, developer, information, technology..."
...,...,...,...
5815,79eaa62e-0363-11ef-98a1-c2fa75b94ea6,sales relationship pemahaman pengembangan str...,"[sales, relationship, pemahaman, pengembangan,..."
5816,3e83727f-005c-11ef-8f19-ae84a611b5f1,site engineer pengawasan serta monitoring pemb...,"[site, engineer, pengawasan, serta, monitoring..."
5817,310433fc-0073-11ef-9733-ca7bef2c0196,marketing officer strategi pemasaran efektif ...,"[marketing, officer, strategi, pemasaran, efek..."
5818,686efb0a-0079-11ef-839c-1e2e912d270c,marketing analysis officer pengembangan proper...,"[marketing, analysis, officer, pengembangan, p..."


### **Remove Stopwords**

In [126]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aszay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [127]:
english_stopwords = stopwords.words('english')

stopword_factory = StopWordRemoverFactory()
list_stopwords = ArrayDictionary(stopword_factory.get_stop_words() + english_stopwords)


stopword = StopWordRemover(list_stopwords)

In [128]:
def remove_stopwords(text):
    filtered_words = [stopword.remove(word) for word in text]
    return filtered_words

# Apply remove_stopwords to the 'result_tokenization' column
data['result_remove_stopwords'] = data['result_tokenization'].apply(remove_stopwords)

data['result_remove_stopwords'] = data['result_remove_stopwords'].apply(lambda x: ' '.join(x))

data[['id', 'result_tokenization', 'result_remove_stopwords']]

Unnamed: 0,id,result_tokenization,result_remove_stopwords
0,190c81c5-048a-11ef-9e17-0a54edb93563,"[layanan, publik, public, relations, services,...",layanan publik public relations services manag...
1,1b162247-048d-11ef-bb03-ceeddaa1b367,"[pengelola, arsip, layanan, pengelolaan, kears...",pengelola arsip layanan pengelolaan kearsipan ...
2,f2b8c752-04a0-11ef-8394-36f764739585,"[library, quality, analysis, staff, and, integ...",library quality analysis staff integrated pub...
3,90665e71-0490-11ef-bb7f-6ac06bc5928b,"[staf, manajemen, licensing, and, management, ...",staf manajemen licensing management administr...
4,60636725-0494-11ef-9e17-0a54edb93563,"[fullstack, developer, information, technology...",fullstack developer information technology sys...
...,...,...,...
5815,79eaa62e-0363-11ef-98a1-c2fa75b94ea6,"[sales, relationship, pemahaman, pengembangan,...",sales relationship pemahaman pengembangan stra...
5816,3e83727f-005c-11ef-8f19-ae84a611b5f1,"[site, engineer, pengawasan, serta, monitoring...",site engineer pengawasan monitoring pembangun...
5817,310433fc-0073-11ef-9733-ca7bef2c0196,"[marketing, officer, strategi, pemasaran, efek...",marketing officer strategi pemasaran efektif p...
5818,686efb0a-0079-11ef-839c-1e2e912d270c,"[marketing, analysis, officer, pengembangan, p...",marketing analysis officer pengembangan proper...


### **Stemming**

In [143]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [130]:
stemmer = StemmerFactory().create_stemmer()

data['result_stemming'] = data['result_remove_stopwords'].apply(lambda x: stemmer.stem(x))
data[['id', 'result_remove_stopwords', 'result_stemming']]

Unnamed: 0,id,result_remove_stopwords,result_stemming
0,190c81c5-048a-11ef-9e17-0a54edb93563,layanan publik public relations services manag...,layan publik public relations services managem...
1,1b162247-048d-11ef-bb03-ceeddaa1b367,pengelola arsip layanan pengelolaan kearsipan ...,kelola arsip layan kelola arsip alih media ars...
2,f2b8c752-04a0-11ef-8394-36f764739585,library quality analysis staff integrated pub...,library quality analysis staff integrated publ...
3,90665e71-0490-11ef-bb7f-6ac06bc5928b,staf manajemen licensing management administr...,staf manajemen licensing management administra...
4,60636725-0494-11ef-9e17-0a54edb93563,fullstack developer information technology sys...,fullstack developer information technology sys...
...,...,...,...
5815,79eaa62e-0363-11ef-98a1-c2fa75b94ea6,sales relationship pemahaman pengembangan stra...,sales relationship paham kembang strategi bisn...
5816,3e83727f-005c-11ef-8f19-ae84a611b5f1,site engineer pengawasan monitoring pembangun...,site engineer awas monitoring bangun kawasan m...
5817,310433fc-0073-11ef-9733-ca7bef2c0196,marketing officer strategi pemasaran efektif p...,marketing officer strategi pasar efektif meta ...
5818,686efb0a-0079-11ef-839c-1e2e912d270c,marketing analysis officer pengembangan proper...,marketing analysis officer kembang properti ko...


### **Preprocessing Reviewing**

In [138]:
data['result_preprocessing'] = data['result_remove_non_alphanumeric_character_mitra_name'] + ' ' + data['result_stemming']

data['result_preprocessing'][0]

'dinas lingkungan hidup dan kehutanan daerah istimewa yogyakarta layan publik public relations services managementsubbagian umum kerjasama desain konten korespondensi tanggung jawab integritas olah data jurnalistik rencana publikasi informasi komunikasi'

### **Save Preprocessing**

In [139]:
data.to_csv('../data/cleaned_data.csv', index=False)

## **Data Weighting**

### **Vectorizer**

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [146]:
cleaned_data = pd.read_csv('../data/cleaned_data.csv')

In [147]:
tfidf_vectorizer = TfidfVectorizer()

X = tfidf_vectorizer.fit_transform(cleaned_data['result_preprocessing'])

X

<5820x6787 sparse matrix of type '<class 'numpy.float64'>'
	with 150543 stored elements in Compressed Sparse Row format>

## **Data Modelling**

In [148]:
from sklearn.metrics.pairwise import cosine_similarity

### **Content Based**

In [163]:
def content_based_recommendation(content_id, n=10):
    # Find the index corresponding to the content_id
    content_index = magang_opportunities.index[magang_opportunities['id'] == content_id].tolist()[0]

    similarity_score = cosine_similarity(X)
    sorted_similar_content = similarity_score[content_index].argsort()[::-1]

    top_n_content = sorted_similar_content[1:n+1]

    recommendation_result = pd.DataFrame(columns=['id', 'name', 'mitra', 'score'])

    print(f"Content magang: {magang_opportunities.loc[magang_opportunities['id'] == content_id, 'name'].iloc[0]} by {magang_opportunities.loc[magang_opportunities['id'] == content_id, 'mitra_name'].iloc[0]}")

    print(f"Top {n} Recommendation result: ")

    for i in top_n_content:
        score = similarity_score[content_index][i]
        if score != 0:  # Check if similarity score is not equal to 0
            recommendation_result = pd.concat([
                recommendation_result,
                pd.DataFrame({
                    'id': [magang_opportunities.iloc[i]['id']],
                    'name': [magang_opportunities.iloc[i]['name']],
                    'mitra': [magang_opportunities.iloc[i]['mitra_name']],
                    'score': [score]
                })
            ], ignore_index=True)

    return recommendation_result

In [173]:
content_based_recommendation('0a6ac79e-fbba-11ee-b3c3-42bfe4d30846', 5)

Content magang: Backend Programmer by PT Nusa Tekno Global
Top 5 Recommendation result: 


  recommendation_result = pd.concat([


Unnamed: 0,id,name,mitra,score
0,300da72f-fcc8-11ee-be48-1676ae1015eb,Frontend Programmer,PT Nusa Tekno Global,0.896082
1,17fa594c-fb88-11ee-82ee-42fdda477ac9,Backend Programmer,Lembaga Layanan Pendidikan Tinggi Wilayah IX,0.252772
2,d20fa959-fb00-11ee-82ee-42fdda477ac9,Back-End Developer,PT Bisa Artifisial Indonesia,0.240073
3,c014d2be-01fa-11ef-98a1-c2fa75b94ea6,Database Programmer,LEMBAGA LAYANAN PENDIDIKAN TINGGI WILAYAH X,0.24004
4,353c53bf-f624-11ee-af0a-42786e9b30fb,Back-End Developer,PT Bisa Artifisial Indonesia,0.237346


### **Query Based**

In [165]:
def query_based_recommendation(query, n=10):
    query = query.casefold()  # Make sure the query is in lowercase
    query_vector = tfidf_vectorizer.transform([query])  # Pass the query as a list
    
    similarity_score = cosine_similarity(query_vector, X)
    sorted_similar_content = similarity_score.argsort()[0][::-1]  # Reverse sort indices
    top_n_content = sorted_similar_content[1:n+1]

    recommendation_result = pd.DataFrame(columns=['id', 'name', 'mitra', 'score'])
    print(f"Query: {query}")
    print(f"Top {n} Recommendation result: ")
    for i in top_n_content:
        score = similarity_score[0][i]
        if score != 0:  # Check if similarity score is not equal to 0
            recommendation_result = pd.concat([
                recommendation_result,
                pd.DataFrame({
                    'id': [magang_opportunities.iloc[i]['id']],
                    'name': [magang_opportunities.iloc[i]['name']],
                    'mitra': [magang_opportunities.iloc[i]['mitra_name']],
                    'score': [score]
                })
            ], ignore_index=True)

    return recommendation_result

In [176]:
query_based_recommendation('backend web developer javascript html css pt bisa artifisial indonesia', 5)

Query: backend web developer javascript html css pt bisa artifisial indonesia
Top 5 Recommendation result: 


  recommendation_result = pd.concat([


Unnamed: 0,id,name,mitra,score
0,353c53bf-f624-11ee-af0a-42786e9b30fb,Back-End Developer,PT Bisa Artifisial Indonesia,0.398445
1,9e7ae1d8-f626-11ee-bf75-32cfa9fc582e,Back-End Developer,PT Bisa Artifisial Indonesia,0.393557
2,41afb8c6-f628-11ee-bf75-32cfa9fc582e,Back-End Developer,PT Bisa Artifisial Indonesia,0.393557
3,a85f0174-f693-11ee-af0a-42786e9b30fb,Back-End Developer,PT Bisa Artifisial Indonesia,0.393054
4,4b416c28-fc77-11ee-b3c3-42bfe4d30846,Full Stack Developer,PT Linkdataku Solusi Indonesia,0.361133
