# **Data Cleaning**

In [1]:
import pandas as pd

In [46]:
dataset = pd.read_csv('dataset.csv')
dataset

Unnamed: 0,id,name,description,skills
0,1,Back-end Developer Internship,Job Descriptions Merancang dan membuat API den...,
1,2,Website Developer,<p><strong>Kualifikasi :</strong>&nbsp;</p><ul...,"['Komunikasi', 'Kreatif', 'Pemahaman Industri'..."
2,3,Software Architect Intern,<p>To help architect team on demonstration of ...,"['Komunikasi', 'Kreatif', 'Desain Grafis', 'Co..."


## **Combine Data**

In [47]:
import ast

In [48]:
dataset['skills'] = dataset['skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else '')

dataset['skills'] = dataset['skills'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

dataset['combined_text'] = dataset['name'] + ' ' + dataset['skills'].astype(str) + ' ' + dataset['description']

dataset['combined_text']

0    Back-end Developer Internship  Job Description...
1    Website Developer Komunikasi, Kreatif, Pemaham...
2    Software Architect Intern Komunikasi, Kreatif,...
Name: combined_text, dtype: object

In [49]:
print(dataset['combined_text'][1])

Website Developer Komunikasi, Kreatif, Pemahaman Industri, Manajemen Proyek, Kreativitas, Kerjasama Tim, Keahlian Teknologi, Pemasaran Produk <p><strong>Kualifikasi :</strong>&nbsp;</p><ul><li>Wajib memiliki laptop sendiri.</li><li>Mampu membuat website dan landing page sesuai dengan kebutuhan perusahaan </li><li>Mampu  Optimasi SEO</li><li>Masa magang minimum 3 bulan dengan jam kerja normal</li><li>Harus menyertakan portofolio</li></ul><p><br></p><p><br></p><p><strong>Benefit :</strong></p><ul><li>Sertifikat Magang&nbsp;</li><li>Konversi Nilai</li><li>Sistem WFH</li></ul><p><strong>Kualifikasi :</strong>&nbsp;</p><ul><li>Wajib memiliki laptop sendiri.</li><li>Mampu membuat website dan landing page sesuai dengan kebutuhan perusahaan </li><li>Mampu  Optimasi SEO</li><li>Masa magang minimum 3 bulan dengan jam kerja normal</li><li>Harus menyertakan portofolio</li></ul><p><br></p><p><br></p><p><strong>Benefit :</strong></p><ul><li>Sertifikat Magang&nbsp;</li><li>Konversi Nilai</li><li>Sist

## **Case Folding**

In [50]:
dataset['result_case_folding_data'] = dataset['combined_text'].apply(lambda x: x.casefold())
dataset[['id', 'combined_text','result_case_folding_data']]

Unnamed: 0,id,combined_text,result_case_folding_data
0,1,Back-end Developer Internship Job Description...,back-end developer internship job description...
1,2,"Website Developer Komunikasi, Kreatif, Pemaham...","website developer komunikasi, kreatif, pemaham..."
2,3,"Software Architect Intern Komunikasi, Kreatif,...","software architect intern komunikasi, kreatif,..."


In [51]:
print(dataset['result_case_folding_data'][1])

website developer komunikasi, kreatif, pemahaman industri, manajemen proyek, kreativitas, kerjasama tim, keahlian teknologi, pemasaran produk <p><strong>kualifikasi :</strong>&nbsp;</p><ul><li>wajib memiliki laptop sendiri.</li><li>mampu membuat website dan landing page sesuai dengan kebutuhan perusahaan </li><li>mampu  optimasi seo</li><li>masa magang minimum 3 bulan dengan jam kerja normal</li><li>harus menyertakan portofolio</li></ul><p><br></p><p><br></p><p><strong>benefit :</strong></p><ul><li>sertifikat magang&nbsp;</li><li>konversi nilai</li><li>sistem wfh</li></ul><p><strong>kualifikasi :</strong>&nbsp;</p><ul><li>wajib memiliki laptop sendiri.</li><li>mampu membuat website dan landing page sesuai dengan kebutuhan perusahaan </li><li>mampu  optimasi seo</li><li>masa magang minimum 3 bulan dengan jam kerja normal</li><li>harus menyertakan portofolio</li></ul><p><br></p><p><br></p><p><strong>benefit :</strong></p><ul><li>sertifikat magang&nbsp;</li><li>konversi nilai</li><li>sist

## **Remove Html Tags**

In [52]:
from lxml import etree

In [55]:
def remove_html_tags(text):
    parser = etree.HTMLParser()
    tree = etree.fromstring(text, parser)
    return etree.tostring(tree, encoding='unicode', method='text')

dataset['result_remove_html_tags'] = dataset['result_case_folding_data'].apply(remove_html_tags)
dataset[['id', 'result_case_folding_data' ,'result_remove_html_tags']]

Unnamed: 0,id,result_case_folding_data,result_remove_html_tags
0,1,back-end developer internship job description...,back-end developer internship job description...
1,2,"website developer komunikasi, kreatif, pemaham...","website developer komunikasi, kreatif, pemaham..."
2,3,"software architect intern komunikasi, kreatif,...","software architect intern komunikasi, kreatif,..."


In [56]:
print(dataset['result_remove_html_tags'][1])

website developer komunikasi, kreatif, pemahaman industri, manajemen proyek, kreativitas, kerjasama tim, keahlian teknologi, pemasaran produk kualifikasi : wajib memiliki laptop sendiri.mampu membuat website dan landing page sesuai dengan kebutuhan perusahaan mampu  optimasi seomasa magang minimum 3 bulan dengan jam kerja normalharus menyertakan portofoliobenefit :sertifikat magang konversi nilaisistem wfhkualifikasi : wajib memiliki laptop sendiri.mampu membuat website dan landing page sesuai dengan kebutuhan perusahaan mampu  optimasi seomasa magang minimum 3 bulan dengan jam kerja normalharus menyertakan portofoliobenefit :sertifikat magang konversi nilaisistem wfh


## **Remove Non-Alphanumeric Character**

In [63]:
import re

In [64]:
dataset['result_remove_non-alphanumeric_character'] = dataset['result_remove_html_tags'].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))
dataset[['id', 'result_remove_html_tags', 'result_remove_non-alphanumeric_character']]

Unnamed: 0,id,result_remove_html_tags,result_remove_non-alphanumeric_character
0,1,back-end developer internship job description...,backend developer internship job descriptions...
1,2,"website developer komunikasi, kreatif, pemaham...",website developer komunikasi kreatif pemahaman...
2,3,"software architect intern komunikasi, kreatif,...",software architect intern komunikasi kreatif d...


In [65]:
print(dataset['result_remove_non_alphanumeric_character'][1])

website developer komunikasi kreatif pemahaman industri manajemen proyek kreativitas kerjasama tim keahlian teknologi pemasaran produk kualifikasi wajib memiliki laptop sendirimampu membuat website dan landing page sesuai dengan kebutuhan perusahaan mampu  optimasi seomasa magang minimum 3 bulan dengan jam kerja normalharus menyertakan portofoliobenefit sertifikat magangkonversi nilaisistem wfhkualifikasi wajib memiliki laptop sendirimampu membuat website dan landing page sesuai dengan kebutuhan perusahaan mampu  optimasi seomasa magang minimum 3 bulan dengan jam kerja normalharus menyertakan portofoliobenefit sertifikat magangkonversi nilaisistem wfh


## **Tokenization**

In [66]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aszay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [67]:
dataset['result_tokenization'] = dataset['result_remove_non_alphanumeric_character'].apply(lambda x: word_tokenize(x))
dataset[['id', 'result_remove_non_alphanumeric_character', 'result_tokenization']]

Unnamed: 0,id,result_remove_non_alphanumeric_character,result_tokenization
0,1,backend developer internship job descriptions...,"[backend, developer, internship, job, descript..."
1,2,website developer komunikasi kreatif pemahaman...,"[website, developer, komunikasi, kreatif, pema..."
2,3,software architect intern komunikasi kreatif d...,"[software, architect, intern, komunikasi, krea..."


In [70]:
print(dataset['result_tokenization'][2])

['software', 'architect', 'intern', 'komunikasi', 'kreatif', 'desain', 'grafis', 'coding', 'to', 'help', 'architect', 'team', 'on', 'demonstration', 'of', 'an', 'improvement', 'idea', 'in', 'which', 'work', 'is', 'focused', 'on', 'determining', 'whether', 'an', 'idea', 'can', 'be', 'turned', 'into', 'a', 'realitythe', 'improvement', 'might', 'include', 'test', 'optimization', 'test', 'automation', 'framework', 'usage', 'eg', 'test', 'framework', 'or', 'design']


## **Remove Stopwords**

In [83]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [99]:
stopword = StopWordRemoverFactory().create_stop_word_remover()

In [100]:
def remove_stopwords(text):
    filtered_words = [stopword.remove(word) for word in text]
    return filtered_words

# Apply remove_stopwords to the 'result_tokenization' column
dataset['result_remove_stopwords'] = dataset['result_tokenization'].apply(remove_stopwords)

# Optionally, join the list of words back into a string if needed
dataset['result_remove_stopwords'] = dataset['result_remove_stopwords'].apply(lambda x: ' '.join(x))

dataset[['id', 'result_tokenization', 'result_remove_stopwords']]

Unnamed: 0,id,result_tokenization,result_remove_stopwords
0,1,"[backend, developer, internship, job, descript...",backend developer internship job descriptions ...
1,2,"[website, developer, komunikasi, kreatif, pema...",website developer komunikasi kreatif pemahaman...
2,3,"[software, architect, intern, komunikasi, krea...",software architect intern komunikasi kreatif d...


In [101]:
print(dataset['result_remove_stopwords'][0])

backend developer internship job descriptions merancang  membuat api  laravel memelihara  meningkatkan website berkolaborasi  frontend developers  meningkatkan kegunaan job requirements fresh graduate  mahasiswa tingkat akhir jurusan informatika sistem informasi  relevan bersedia magang selama 3 bulan  sistem wfa work from anywhere menguasai bahasa pemrogramman php khusunya memiliki pemahaman yg mendalam  laravel menguasai api application programming interface  mampu membuat restful api menggunakan laravel  komunikasi  berbagai komponen aplikasi memiliki skill komunikasi  baik berkomitmen  bertanggung jawab  mengikuti program magang hingga selesai


## **Stemming**

In [102]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [103]:
stemmer = StemmerFactory().create_stemmer()

dataset['result_stemming'] = dataset['result_remove_stopwords'].apply(lambda x: stemmer.stem(x))
dataset[['id', 'result_remove_stopwords', 'result_stemming']]

Unnamed: 0,id,result_remove_stopwords,result_stemming
0,1,backend developer internship job descriptions ...,backend developer internship job descriptions ...
1,2,website developer komunikasi kreatif pemahaman...,website developer komunikasi kreatif paham ind...
2,3,software architect intern komunikasi kreatif d...,software architect intern komunikasi kreatif d...


In [106]:
print(dataset['result_stemming'][0])

backend developer internship job descriptions rancang buat api laravel pelihara tingkat website kolaborasi frontend developers tingkat guna job requirements fresh graduate mahasiswa tingkat akhir jurus informatika sistem informasi relevan sedia magang lama 3 bulan sistem wfa work from anywhere kuasa bahasa pemrogramman php khusunya milik paham yg dalam laravel kuasa api application programming interface mampu buat restful api guna laravel komunikasi bagai komponen aplikasi milik skill komunikasi baik komitmen tanggung jawab ikut program magang hingga selesai


## **Vectorizer**

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [116]:
vectors = TfidfVectorizer().fit_transform(dataset['result_stemming'])
vectors.toarray()

vectors

<3x130 sparse matrix of type '<class 'numpy.float64'>'
	with 142 stored elements in Compressed Sparse Row format>

# **Modeling**

In [117]:
from sklearn.metrics.pairwise import cosine_similarity

In [121]:
similarity = cosine_similarity(vectors)
similarity[0][1:]

array([0.12441811, 0.01610906])

In [169]:
def recommend_by_content_based_filtering(item):
  item_index = dataset[dataset['id']==item].index[0]
  distances = similarity[item_index]
  item_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[0:]

  recommended_items = []

  for i in item_list:
    recommended_items.append([dataset.iloc[i[0]].id]+[i[1]])

  return recommended_items

In [170]:
recommend_by_content_based_filtering(1)

[[1, 0.9999999999999999], [2, 0.12441811254356129], [3, 0.016109062747582358]]