# Modeling

In [1]:
import pandas as pd
import numpy as np
import gdown

## Dataset

In [4]:
# Download the dataset CSV from Google Drive into the working directory.
# `nama_data` is the local file name the download is saved under.
nama_data = 'datasummary.csv'
# Keep the Drive file ID separate from the URL so it is easy to swap datasets.
drive_file_id = '10LSgIUgCpAwfxNNxKvtd9NuR2Djq95RQ'
gdown.download(f'https://drive.google.com/uc?id={drive_file_id}', nama_data, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=10LSgIUgCpAwfxNNxKvtd9NuR2Djq95RQ
To: /content/datasummary.csv
100%|██████████| 1.68M/1.68M [00:00<00:00, 127MB/s]


'datasummary.csv'

In [5]:
# Load the downloaded CSV; reuse `nama_data` (set in the download cell) so the
# file name is defined in exactly one place.
data = pd.read_csv(nama_data)
data

Unnamed: 0,Summary,Label
0,"sebelumnya dalam rapat tersebut, jimly menjela...",politik
1,"kalau melihat rapat pertamanya, ada harapan in...",politik
2,sedangkan pasangan prabowogibran diusung oleh ...,politik
3,sedangkan pasangan prabowogibran diusung dan d...,politik
4,saya berharap knpi menjadi magnet perjuangan k...,politik
...,...,...
632,"sementara itu, kontingen indonesia hingga kini...",olahraga
633,baca juga tim indonesia bawa keragaman budaya ...,olahraga
634,medali perak sekaligus medali pertama untuk in...,olahraga
635,"sementara itu, asian para games 2022 hangzhou ...",olahraga


## VSM

### Binary

> Binary merupakan metode representasi data yang bertujuan agar teks dapat digunakan oleh algoritma machine learning. Setiap kata unik dikonversi menjadi vektor biner, yaitu bernilai 1 jika kata tersebut muncul dalam dokumen dan 0 jika tidak.



In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

### TF-IDF



> TF-IDF (Term Frequency–Inverse Document Frequency) adalah metode yang digunakan dalam pemrosesan bahasa alami (Natural Language Processing - NLP) dan pengambilan informasi (Information Retrieval) untuk mengukur seberapa penting suatu kata dalam suatu dokumen relatif terhadap kumpulan dokumen. Tujuannya adalah untuk memberikan bobot yang lebih tinggi kepada kata-kata yang penting dalam representasi teks.



In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Fit a TF-IDF vectorizer on the 'Summary' column of `data`.
tfidf_vectorizer = TfidfVectorizer()
summary_texts = data['Summary']
tfidf_matrix = tfidf_vectorizer.fit_transform(summary_texts)

# Convert the fitted matrix to a dense DataFrame whose columns are the
# vectorizer's feature names.
vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=vocabulary_terms)

# Display the TF-IDF DataFrame as the cell output.
tfidf_df

Unnamed: 0,00,000,000an,000kg,001,002,003,005,008,009,...,zulfikareditor,zulfira,zulkifli,zulkilfi,zumba,zumrotun,zuni,zuxian,zuxianyang,zverev
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
