# Modeling (CountVectorizer)

In [1]:
import pandas

In [2]:
# Training corpus: four sentences, one string per document.
text_list = [
    'ajar efektif kelas pintar semangat gratis',  # sentence 1
    'pagi tetap semangat ajar aktivitas rabu pintar ayo simak jadwal acara',  # sentence 2
    'susah sulit kerja tugas bingung tanya tanya kelas pintar akibat covid',  # sentence 3
    'pagi susah kerja lama lama ajar jarak jauh penuh drama',  # sentence 4
]
# Sentiment label of each training sentence, in the same order as text_list.
kelas_teks = ['positif', 'positif', 'negatif', 'negatif']

### Pembuatan list kata

In [3]:
# Flatten the corpus into a single list of whitespace-separated tokens,
# keeping duplicates and the original document order.
text_word = []
for sentence in text_list:
    text_word.extend(sentence.split())

print(text_word)

['ajar', 'efektif', 'kelas', 'pintar', 'semangat', 'gratis', 'pagi', 'tetap', 'semangat', 'ajar', 'aktivitas', 'rabu', 'pintar', 'ayo', 'simak', 'jadwal', 'acara', 'susah', 'sulit', 'kerja', 'tugas', 'bingung', 'tanya', 'tanya', 'kelas', 'pintar', 'akibat', 'covid', 'pagi', 'susah', 'kerja', 'lama', 'lama', 'ajar', 'jarak', 'jauh', 'penuh', 'drama']


### Pembuatan fitur kata

In [4]:
# Find the unique words (features)
def get_uniqueWords(text_word):
    """Return the distinct words of ``text_word`` in first-occurrence order.

    Parameters
    ----------
    text_word : iterable of str
        Tokens, possibly containing duplicates. The input is not modified.

    Returns
    -------
    list of str
        Each word exactly once, ordered by where it first appears.
    """
    # dict keys are unique and keep insertion order, so this is an
    # order-preserving de-duplication in linear time (the previous
    # "word in list" membership test made the loop quadratic).
    return list(dict.fromkeys(text_word))

# Vocabulary of the training corpus; its order defines the feature columns
# of the count vectors built from it.
unique_words = get_uniqueWords(text_word)
print(unique_words)

['ajar', 'efektif', 'kelas', 'pintar', 'semangat', 'gratis', 'pagi', 'tetap', 'aktivitas', 'rabu', 'ayo', 'simak', 'jadwal', 'acara', 'susah', 'sulit', 'kerja', 'tugas', 'bingung', 'tanya', 'akibat', 'covid', 'lama', 'jarak', 'jauh', 'penuh', 'drama']


### Pembuatan wadah vektor kosong

In [5]:
# Build an all-zero matrix: one row per document, one column per feature
def create_zeroVector(n_docs=None, n_features=None):
    """Return an ``n_docs`` x ``n_features`` list of lists filled with 0.

    Parameters
    ----------
    n_docs : int, optional
        Number of rows. Defaults to ``len(text_list)`` (module-level corpus).
    n_features : int, optional
        Number of columns. Defaults to ``len(unique_words)`` (module-level
        vocabulary).

    Returns
    -------
    list of list of int
        Every row is a separate list object, so rows can be updated
        independently.
    """
    if n_docs is None:
        n_docs = len(text_list)
    if n_features is None:
        n_features = len(unique_words)
    # The comprehension evaluates ``[0] * n_features`` once per row, so no
    # two rows share the same list.
    return [[0] * n_features for _ in range(n_docs)]

# Zero-filled container for the training count vectors.
vector_list = create_zeroVector()
print(vector_list)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


### Pembuatan vektor

In [6]:
# Build the list of count vectors
def create_vectorList(text_list, vocabulary=None):
    """Count how often each vocabulary word occurs in every document.

    A fresh result is built on every call, so re-running the cell does not
    accumulate counts, and any number of documents is accepted.

    Parameters
    ----------
    text_list : iterable of str
        Documents; each one is split on whitespace.
    vocabulary : sequence of str, optional
        Feature words, one per output column. Defaults to the module-level
        ``unique_words``. Entries are expected to be unique; a duplicated
        word is counted only in its last column.

    Returns
    -------
    list of list of int
        One count vector per document, aligned with ``vocabulary``. Words
        that are not in the vocabulary are ignored.
    """
    if vocabulary is None:
        vocabulary = unique_words
    # Word -> column lookup replaces the scan over the whole vocabulary
    # for every token.
    column_of = {word: column for column, word in enumerate(vocabulary)}
    vectors = []
    for text in text_list:
        counts = [0] * len(vocabulary)
        for word in text.split():
            column = column_of.get(word)
            if column is not None:
                counts[column] += 1
        vectors.append(counts)
    return vectors

# Count vectors of the training sentences (one row per sentence).
vector_list = create_vectorList(text_list)
print(vector_list)

[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1]]


In [7]:
# Tabulate the count vectors: one row per training sentence, one column per
# vocabulary word, followed by the sentiment label of each sentence.
data_frame = pandas.DataFrame(
    data=vector_list,
    index=['kalimat ke-1', 'kalimat ke-2', 'kalimat ke-3', 'kalimat ke-4'],
    columns=unique_words,
).assign(LABEL=kelas_teks)

data_frame

Unnamed: 0,ajar,efektif,kelas,pintar,semangat,gratis,pagi,tetap,aktivitas,rabu,...,bingung,tanya,akibat,covid,lama,jarak,jauh,penuh,drama,LABEL
kalimat ke-1,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positif
kalimat ke-2,1,0,0,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,positif
kalimat ke-3,0,0,1,1,0,0,0,0,0,0,...,1,2,1,1,0,0,0,0,0,negatif
kalimat ke-4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,2,1,1,1,1,negatif


#### Menyimpan vector_list, kelas_teks, unique_words ke dalam model (.json)

## Modeling Selesai

-----------------------------------------------------------------------------------------------------------------------------

# Klasifikasi (K-Nearest Neighbors (KNN))

In [8]:
# vector_list (the training count vectors) is used as the training data.
# Note: this binds a second name to the same list object; no copy is made.
data_latih = vector_list
print(data_latih)

[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1]]


In [9]:
# Test sentences to classify, one string per document.
text_list_uji = [
    'semangat ikut kelas pintar ajar jarak jauh tengah pandemi',
    'susah sulit ajar jarak jauh pandemi covid covid tetap semangat',
]
# Expected sentiment label of each test sentence, in the same order.
kelas_teks_uji = ['positif', 'negatif']

### Pembuatan wadah vektor uji kosong

In [10]:
# Build an all-zero matrix: one row per test document, one column per feature
def create_zeroVectorUji(n_docs=None, n_features=None):
    """Return an ``n_docs`` x ``n_features`` list of lists filled with 0.

    Parameters
    ----------
    n_docs : int, optional
        Number of rows. Defaults to ``len(text_list_uji)`` (module-level
        test corpus).
    n_features : int, optional
        Number of columns. Defaults to ``len(unique_words)`` (module-level
        vocabulary).

    Returns
    -------
    list of list of int
        Every row is a separate list object, so rows can be updated
        independently.
    """
    if n_docs is None:
        n_docs = len(text_list_uji)
    if n_features is None:
        n_features = len(unique_words)
    # The comprehension evaluates ``[0] * n_features`` once per row, so no
    # two rows share the same list.
    return [[0] * n_features for _ in range(n_docs)]

# Zero-filled container for the test count vectors.
vector_list_uji = create_zeroVectorUji()
print(vector_list_uji)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


### Pembuatan vektor uji

In [11]:
# Build the list of count vectors for the test documents
def create_vectorListUji(text_list, vocabulary=None):
    """Count how often each vocabulary word occurs in every test document.

    A fresh result is built on every call, so re-running the cell does not
    accumulate counts, and any number of documents is accepted.

    Parameters
    ----------
    text_list : iterable of str
        Documents; each one is split on whitespace.
    vocabulary : sequence of str, optional
        Feature words, one per output column. Defaults to the module-level
        ``unique_words``. Entries are expected to be unique; a duplicated
        word is counted only in its last column.

    Returns
    -------
    list of list of int
        One count vector per document, aligned with ``vocabulary``. Words
        that are not in the vocabulary are ignored.
    """
    if vocabulary is None:
        vocabulary = unique_words
    # Word -> column lookup replaces the scan over the whole vocabulary
    # for every token.
    column_of = {word: column for column, word in enumerate(vocabulary)}
    vectors = []
    for text in text_list:
        counts = [0] * len(vocabulary)
        for word in text.split():
            column = column_of.get(word)
            if column is not None:
                counts[column] += 1
        vectors.append(counts)
    return vectors

# Count vectors of the test sentences (one row per sentence).
vector_list_uji = create_vectorListUji(text_list_uji)
print(vector_list_uji)

[[1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0]]


In [12]:
# Select the vector to classify: index 1, i.e. the second test sentence.
data_tes = vector_list_uji[1]

print(data_tes)

[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0]


### Perhitungan jarak dengan Euclidean distance: $d(x, y) = \sqrt{\sum_i (x_i - y_i)^2}$

In [13]:
# Show both inputs of the distance computation: the training vectors and the test vector.
print(data_latih, data_tes)

[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1]] [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0]


In [14]:
from math import sqrt

In [15]:
# Euclidean distance from the test vector to every training vector,
# keyed by the training vector's position in data_latih.
neighbors = {}

for row_index, train_vector in enumerate(data_latih):
    squared_diffs = [
        (test_value - train_value) ** 2
        for test_value, train_value in zip(data_tes, train_vector)
    ]
    neighbors[row_index] = sqrt(sum(squared_diffs))

neighbors

{0: 3.605551275463989, 1: 4.0, 2: 4.0, 3: 3.872983346207417}

In [16]:
import itertools

### Pengambilan K data tetangga terdekat

In [17]:
# Default number of neighbours taken into account by the vote.
K = 3

# Find the K nearest neighbours
def get_nearestNeighbors(neighbors, k=None):
    """Return the ``k`` entries of ``neighbors`` with the smallest distance.

    Parameters
    ----------
    neighbors : dict
        Maps a training-row index to its distance from the test vector.
    k : int, optional
        Number of neighbours to keep. Defaults to the module-level ``K``.

    Returns
    -------
    dict
        At most ``k`` items, ordered from nearest to farthest. Equal
        distances keep the order they have in ``neighbors`` because the
        sort is stable.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k is None:
        k = K
    if k < 0:
        # A negative slice bound would silently drop items from the end.
        raise ValueError("k must be a non-negative integer")
    ranked = sorted(neighbors.items(), key=lambda item: item[1])
    return dict(ranked[:k])

# Keep only the closest training rows (index -> distance).
nearest_neighbors = get_nearestNeighbors(neighbors)
nearest_neighbors

{0: 3.605551275463989, 3: 3.872983346207417, 1: 4.0}

### Pencarian label dari K data tetangga terdekat

In [18]:
# Training-row indices of the selected neighbours (the keys of nearest_neighbors).
index_nearestNeighbors = [*nearest_neighbors]

# Look up the sentiment label of each of those training rows.
sentiment_nearestNeighbors = [kelas_teks[position] for position in index_nearestNeighbors]

sentiment_nearestNeighbors

['positif', 'negatif', 'positif']

### Probabilitas tiap sentimen

In [19]:
# Share of each sentiment among the selected neighbours. Every label other
# than 'positif' is counted as negative.
n_neighbors = len(sentiment_nearestNeighbors)
count_positif = sentiment_nearestNeighbors.count('positif')
count_negatif = n_neighbors - count_positif

probabilitas_positif = count_positif / n_neighbors
probabilitas_negatif = count_negatif / n_neighbors

probabilitas_positif, probabilitas_negatif

(0.6666666666666666, 0.3333333333333333)

### Pemberian label data uji

In [20]:
# Majority vote; a tie goes to 'negatif' because the comparison is strict.
sentiment = 'positif' if probabilitas_positif > probabilitas_negatif else 'negatif'

sentiment

'positif'

In [21]:
# Re-create the training sentences and their labels (same values as the
# definitions at the top of the notebook).
text_list = [
    'ajar efektif kelas pintar semangat gratis',  # sentence 1
    'pagi tetap semangat ajar aktivitas rabu pintar ayo simak jadwal acara',  # sentence 2
    'susah sulit kerja tugas bingung tanya tanya kelas pintar akibat covid',  # sentence 3
    'pagi susah kerja lama lama ajar jarak jauh penuh drama',  # sentence 4
]
kelas_teks = ['positif', 'positif', 'negatif', 'negatif']

In [22]:
# Summary table: the training sentences plus one test sentence carrying the
# label predicted by the KNN vote.
#
# New lists are built with "+" instead of appending through an alias, so
# text_list and kelas_teks stay untouched and the cell can be re-run safely.
# NOTE(review): the classified vector is data_tes = vector_list_uji[1] (the
# second test sentence), but the sentence shown here is text_list_uji[0].
# Confirm which test sentence is intended and align the two indices.
text_data = text_list + [text_list_uji[0]]
# The predicted label is stored as a plain string. The previous chained
# assignment data_frame['LABEL'][4] = [sentiment] put a one-element list in
# the cell and relied on positional access through a label index.
kelas_data = kelas_teks + [sentiment]

row_labels = [f'kalimat ke-{number}' for number in range(1, len(text_data) + 1)]
data_frame = pandas.DataFrame(
    {'dokumen': text_data, 'LABEL': kelas_data},
    index=row_labels,
)

data_frame

Unnamed: 0,dokumen,LABEL
kalimat ke-1,ajar efektif kelas pintar semangat gratis,positif
kalimat ke-2,pagi tetap semangat ajar aktivitas rabu pintar...,positif
kalimat ke-3,susah sulit kerja tugas bingung tanya tanya ke...,negatif
kalimat ke-4,pagi susah kerja lama lama ajar jarak jauh pen...,negatif
kalimat ke-5,semangat ikut kelas pintar ajar jarak jauh ten...,[positif]
