# Import libraries

In [1]:
import csv
import io

import chardet
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#Calculate the cosine similarity between two texts

def calculate_cosine_similarity(text1, text2, threshold=0.6):
    if text1 is None or text2 is None:
        return 0.0
    
    vectorizer = CountVectorizer(min_df=1).fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1))
    
    if similarity[0][0] >= threshold:
        return similarity[0][0]
    else:
        return 0.0

In [3]:
#Reading csv

def read_csv_data(file_path, column_name):
    data = []
    with open(file_path, 'rb') as file:
        raw_data = file.read()
        encoding = chardet.detect(raw_data)['encoding']
        decoded_data = raw_data.decode(encoding)
        csv_reader = csv.DictReader(decoded_data.splitlines())
        for row in csv_reader:
            keyword = row[column_name]
            data.append(keyword)
    return data

In [4]:
def get_row_by_index(file_path, index, encoding='utf-8'):
    with open(file_path, 'r', encoding=encoding) as file:
        csv_reader = csv.reader(file)
        rows = list(csv_reader)
        if index < len(rows):
            return rows[index]
    return None

In [5]:
# Fungsi untuk mendapatkan kolom tertentu dari baris dalam file CSV berdasarkan indeks
def get_columns_by_index(file_path, index, columns, encoding='utf-8'):
    row = get_row_by_index(file_path, index, encoding)
    if row is not None:
        selected_columns = [row[column_index] for column_index in columns]
        return selected_columns
    return None



In [6]:
# Prompt the user for the search query

user_input = input("Search: ")

Search: milk


In [7]:
# Load the 'Keywords' column of the extraction results (one list entry per data row; the header row is not included)
csv_data = read_csv_data('Hasil_Ekstraksi_Clean.csv', 'Keywords')

In [8]:
#Calculate cosine similarity between each extracted data and user input
similarities = []
for data in csv_data:
    similarity = calculate_cosine_similarity(data, user_input)
    similarities.append(similarity)

In [9]:
#Display the largest cosine similarity result

max_similarity = np.max(similarities)
max_index = np.argmax(similarities)
most_similar_text = csv_data[max_index]
most_similar_row = get_row_by_index('Hasil_Ekstraksi_Clean.csv', max_index)

#print("Cosine similarity terbesar:", max_similarity)
#print("Teks yang paling mirip:", most_similar_text)
#print("Baris yang terkait:")
#print(most_similar_row)

In [10]:

columns_to_display = [0, 1, 2]  
selected_columns = get_columns_by_index('Hasil_Ekstraksi_Clean.csv', max_index, columns_to_display)

if selected_columns is not None:
    print("Cosine similarity terbesar:", max_similarity)
    print("Teks yang paling mirip:", most_similar_text)
    print("Kolom yang terkait:")
    #for column_index, value in zip(columns_to_display, selected_columns):
        #column_name = csv_data[0][column_index]  # Mendapatkan nama kolom dari baris header
        #print(f"{column_name}: {value}")
    for column_name, value in zip(columns_to_display, selected_columns):
        print(f"{column_name}: {value}")



Cosine similarity terbesar: 0.0
Teks yang paling mirip: implies pujon, area cafe, cafe downside, pujon kidul, cafe enjoy, located pujon, cafe rides, sawah pujon, cafe sawah, kidul cafes
Kolom yang terkait:
0: Name
1: Place ID
2: Formatted Address
