In [17]:
import numpy as np
import csv
import chardet
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity between two texts
def calculate_cosine_similarity(text1, text2, threshold=0.6):
    """Return the bag-of-words cosine similarity of two texts, gated by a threshold.

    Both texts are turned into token-count vectors with ``CountVectorizer``
    and compared with ``cosine_similarity``.

    Args:
        text1: First text, or None.
        text2: Second text, or None.
        threshold: Minimum similarity to report; lower scores collapse to 0.0.

    Returns:
        The similarity as a float when it is at least ``threshold``,
        otherwise 0.0. Also 0.0 when either text is None or empty, or when
        no vocabulary can be built from the two texts.
    """
    if not text1 or not text2:
        return 0.0

    try:
        counts = CountVectorizer(min_df=1).fit_transform([text1, text2])
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary ends up empty
        # (no token survives tokenization); treat that as "nothing in common"
        # instead of aborting the whole search.
        return 0.0

    # Row slices stay 2-D, which is the shape cosine_similarity expects; the
    # result is a 1x1 matrix holding the single pairwise score.
    score = float(cosine_similarity(counts[0:1], counts[1:2])[0, 0])
    return score if score >= threshold else 0.0

def read_csv_data(file_path):
    """Load a CSV file with a header row into a list of row dictionaries.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(data, column_names)`` tuple. ``data`` is a list with one dict per
        record, keyed by header name. ``column_names`` is the list of header
        names, empty when the file has no header row.
    """
    # newline='' hands newline handling to the csv module, so quoted fields
    # that contain line breaks are parsed intact. 'utf-8-sig' reads plain
    # UTF-8 as well and drops a leading BOM that would otherwise be glued to
    # the first column name.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as file:
        csv_reader = csv.DictReader(file)
        # fieldnames is None for an empty file; normalise to an empty list.
        column_names = list(csv_reader.fieldnames or [])
        data = list(csv_reader)
    return data, column_names

def get_row_by_index(data, index):
    """Return ``data[index]``, or None when the index is out of range.

    Args:
        data: Sequence of rows.
        index: Position of the wanted row; negative values count from the end.

    Returns:
        The row at ``index``, or None if ``index`` falls outside ``data``.
    """
    # Guard both ends: an index below -len(data) would raise IndexError
    # instead of yielding the None that signals "no such row".
    if -len(data) <= index < len(data):
        return data[index]
    return None

def get_columns_by_index(row, column_names):
    """Pick the values of the requested columns out of a row.

    Args:
        row: Mapping from column name to value, or None when no row exists.
        column_names: Names of the columns to extract, in output order.

    Returns:
        A list with the row's value for each name in ``column_names``, or
        None when ``row`` is None.

    Raises:
        KeyError: If a requested column is missing from ``row``.
    """
    # A missing row is passed through as None rather than failing on the
    # subscript below.
    if row is None:
        return None
    return [row[column_name] for column_name in column_names]

# Ask the user for a search query
user_input = input("Search: ")

# Retrieve extraction data and column names from CSV file
csv_data, column_names = read_csv_data('Hasil_Ekstraksi_Clean.csv')

# Calculate cosine similarity between each row's keywords and the user input
similarities = [
    calculate_cosine_similarity(row['Keywords'], user_input)
    for row in csv_data
]

columns_to_display = ['Name']  # Names of the columns to display

# np.max / np.argmax raise ValueError on an empty sequence, so an empty
# dataset is treated as "no match".
max_similarity = np.max(similarities) if similarities else 0.0

if max_similarity > 0.0:
    # Pick the row with the largest cosine similarity
    max_index = np.argmax(similarities)
    most_similar_row = get_row_by_index(csv_data, max_index)
    selected_columns = get_columns_by_index(most_similar_row, columns_to_display)
else:
    # calculate_cosine_similarity returns 0.0 for every row below its
    # threshold, so an all-zero result means nothing matched. Reporting row 0
    # as the "best" hit would be misleading.
    max_index = None
    most_similar_row = None
    selected_columns = None

if selected_columns is not None:
    print("Cosine similarity terbesar:", max_similarity)
    print("Baris yang terkait:")
    for column_name, value in zip(columns_to_display, selected_columns):
        print(f"{column_name}: {value}")
else:
    # "No matching result."
    print("Tidak ada hasil yang cocok.")


Search: wifi
Cosine similarity terbesar: 0.6933752452815365
Teks yang paling mirip: {'Name': 'Kopi Ireng Ndeso', 'Place ID': 'ChIJT8D01Rko1i0Rou4UsZMsDqs', 'Formatted Address': 'Jl. A. Yani No.190, Bokor, Turen, Kec. Turen, Kabupaten Malang, Jawa Timur 65175, Indonesia', 'Category': 'cafe, store, point_of_interest, food, establishment', 'rating': '4.4', 'total_reviews': '75', 'Price Level': '1.0', 'Review Text': 'Its suitable for hanging out while playing games los wifi\n\nCheap and cheerful free wifi perfect for relaxing enjoying delicious cheap coffee Toilets are also available\n\nIts good for hanging out with friends wifi is also smooth Thats great \n\nDrinking coffee first here wont regret it always want to drink coffee all the time until morning guaranteed coffee is always healthy and reliable from trusted coffee bean seeds\n\nP black coffee one brother', 'rating_normalized': '8.500.000.000.000.000', 'reviews_normalized': '42.841.428.819.544', 'score': '5.962.852.428.645.860', 'ra