In [1]:
#Define Libraries
import numpy as np
import csv
import chardet
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
#Calculate the cosine similarity between two texts

def calculate_cosine_similarity(text1, text2, threshold=0.6):
    if text1 is None or text2 is None:
        return 0.0
    
    vectorizer = CountVectorizer(min_df=1).fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1))
    
    if similarity[0][0] >= threshold:
        return similarity[0][0]
    else:
        return 0.0

In [3]:
#Reading csv

def read_csv_data(file_path, column_name):
    data = []
    with open(file_path, 'rb') as file:
        raw_data = file.read()
        encoding = chardet.detect(raw_data)['encoding']
        decoded_data = raw_data.decode(encoding)
        csv_reader = csv.DictReader(decoded_data.splitlines())
        for row in csv_reader:
            keyword = row[column_name]
            data.append(keyword)
    return data

In [4]:
def get_row_by_index(file_path, index, encoding='utf-8'):
    with open(file_path, 'r', encoding=encoding) as file:
        csv_reader = csv.reader(file)
        rows = list(csv_reader)
        if index < len(rows):
            return rows[index]
    return None

In [5]:
# Fungsi untuk mendapatkan kolom tertentu dari baris dalam file CSV berdasarkan indeks
def get_columns_by_index(file_path, index, columns, encoding='utf-8'):
    rows = []
    with open(file_path, 'r', encoding=encoding) as file:
        csv_reader = csv.reader(file)
        rows = list(csv_reader)
    
    if index < len(rows):
        selected_columns = [rows[index][column_index] for column_index in columns]
        return selected_columns
    
    return None



In [15]:
stop_words = set(stopwords.words('english'))

# Words to remove
additional_keywords = ["caffe", "place", "coffee", "nan", "cafe"]

# Asking User Input
user_input = input("Search: ")

# Split the input into individual words
words = user_input.split()

# Preprocess the search query by splitting and removing symbols
search_keywords = re.findall(r'\b\w+\b', user_input.lower())

# Remove stop words and additional keywords from the search keywords
search_keywords = [word for word in search_keywords if word not in stop_words and word not in additional_keywords]

# Join the remaining words back together
filtered_input = " ".join(search_keywords)

print("Filtered Input:", filtered_input)

Search: wifi
Filtered Input: wifi


In [19]:
#Retrieve extraction data from CSV file
csv_data = read_csv_data('Place Detail - Hasil_Ekstraksi(One Keyword) + Reranked.csv', 'One_Keywords')

In [20]:
#Calculate cosine similarity between each extracted data and user input
similarities = []
for data in csv_data:
    similarity = calculate_cosine_similarity(data, filtered_input)
    similarities.append(similarity)

In [21]:
#Display the largest cosine similarity result
top_indices = np.argsort(similarities)[-8:][::-1]



# Sort top_indices based on the 'score' column
# sorted_indices = sorted(top_indices, key=lambda x: float(get_row_by_index('Hasil_Ekstraksi_Clean.csv', x)[10].replace(',', '')), reverse=True)
sorted_indices = sorted(top_indices, key=lambda x: float("".join(c for c in get_row_by_index('Place Detail - Hasil_Ekstraksi(One Keyword) + Reranked.csv', x)[9] if c.isdigit())), reverse=True)


for index in sorted_indices:
    max_similarity = similarities[index]
    most_similar_text = csv_data[index]
    most_similar_row = get_row_by_index('Place Detail - Hasil_Ekstraksi(One Keyword) + Reranked.csv', index)
    columns_to_display = [0, 1, 2, 4, 5]  
    selected_columns = get_columns_by_index('Place Detail - Hasil_Ekstraksi(One Keyword) + Reranked.csv', index, columns_to_display)

    if selected_columns is not None:
        print("Cosine similarity:", max_similarity)
        print("Teks yang paling mirip:", most_similar_text)
        print("Kolom yang terkait:")
        for column_index, value in zip(columns_to_display, selected_columns):
            column_name = csv_data[0][column_index]  # Retrieve the column name from the header row
            print(f"{column_name}: {value}")
        print("------------------------------")

Cosine similarity: 0.0
Teks yang paling mirip: relaxing, shop, suitable, gaeesss, casual, warung, moment
Kolom yang terkait:
c: Kopi Perempat (es kopi, tubruk kopi)
a: ChIJ_0dtc8op1i0Rz9SRlnTQHKs
f: Jl. Semeru No.16, Oro-oro Dowo, Kec. Klojen, Kota Malang, Jawa Timur 65119, Indonesia
s: 4.9
,: 8
------------------------------
Cosine similarity: 0.0
Teks yang paling mirip: parking, employees, coffee, comfortable, prices, losduk, thankyou
Kolom yang terkait:
c: Kopi Hompimpa
a: ChIJ-WrowqH51y0RDac3a0yY-lY
f: Jl. Ikan Tombro No.25, Mojolangu, Kec. Lowokwaru, Kota Malang, Jawa Timur 65142, Indonesia
s: 4.9
,: 8
------------------------------
Cosine similarity: 0.0
Teks yang paling mirip: menu, signature, work, plugs, delicious, suitable, hanging
Kolom yang terkait:
c: Tahwa Kurnia 4
a: ChIJdWHxIpEr1i0RzM2ehDiHDL4
f: 2MFC+PG8, Madyopuro, Kedungkandang, Malang City, East Java 65139, Indonesia
s: 4.9
,: 8
------------------------------
Cosine similarity: 0.0
Teks yang paling mirip: ginger, dr

In [67]:
columns_to_display = [0, 1, 2, 4, 5]  
selected_columns = get_columns_by_index('Hasil_Ekstraksi_Clean.csv', max_index, columns_to_display)

if selected_columns is not None:
    print("Cosine similarity terbesar:", max_similarity)
    print("Teks yang paling mirip:", most_similar_text)
    print("Kolom yang terkait:")
    for column_index, value in zip(columns_to_display, selected_columns):
        column_name = csv_data[0][column_index]  # Retrieve the column name from the header row
        print(f"{column_name}: {value}")


Cosine similarity terbesar: 0.0
Teks yang paling mirip: speed wifi, wifi connection, place cup, coffee stingy, cozy, brotherhood coffee, wifi guys, free wifi, coffee variation, cozy place
Kolom yang terkait:
i: Kopi Ireng Ndeso
m: ChIJT8D01Rko1i0Rou4UsZMsDqs
p: Jl. A. Yani No.190, Bokor, Turen, Kec. Turen, Kabupaten Malang, Jawa Timur 65175, Indonesia
i: 4.4
e: 75


In [10]:
for index in sorted_indices:
    max_similarity = similarities[index]
    if max_similarity >= 0.6:
        most_similar_text = csv_data[index]
        most_similar_row = get_row_by_index('Hasil_Ekstraksi_Clean.csv', index)
        selected_columns = get_columns_by_index('Hasil_Ekstraksi_Clean.csv', index, columns_to_display)

        if selected_columns is not None:
            print("Cosine similarity:", max_similarity)
            print("Teks yang paling mirip:", most_similar_text)
            print("Kolom yang terkait:")
            for column_index, value in zip(columns_to_display, selected_columns):
                column_name = csv_data[0][column_index]  # Retrieve the column name from the header row
                print(f"{column_name}: {value}")
            print("------------------------------")

Cosine similarity: 0.6933752452815365
Teks yang paling mirip: perfect relaxing, coffee brother, trusted coffee, los wifi, wifi smooth, coffee toilets, free wifi, wifi cheap, friends wifi, coffee healthy
Kolom yang terkait:
i: Kopi Ireng Ndeso
m: ChIJT8D01Rko1i0Rou4UsZMsDqs
p: Jl. A. Yani No.190, Bokor, Turen, Kec. Turen, Kabupaten Malang, Jawa Timur 65175, Indonesia
i: 4.4
e: 75
------------------------------
Cosine similarity: 0.6246950475544243
Teks yang paling mirip: speed wifi, wifi connection, place cup, coffee stingy, cozy, brotherhood coffee, wifi guys, free wifi, coffee variation, cozy place
Kolom yang terkait:
i: Kopi Jelata
m: ChIJhRhMJZb81y0RIwWtvzpwFXA
p: b, Jl. Mertojoyo Sel. No.17b, Merjosari, Kec. Lowokwaru, Kota Malang, Jawa Timur 65144, Indonesia
i: 4.5
e: 280
------------------------------
Cosine similarity: 0.6
Teks yang paling mirip: inexpensive, student pockets, group discussion, visitor cozy, free wifi, theres wifi, friends friendly, wifi mnc, nice place, studied 