In [2]:
import csv
import chardet
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords

In [3]:
# Download NLTK's 'stopwords' corpus; stopwords.words('english') is read from it in a later cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Mount Google Drive at /content/drive; the GloVe file and the CSV read by later
# cells are addressed by paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Step 1: Load GloVe word embeddings
# Every line is split on whitespace: the first field is the token, the
# remaining fields are parsed as its float32 vector components.
embeddings_index = {}
with open('/content/drive/MyDrive/For Capstone/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')


In [6]:
# Step 2: Create embedding matrix and vocabulary
embedding_dim = 100  # Dimensionality of the word embeddings

# vocab keeps the dictionary's iteration order, so vocab[i] is the token whose
# vector is stored in row i of embedding_matrix.
vocab = list(embeddings_index)
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for row, token in enumerate(vocab):
    embedding_matrix[row] = embeddings_index[token]

In [7]:
# Step 3: Convert embedding matrix to TensorFlow embedding
# The layer is seeded with embedding_matrix and frozen (trainable=False).
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(embeddings_index),
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    trainable=False,
)

In [8]:
# Step 4: Test the TensorFlow embedding
# Symbolic input carrying a single int32 value per sample.
word_input = tf.keras.Input(shape=(1,), dtype=tf.int32)
# Apply the embedding layer to the symbolic input.
embedding_output = embedding_layer(word_input)


In [9]:
# Step 5: Build and compile a model that uses the embedding layer
# Layers are stacked in order: embedding lookup -> average pooling ->
# 128-unit ReLU layer -> single sigmoid output unit.
model = tf.keras.models.Sequential()
model.add(embedding_layer)
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [12]:
# Step 6: Prepare the search data
# Stack the vectors in vocab order so that row i of search_data belongs to vocab[i].
search_data = np.array([embeddings_index[word] for word in vocab])


In [13]:
# Step 7: Define the search function
def search(query, top_k=5):
    """Return the vocabulary words most similar to each query phrase.

    Each phrase is split on whitespace, the embeddings of its known tokens are
    averaged, and the average is compared with every row of ``search_data``
    using cosine similarity.

    Args:
        query: Iterable of query phrases. A single string is treated as one
            phrase instead of being iterated character by character.
        top_k: Number of closest words to return per phrase.

    Returns:
        A list with one entry per phrase, in input order. Each entry is a list
        of up to ``top_k`` words from ``vocab``, most similar first. The entry
        is an empty list when none of the phrase's tokens are in
        ``embeddings_index`` or when ``top_k`` is not positive.
    """
    if isinstance(query, str):
        query = [query]

    top_words_list = []
    for query_word in query:
        token_vectors = [
            embeddings_index[token]
            for token in query_word.split()
            if token in embeddings_index
        ]
        # Averaging an empty list yields NaN, which cannot be compared against
        # search_data; and a slice of [-0:] would return the whole vocabulary.
        # Keep the output aligned with the input by emitting an empty result.
        if not token_vectors or top_k <= 0:
            top_words_list.append([])
            continue

        query_embedding = np.mean(token_vectors, axis=0)
        similarity_scores = cosine_similarity([query_embedding], search_data).reshape(-1)
        # argsort is ascending: take the last top_k indices and reverse them.
        top_indices = similarity_scores.argsort()[-top_k:][::-1]
        top_words_list.append([vocab[i] for i in top_indices])
    return top_words_list


In [20]:
stop_words = set(stopwords.words('english'))

# Words to remove
additional_keywords = ["caffe", "place", "coffee", "nan", "cafe"]

# Asking User Input
user_input = input("Search: ")

# Preprocess the search query: lower-case it and keep only runs of word
# characters, which drops punctuation such as commas.
# NOTE: a later cell defines a function that is also named `search_keywords`;
# once that cell has run, this name refers to the function until this cell is
# executed again.
search_keywords = re.findall(r'\b\w+\b', user_input.lower())

# Remove stop words and additional keywords from the search keywords
search_keywords = [word for word in search_keywords if word not in stop_words and word not in additional_keywords]

print("Filtered Input:", search_keywords)

Search: cozy, affordable, cheap
Filtered Input: ['cozy', 'affordable', 'cheap']


In [23]:
# One list of nearest words per keyword, in the same order as search_keywords.
top_words_list = search(search_keywords, top_k=10)

# Report each keyword's neighbours and collect them all into one flat list.
list_of_words = []
for query_word, neighbours in zip(search_keywords, top_words_list):
    print(f"Closest meanings for query '{query_word}':")
    print(neighbours)
    print()
    list_of_words.extend(neighbours)
print('List closest keyword from query: \n', list_of_words)

Closest meanings for query 'cozy':
['cozy', 'cosy', 'comfy', 'homey', 'cramped', 'dingy', 'spacious', 'clubby', 'rustic', 'shabby']

Closest meanings for query 'affordable':
['affordable', 'inexpensive', 'cheap', 'expensive', 'cheaper', 'efficient', 'unaffordable', 'accessible', 'priced', 'environmentally']

Closest meanings for query 'cheap':
['cheap', 'inexpensive', 'cheaper', 'expensive', 'affordable', 'easy', 'plentiful', 'pricey', 'buying', 'scarce']

List closest keyword from query: 
 ['cozy', 'cosy', 'comfy', 'homey', 'cramped', 'dingy', 'spacious', 'clubby', 'rustic', 'shabby', 'affordable', 'inexpensive', 'cheap', 'expensive', 'cheaper', 'efficient', 'unaffordable', 'accessible', 'priced', 'environmentally', 'cheap', 'inexpensive', 'cheaper', 'expensive', 'affordable', 'easy', 'plentiful', 'pricey', 'buying', 'scarce']


In [50]:
def search_keywords(csv_file, keywords, column):
  """Find the CSV rows whose `column` cell contains any of `keywords`.

  NOTE: an earlier cell binds the name `search_keywords` to the list of
  filtered query words; defining this function rebinds that name.

  Args:
    csv_file: Path of the CSV file to scan.
    keywords: Iterable of strings; matching is a case-sensitive substring test.
    column: Zero-based index of the cell to inspect in every row.

  Returns:
    A list of zero-based row positions counted from the first line of the
    file (a header line, if present, is position 0). A position is appended
    once for every keyword that matches, so it can occur several times.
    Rows that have no cell at `column` are skipped.
  """
  row_numbers = []
  # newline='' lets the csv module handle line endings inside quoted fields.
  with open(csv_file, 'r', newline='') as f:
    # enumerate gives the true position of every row; looking the row up by
    # value would report the first occurrence for duplicated rows.
    for row_number, row in enumerate(csv.reader(f)):
      try:
        cell = row[column]
      except IndexError:
        continue  # blank or short row
      for keyword in keywords:
        if keyword in cell:
          row_numbers.append(row_number)

  return row_numbers

# Driver: look up the expanded keyword list in the places CSV and keep the
# 20 lowest matching row numbers.
if __name__ == '__main__':
  csv_file = '/content/drive/MyDrive/For Capstone/Place Detail - Hasil_Ekstraksi(One Keyword) + Reranked.csv'
  keywords = list_of_words
  column = 13  # zero-based index of the CSV cell that is searched

  row_numbers = search_keywords(csv_file, keywords, column)
  # The same row number can be reported more than once; collapse duplicates.
  unique_list = list(set(row_numbers))
  sorted_list = sorted(unique_list)
  # Keep only the first 20 row numbers in ascending order.
  Place_list= sorted_list[:20]
  print(Place_list)


[2, 7, 10, 11, 14, 15, 23, 24, 25, 26, 28, 29, 32, 33, 34, 35, 36, 37, 41, 47]


In [51]:
columns_to_extract = [0, 2, 4]


def get_data(row_numbers, column_numbers, path=None):
    """Return the selected cells of the selected rows of a CSV file.

    The file is opened and read on every call, so the function can be called
    repeatedly and does not depend on a reader left open by surrounding code.

    Args:
        row_numbers: Zero-based row positions counted from the first line of
            the file. The first line is treated as a header and is never
            returned, so position 1 is the first data row.
        column_numbers: Zero-based indices of the cells to keep, in output order.
        path: CSV file to read; defaults to the module-level ``csv_file``.

    Returns:
        A list with one ``[cell, ...]`` list per selected row, in file order.
    """
    wanted = set(row_numbers)  # constant-time membership test per row
    source = csv_file if path is None else path
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(source, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        # i counts data rows from 0, so i + 1 is the position within the file.
        return [
            [row[col] for col in column_numbers]
            for i, row in enumerate(reader)
            if i + 1 in wanted
        ]


data = get_data(Place_list, columns_to_extract)
for row in data:
    print(row)

['Cokelat Klasik Cafe', 'Jalan Joyo Agung, Merjosari, Lowokwaru, Tlogomas, Kec. Lowokwaru, Kota Malang, Jawa Timur 65144, Indonesia', '4.4']
['Baraka Coffee House', 'Jl. Watumujur II No.6, Ketawanggede, Kec. Lowokwaru, Kota Malang, Jawa Timur 65145, Indonesia', '5.0']
['My Kopi O!', 'Jl. Tenes No.14, Kauman, Kec. Klojen, Kota Malang, Jawa Timur 65119, Indonesia', '4.6']
['Lafayette Coffee & Eatery', 'Jl. Semeru No.2, RW.4, Oro-oro Dowo, Kec. Klojen, Kota Malang, Jawa Timur 65119, Indonesia', '4.6']
['Noise Coffee', 'Jl. Sulfat, Pandanwangi, Kec. Blimbing, Kota Malang, Jawa Timur, Indonesia', '5.0']
['Bendino Beverages', 'Fisip, Universitas Brawijaya, Ketawanggede, Kec. Lowokwaru, Kota Malang, Jawa Timur 65145, Indonesia', '5.0']
['Kuma Bake and Coffee', 'Kantin Fakultas Teknologi Pertanian Universitas Brawijaya, Jl. Veteran, Ketawanggede, Kec. Lowokwaru, Kota Malang, Jawa Timur 65145, Indonesia', '5.0']
['FF GARDENSPACE', 'Jl. Karanglo Indah Atas No.12, Jajar, Tanjungtirto, Kec. Singos

In [52]:
# Save the model to 'Word_Embedding.h5' in the current working directory.
model.save('Word_Embedding.h5')

In [53]:
# Reload the model from the 'Word_Embedding.h5' file
model = tf.keras.models.load_model('Word_Embedding.h5')  # rebinds `model` to the reloaded copy

# Convert the model to JSON
# NOTE(review): to_json() is expected to serialise the architecture only, not
# the weights — confirm before relying on the JSON file alone.
model_json = model.to_json()

# Save the JSON model to a file
with open('Word_Embedding.json', 'w') as json_file:
    json_file.write(model_json)