In [1]:
import csv
import chardet
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords

In [2]:
# Download the NLTK 'stopwords' corpus; stopwords.words('english') is read from it further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Mount Google Drive at /content/drive; every data and model path used below lives under
# /content/drive/MyDrive. This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Step 1: Load GloVe word embeddings
# Every line of the file is split on whitespace: the first field is used as
# the dictionary key and the remaining fields are converted to a float32 vector.
embeddings_index = {}
with open('/content/drive/MyDrive/For Capstone/Tensorflow Words Embedding/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [5]:
# Step 2: Create embedding matrix and vocabulary
embedding_dim = 100  # Dimensionality of the word embeddings

# vocab[i] is the token whose vector is stored in embedding_matrix[i]; both
# follow the iteration order of embeddings_index.
vocab = list(embeddings_index)
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for i, word in enumerate(vocab):
    embedding_matrix[i] = embeddings_index[word]

In [6]:
# Step 3: Convert embedding matrix to TensorFlow embedding
# One row per entry of embeddings_index, seeded from embedding_matrix and
# created with trainable=False.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(embeddings_index),
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    trainable=False,
)

In [7]:
# Step 4: Test the TensorFlow embedding
# Declare a symbolic int32 input of shape (1,) per sample and pass it through
# embedding_layer; embedding_output is the resulting symbolic tensor.
word_input = tf.keras.Input(shape=(1,), dtype=tf.int32)
embedding_output = embedding_layer(word_input)


In [8]:
# Step 5: Build and compile a model that uses the embedding layer
# Layers are stacked in order: embedding -> average over the sequence axis ->
# 128-unit ReLU layer -> single sigmoid output.
# NOTE(review): no fit() call appears anywhere in this notebook, so the Dense
# layers are saved/converted below with their initial weights - confirm this
# is intended.
model = tf.keras.models.Sequential()
model.add(embedding_layer)
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [9]:
# Example
# Look up a single token: find its position in vocab, then read the row at
# that position from embedding_matrix.

word = "machine"
# list.index raises ValueError when the token is not present in vocab.
word_index = vocab.index(word)
embedded_word = embedding_matrix[word_index]
print(embedded_word)

[-0.65364999  0.49419001 -0.26245001 -0.20722    -0.11413     0.35701001
  1.04540002  0.21881001  0.52768999  0.60606003  0.42521    -0.65169001
  0.15318    -0.14797001  0.12650999 -0.017124    0.45324999  0.37165999
 -0.26846999 -0.26269999  0.43869001 -0.016615    0.12714    -0.54707998
  0.089084    0.24336    -0.34415001  0.0026505  -0.094268    0.056114
  0.46366     0.68786001 -0.20631    -0.088003    0.32153001 -0.91399002
 -0.080976   -0.90761     0.92888999 -0.68032998  0.23801    -0.37469
 -0.43278    -0.19243    -0.23711    -0.73040998 -0.50591999 -0.30237001
  0.0017281  -0.60922998 -0.21046001  0.47402999  0.37333     1.24749994
  0.62989998 -1.52919996 -0.32403001  0.59680998  0.97994     0.59755999
  0.67624998  0.28222999 -0.26747999  1.42499995 -0.34419     0.25211999
  0.30239999 -0.26582    -0.22583     0.53783    -0.44439    -0.24281
  0.38001001  0.085317    0.49693999  0.24058001  0.20611     0.023896
 -0.53078002  0.12086     1.16270006 -0.0053908  -0.66131997 

In [10]:
# Step 6: Prepare the search data
# Stack the vectors in vocab order so that row i of search_data belongs to vocab[i].
search_data = np.array([embeddings_index[word] for word in vocab])


In [11]:
# Step 7: Define the search function
def search(query, top_k=5):
    """Return the closest vocabulary words for every entry of ``query``.

    Relies on the notebook-level ``embeddings_index`` (token -> vector),
    ``search_data`` (matrix of vectors) and ``vocab`` (row index -> token).

    Args:
        query: Iterable of strings. Each string is split on whitespace and
            the vectors of its tokens that exist in ``embeddings_index`` are
            averaged into one query vector.
        top_k: Maximum number of neighbours to return per entry.

    Returns:
        A list with one inner list per entry of ``query``, in the same order.
        Each inner list holds up to ``top_k`` words from ``vocab``, ordered
        from most to least cosine-similar to the query vector. The inner list
        is empty when ``top_k`` is not positive or when none of the entry's
        tokens has an embedding.
    """
    top_words_list = []
    for query_word in query:
        known_vectors = [
            embeddings_index[token]
            for token in query_word.split()
            if token in embeddings_index
        ]
        # Averaging an empty list yields NaN, which cosine_similarity rejects,
        # and a slice of [-0:] would return the whole vocabulary. Emit an empty
        # result instead so the output stays aligned with `query`.
        if top_k <= 0 or not known_vectors:
            top_words_list.append([])
            continue
        query_embedding = np.mean(known_vectors, axis=0)
        similarity_scores = cosine_similarity([query_embedding], search_data).reshape(-1)
        # argsort is ascending: take the last top_k indices and reverse them.
        top_indices = similarity_scores.argsort()[-top_k:][::-1]
        top_words_list.append([vocab[i] for i in top_indices])
    return top_words_list


In [12]:
# English stop words from the NLTK corpus
stop_words = set(stopwords.words('english'))

# Extra words that are stripped from every query
additional_keywords = ["caffe", "place", "coffee", "nan", "cafe"]

# Asking User Input
user_input = input("Search: ")

# Whitespace-separated pieces of the raw input
words = user_input.split()

# Lower-case the query and keep only the runs of word characters (drops symbols)
search_keywords = re.findall(r'\b\w+\b', user_input.lower())

# Drop every token that is a stop word or one of the additional keywords
search_keywords = [
    token
    for token in search_keywords
    if not (token in stop_words or token in additional_keywords)
]

print("Filtered Input:", search_keywords)

Search: caffe that has a wifi and beautiful view
Filtered Input: ['wifi', 'beautiful', 'view']


In [13]:
top_words_list = search(search_keywords, top_k=10)

# Print the neighbours found for each keyword and collect all of them, in
# order, into one flat list.
list_of_words = []
for i, query_word in enumerate(search_keywords):
    print(f"Closest meanings for query '{query_word}':")
    print(top_words_list[i])
    print()
    list_of_words.extend(top_words_list[i])
print('List closest keyword from query: \n', list_of_words)

Closest meanings for query 'wifi':
['wifi', 'wi-fi', 'bluetooth', 'connectivity', '802.11', 'wireless', 'wi', 'broadband', 'wimax', 'wlan']

Closest meanings for query 'beautiful':
['beautiful', 'lovely', 'gorgeous', 'wonderful', 'charming', 'magnificent', 'elegant', 'fabulous', 'splendid', 'perfect']

Closest meanings for query 'view':
['view', 'views', 'fact', 'approach', 'viewed', 'clear', 'see', 'notion', 'indeed', 'what']

List closest keyword from query: 
 ['wifi', 'wi-fi', 'bluetooth', 'connectivity', '802.11', 'wireless', 'wi', 'broadband', 'wimax', 'wlan', 'beautiful', 'lovely', 'gorgeous', 'wonderful', 'charming', 'magnificent', 'elegant', 'fabulous', 'splendid', 'perfect', 'view', 'views', 'fact', 'approach', 'viewed', 'clear', 'see', 'notion', 'indeed', 'what']


In [16]:
def search_keywords(csv_file, keywords, column):
  """Return the indices of the CSV rows whose ``column`` cell contains a keyword.

  Args:
    csv_file: Path of the CSV file to scan.
    keywords: Iterable of strings; each one is tested as a substring of the cell.
    column: Index of the cell to inspect in every row.

  Returns:
    A list of zero-based row indices counted from the first record of the
    file (a header record, if present, is index 0). An index is listed once
    for every keyword that matches that row, so repeats are possible. Rows
    that have no cell at ``column`` (for example blank lines) are skipped.
  """
  # NOTE(review): this definition rebinds the notebook-level name
  # `search_keywords`, which an earlier cell uses for the filtered query list;
  # re-running that earlier code after this cell will see the function instead.
  keywords = list(keywords)  # allow one-shot iterables to be reused for every row
  row_numbers = []
  # newline='' is the mode the csv module documents for files given to csv.reader.
  with open(csv_file, 'r', newline='') as f:
    # enumerate gives the true position of each row; looking the row up by
    # value would return the first equal row and mis-number duplicates.
    for row_number, row in enumerate(csv.reader(f)):
      try:
        cell = row[column]
      except IndexError:
        continue
      row_numbers.extend(row_number for keyword in keywords if keyword in cell)
  return row_numbers

# Driver: find the CSV rows that mention any of the expanded query words and
# keep the first 20 of them.
if __name__ == '__main__':
  csv_file = '/content/drive/MyDrive/For Capstone/Collecting data/Place Detail (Scored + Keyword 1 & 2 Extracted  + Additional Feature (longlang, contact etc)).csv'
  keywords = list_of_words
  # NOTE(review): presumably index 13 is the extracted-keyword column of this CSV - confirm against the file.
  column = 13

  row_numbers = search_keywords(csv_file, keywords, column)
  # Remove repeated indices, then order them ascending by position in the file.
  unique_list = list(set(row_numbers))
  sorted_list = sorted(unique_list)
  # The cut-off keeps the 20 lowest row indices; it is not a relevance ranking.
  Place_list= sorted_list[:20]
  print(Place_list)


[6, 10, 14, 17, 25, 37, 39, 53, 55, 57, 70, 71, 75, 76, 85, 87, 88, 93, 94, 96]


In [17]:
columns_to_extract = [0, 2, 4, 5]

# newline='' is the mode the csv module documents for files given to csv.reader.
with open(csv_file, 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header row (no error if the file is empty)

    def get_data(row_numbers, column_numbers):
        """Collect selected cells from the rows listed in ``row_numbers``.

        Consumes the remaining rows of the enclosing ``reader``, so it yields
        data only while the surrounding file is open and only on its first call.

        Args:
            row_numbers: Row indices to keep, counted with the header as row 0.
            column_numbers: Cell indices to copy from each kept row.

        Returns:
            A list with one list of cell values per kept row, in file order.
        """
        wanted = set(row_numbers)  # constant-time membership test inside the loop
        data = []
        for i, row in enumerate(reader):
            # The header was consumed above, so the i-th remaining row is file row i + 1.
            if i + 1 in wanted:
                data.append([row[col] for col in column_numbers])
        return data

    data = get_data(Place_list, columns_to_extract)
    for row in data:
        print(row)

['Loko Coffee Shop Malioboro Yogyakarta', 'Komp. Ps. Kembang, Jl. Ps. Kembang, Sosromenduran, Gedong Tengen, Kota Yogyakarta, Daerah Istimewa Yogyakarta 55271, Indonesia', '4.5', '6488']
['Pupuk Bawang Batu', 'Jl. Panglima Sudirman No.116, Pesanggrahan, Kec. Batu, Kota Batu, Jawa Timur 55313, Indonesia', '4.2', '6542']
['Baraka Coffee House', 'Jl. Watumujur II No.6, Ketawanggede, Kec. Lowokwaru, Kota Malang, Jawa Timur 65145, Indonesia', '5', '1273']
['Java Dancer Coffee - Jakarta', 'Jl. Jakarta No.59, Oro-oro Dowo, Kec. Klojen, Kota Malang, Jawa Timur 65112, Indonesia', '4.5', '4085']
['My Kopi O!', 'Jl. Tenes No.14, Kauman, Kec. Klojen, Kota Malang, Jawa Timur 65119, Indonesia', '4.6', '3146']
['Starbucks Coffee Malang City Point', 'Lt. GF, Malang City Point, Jl. Terusan Dieng No.31, Pisang Candi, Kec. Sukun, Kota Malang, Jawa Timur 65146, Indonesia', '4.6', '2460']
['Cokelat Klasik Cafe Batu', '4GP8+H8P, Pesanggrahan, Batu, Batu City, East Java 65313, Indonesia', '4.4', '3580']
['Ke

In [20]:
# Save the Keras model to an .h5 file on Drive.
model.save('/content/drive/MyDrive/For Capstone/Tensorflow Words Embedding/Word_Embedding.h5')

In [21]:
# Reload the model from the .h5 file written by the previous cell
model = tf.keras.models.load_model('/content/drive/MyDrive/For Capstone/Tensorflow Words Embedding/Word_Embedding.h5')

# Convert the model to JSON
model_json = model.to_json()

# Save the JSON model to a file next to the .h5 file
with open('/content/drive/MyDrive/For Capstone/Tensorflow Words Embedding/Word_Embedding.json', 'w') as json_file:
    json_file.write(model_json)

In [None]:
# !cp -r "/content/Word_Embedding.h5" "/content/drive/MyDrive/For Capstone"
#!cp -r "/content/Word_Embedding.json" "/content/drive/MyDrive/For Capstone"


In [22]:
# Target directory for the SavedModel export. This is the same Drive folder
# that already holds the GloVe file and the .h5/.json exports above.
export_dir = '/content/drive/MyDrive/For Capstone/Tensorflow Words Embedding'

# Export the model in SavedModel format; the TFLite converter below reads it from export_dir.
tf.saved_model.save(model, export_dir = export_dir)



In [23]:
# Select mode of optimization
mode = "Speed"

# Translate the mode into the name of a tf.lite.Optimize member ('Storage' and
# 'Speed' have dedicated members, anything else uses DEFAULT) and resolve only
# that one member.
optimization = getattr(
    tf.lite.Optimize,
    {'Storage': 'OPTIMIZE_FOR_SIZE', 'Speed': 'OPTIMIZE_FOR_LATENCY'}.get(mode, 'DEFAULT'),
)

In [24]:
# Create a TFLite converter that reads the SavedModel stored in export_dir
converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)

# Set the optimizations
converter.optimizations = [optimization]

# Invoke the converter to finally generate the TFLite model
tflite_model = converter.convert()



In [26]:
import pathlib

# Write the converted model to Drive; write_bytes returns the number of bytes
# written, which is what the cell displays as its output.
tflite_model_file = pathlib.Path('/content/drive/MyDrive/For Capstone/Tensorflow Words Embedding/model for Word Embedding + Searching with Cosine Similarity.tflite')
tflite_model_file.write_bytes(tflite_model)

40015920

In [None]:
#!cp -r "/content/model for Word Embedding + Searching with Cosine Similarity.tflite" "/content/drive/MyDrive/For Capstone"