In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from scipy.sparse import csr_matrix
from keras.models import load_model
from tensorflow.keras.models import Model
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial import distance

In [None]:
# Mount Google Drive at /content/drive; the corpus, dataset and saved models
# used by the cells below are all read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def read_files(path="/content/drive/MyDrive/FYP/Text_Files"):
  """Load every text file of a directory as a single-line document.

  Args:
    path: Directory that holds the corpus text files. Defaults to the
      project's corpus folder on the mounted Drive.

  Returns:
    A tuple ``(docs, docs_names)`` of parallel lists: the contents of each
    file with newlines replaced by spaces, and the matching file name.
    Entries are ordered by file name.
  """
  docs = []
  docs_names = []

  # os.listdir returns entries in arbitrary order; sorting keeps document
  # indices reproducible from one run to the next.
  for name in sorted(os.listdir(path)):
    full_path = os.path.join(path, name)
    # Skip anything that is not a regular file (e.g. a checkpoint folder),
    # since open() would raise on a directory.
    if not os.path.isfile(full_path):
      continue
    with open(full_path, "r", encoding='Windows-1252') as file:
      data = file.read()
    docs.append(data.replace("\n", " "))
    docs_names.append(name)

  return docs, docs_names

In [None]:
def tfidf_vectorization(docs):
  """Fit a new TF-IDF vectorizer on ``docs``.

  Returns the fitted vectorizer together with the matrix that
  ``fit_transform`` produced for those same documents.
  """
  tfidf_model = TfidfVectorizer()
  doc_term_matrix = tfidf_model.fit_transform(docs)
  return tfidf_model, doc_term_matrix

In [None]:
def load_dataframe(path='/content/drive/MyDrive/FYP/dataset.csv'):
  """Read the project dataset CSV into a pandas DataFrame.

  Args:
    path: Location of the CSV file. Defaults to the dataset stored on the
      mounted Drive, so existing zero-argument calls keep working.

  Returns:
    The DataFrame produced by ``pd.read_csv`` for that file.
  """
  return pd.read_csv(path)

In [None]:
def encoding(encoder, Z):
  """Encode ``Z`` one row at a time.

  Each ``Z[i]`` is passed to ``encoder.predict`` and the prediction is
  flattened. Returns a list with one flattened prediction per row, in
  row order.
  """
  # Rows are fed to the encoder individually, exactly as indexed from Z.
  return [encoder.predict(Z[row]).flatten() for row in range(len(Z))]

In [None]:
def getQueryDoc(file):
  """Return the text of the query document as a single line.

  The file at path ``file`` is decoded as Windows-1252 and every newline
  is replaced by a space.
  """
  with open(file, "r", encoding='Windows-1252') as handle:
    return handle.read().replace("\n", " ")

In [None]:
def main(query, n_clusters=5, top_k=10):
  """Retrieve the corpus documents most similar to a query document.

  Pipeline: TF-IDF vectorise the corpus, compress every vector with the
  encoder part of the saved autoencoder, cluster the embeddings with
  K-Means, assign the query to the cluster with the nearest centroid and
  score the query against each document of that cluster with the saved
  siamese model.

  Args:
    query: Path of the query text file.
    n_clusters: Number of K-Means clusters to build over the corpus.
    top_k: Maximum number of matches to return.

  Returns:
    A tuple ``(Document_Names, Document, Similarity_Index)`` with the
    names, texts and siamese scores of the best matches, highest score
    first. The same ranking is also printed as a table.
  """
  docs, names = read_files()
  vectorizer, tfidf = tfidf_vectorization(docs)

  # Wrapping in csr_matrix guarantees todense() yields an np.matrix, so
  # indexing one row keeps the 2-D (1, n_features) shape handed to predict.
  Z = csr_matrix(tfidf).todense()

  model = load_model('/content/drive/MyDrive/FYP/saved_model/autoencoder.h5')

  # extracting encoder part
  encoder = Model(inputs=model.input, outputs=model.layers[-9].output)

  # One flattened embedding per corpus document, in the order of `names`.
  compress = encoding(encoder, Z)

  # Fitting K-Means to the dataset; a fixed seed keeps the cluster
  # assignment (and therefore the ranking) reproducible between runs.
  kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
  y_kmeans = kmeans.fit_predict(compress)

  query_text = getQueryDoc(query)
  query_tfidf = csr_matrix(vectorizer.transform([query_text])).todense()

  # Take the embedding width from the encoder output instead of a
  # hard-coded size, so a retrained autoencoder does not break the reshape.
  query_embedding = encoder.predict(query_tfidf).flatten()

  # The query belongs to the cluster whose centroid is closest to it.
  centroid_distances = [
      distance.euclidean(query_embedding, centroid)
      for centroid in kmeans.cluster_centers_
  ]
  k = int(np.argmin(centroid_distances))

  # The siamese model is fed one (1, embedding_dim) row per input branch.
  query_input = query_embedding.reshape(1, -1)

  siamese_model = load_model('/content/drive/MyDrive/FYP/saved_model/siamesemodel.h5')

  similarity_index = []
  source_doc_name = []
  source_doc = []
  for name, text, cluster_id, embedding in zip(names, docs, y_kmeans, compress):
    # Only documents that share the query's cluster are scored.
    if cluster_id != k:
      continue
    score = siamese_model.predict([query_input, embedding.reshape(1, -1)])
    similarity_index.append(score)
    source_doc_name.append(name)
    source_doc.append(text)

  similarity_index = np.array(similarity_index).flatten()

  # Indices of the highest scores, best first.
  max_ind = np.argsort(similarity_index)[::-1][:top_k]
  Document_Names = [source_doc_name[i] for i in max_ind]
  Document = [source_doc[i] for i in max_ind]
  Similarity_Index = similarity_index[max_ind]

  df2 = pd.DataFrame(Document_Names, columns =['Document_Name'])
  df2['Data'] = Document
  df2['Similarity_Index'] = Similarity_Index

  print(df2)
  return Document_Names, Document, Similarity_Index

In [None]:
# Run the retrieval pipeline for one query document; main() prints the
# ranked results table and returns the names, texts and similarity scores.
Document_Names, Document, Similarity_Index = main('/content/drive/MyDrive/Dataset/D13-1047.pdf.txt')

      Document_Name                                               Data  \
0  D13-1055.pdf.txt  Proceedings of the 2013 Conference on Empirica...   
1  D13-1084.pdf.txt  Proceedings of the 2013 Conference on Empirica...   
2  D12-1066.pdf.txt  Proceedings of the 2012 Joint Conference on Em...   
3  D13-1041.pdf.txt  Proceedings of the 2013 Conference on Empirica...   
4  D12-1082.pdf.txt  Proceedings of the 2012 Joint Conference on Em...   
5  D11-1141.pdf.txt  Proceedings of the 2011 Conference on Empirica...   
6  D11-1092.pdf.txt  Proceedings of the 2011 Conference on Empirica...   
7  D09-1130.pdf.txt  Proceedings of the 2009 Conference on Empirica...   
8  D08-1072.pdf.txt  Proceedings of the 2008 Conference on Empirica...   
9  A88-1000.pdf.txt  Second  Conference   on  Applied  Natural Lang...   

   Similarity_Index  
0          0.293023  
1          0.292493  
2          0.280775  
3          0.279848  
4          0.279286  
5          0.274969  
6          0.274137  
7        