In [1]:
import time
import string
import re
import numpy as np
import pandas as pd
import json
import os
import math
import csv
import nltk
import pickle
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import sys
from termcolor import colored, cprint

In [None]:
# Fetch the NLTK resources used by the cleaning step: the English stop-word
# list and the Punkt sentence/word tokenizer data behind `word_tokenize`.
# Recent NLTK releases (3.9+) load the tokenizer from "punkt_tab" instead of
# the pickled "punkt" package, so both are requested to keep the notebook
# runnable on old and new installations.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

# Create functions for data cleaning

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
def clean(text):
    """Normalise raw text into a space-separated string of search tokens.

    Steps, in order:
      1. delete every ASCII punctuation character and lower-case the rest;
      2. replace anything that is not an ASCII letter, digit or space
         (newlines, tabs, non-ASCII symbols) with a space;
      3. remove words that are only one or two characters long;
      4. collapse runs of whitespace into a single space;
      5. tokenize with NLTK and drop English stop words.

    Args:
        text: Raw document or query string.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).

    Note:
        The notebook caches the cleaned corpus in data/clean_data.json, so
        delete that file to re-clean the documents after editing this
        function; otherwise queries and corpus are cleaned differently.
    """
    # Punctuation is deleted (not replaced by a space), so "mars-entry"
    # becomes "marsentry".
    text = text.translate(str.maketrans("", "", string.punctuation)).lower()
    # Was `[^-9A-Za-z ]|-`, which kept the digit 9 but blanked 0-8; the
    # intended class is "anything but letters, digits and spaces".
    text = re.sub(r"[^0-9A-Za-z ]", " ", text)
    text = re.sub(r"\b\w{1,2}\b", "", text)
    text = re.sub(r"\s{2,}", " ", text)
    # Build the set per call so later changes to the module-level
    # `stopwords` list are still honoured, while lookups stay O(1).
    stop_words = set(stopwords)
    kept = [token for token in nltk.tokenize.word_tokenize(text) if token not in stop_words]
    return " ".join(kept)

# Load and clean data (or load cleaned data)

**Warning:** If you are running this notebook for the first time, first run [this notebook](https://colab.research.google.com/github/Eyon42/Mergianos/blob/main/Analysis.ipynb#scrollTo=MZpuhKUUFlM7) to fetch the data.

In [4]:
# Cleaned corpus: document id (file name up to its first ".") -> cleaned text.
texts = {}
clean_cache = "data/clean_data.json"
raw_dir = "data/Aerothermodynamics"

if not os.path.exists(clean_cache):
    # No cache yet: clean every raw document once and persist the result.
    for filename in os.listdir(raw_dir):
        with open(f"{raw_dir}/{filename}") as raw_file:
            doc_id = filename.partition(".")[0]
            texts[doc_id] = clean(raw_file.read())
    with open(clean_cache, "w") as cache_file:
        json.dump(texts, cache_file)
else:
    # Reuse the previously cleaned corpus instead of re-cleaning it.
    with open(clean_cache) as cache_file:
        texts = json.load(cache_file)

# Per-document metadata; `display` reads the "id" and "title" of each entry.
with open("data/Aerothermodynamics.json") as metadata_file:
    metadata = json.load(metadata_file)

# Extract Features

In [None]:
# Fit TF-IDF on the cleaned corpus. `model` is the sparse document-term
# matrix returned by `fit_transform`: row j belongs to `documents[j]` and
# column i to `words[i]` (this is how `search_for_word` indexes it).
tfidf = TfidfVectorizer()
model = tfidf.fit_transform(texts.values())

# `get_feature_names` was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on older releases that lack it.
if hasattr(tfidf, "get_feature_names_out"):
    words = list(tfidf.get_feature_names_out())
else:
    words = list(tfidf.get_feature_names())
documents = list(texts.keys())

# Sparse-backed DataFrame so per-term columns can be sliced and sorted.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(model)

# Create search

In [None]:
def search_for_word(word, nresults=None):
    """Rank the documents by their TF-IDF score for a single term.

    Args:
        word: A cleaned term, looked up in the module-level `words` list.
        nresults: If truthy, keep only this many top-scoring documents;
            otherwise every document is returned.

    Returns:
        A list of `(document_id, score)` tuples sorted by descending score,
        or `[]` when `word` is not in the vocabulary. Scores are not
        filtered, so documents with a score of 0 are included.
    """
    try:
        word_id = words.index(word)
    except ValueError:
        # `list.index` raises ValueError for an unknown term. Anything else
        # (e.g. NameError because the feature cell was not run) must surface
        # instead of silently looking like "no results".
        return []

    scores = tfidf_df.iloc[:, word_id].sort_values(ascending=False)
    if nresults:
        scores = scores.head(nresults)
    # The index holds row positions of `tfidf_df`, which line up with `documents`.
    return [(documents[row], score) for row, score in zip(scores.index, scores.values)]

def search(query):
    """Score documents against a free-text query.

    The query is passed through `clean`, split on single spaces, and each
    term is looked up with `search_for_word`. The first score seen for a
    document is stored as-is; for every later term the new score is added
    to the running total and the sum is multiplied by 10.

    Args:
        query: Raw query string.

    Returns:
        Dict mapping document id -> accumulated score (empty when no query
        term is in the vocabulary).
    """
    # NOTE(review): because of the repeated "* 10", earlier query terms end
    # up weighted more heavily than later ones - confirm this is intended.
    scores = {}
    for term in clean(query).split(" "):
        for doc_id, term_score in search_for_word(term):
            if doc_id in scores:
                scores[doc_id] = (scores[doc_id] + term_score) * 10
            else:
                scores[doc_id] = term_score
    return scores

def display(data):
    """Print the best-scoring document of a search result.

    Documents are visited in the mapping's own order; each one is printed on
    a self-overwriting line (ending in a carriage return) until a document
    holding the top score is reached. That document is then printed
    highlighted, followed by its title from the module-level `metadata`.
    Scores are shown relative to the top score, so the winner shows 1.0.

    Args:
        data: Mapping of document id -> score, as returned by `search`.

    Returns:
        None. If `data` is empty or its best score is 0, a short message is
        printed instead of raising.
    """
    # NOTE(review): this name shadows IPython's own `display` helper inside
    # the notebook; kept because existing cells call it under this name.
    max_v = max(data.values()) if data else 0
    if max_v == 0:
        # Previously an IndexError (empty mapping) or ZeroDivisionError.
        print("No matching documents found.")
        return

    for k, v in data.items():
        print(f"Document ID: {k} - Search score: {v/max_v}\r", end="")
        time.sleep(0.002)  # brief pause so the overwritten line is visible
        if v == max_v:
            print(colored(f"Document ID: {k} - Search score: {v/max_v}{' '*10}\r", 'green', attrs=['reverse', 'blink']))
            # Metadata ids are compared as integers; document ids are strings.
            title = next((entry["title"] for entry in metadata if entry["id"] == int(k)), None)
            if title is None:
                print("Title: <not found in metadata>")
            else:
                print("Title: " + title)
            break

# Test Search

In [86]:
# Smoke-test the pipeline: run one sample query and show its best match.
sample_query = "aerothermodynamics in mars entry"
r = search(sample_query)
display(r)

[0m[7m[32mDocument ID: 20040161501 - Search score: 1.0          
Title: Entry, Descent, and Landing: 2000-2004
