In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
import csv
import json
import flask_cors, flask
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
from diffusers import AutoPipelineForText2Image
import torch
import re
import base64
from PIL import Image
import cloudinary
import cloudinary.uploader
from dotenv import load_dotenv
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Load environment variables from .env file
load_dotenv()

nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download('stopwords')
# Initialize Word_Net_Lemmatizer
lemmatizer = WordNetLemmatizer()
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\haris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\haris\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
cloudinary.config(
    cloud_name=os.getenv("CLOUDINARY_CLOUD_NAME"),
    api_key=os.getenv("CLOUDINARY_API_KEY"),
    api_secret=os.getenv("CLOUDINARY_API_SECRET")
)

In [None]:
torch.cuda.empty_cache()
pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/sd-turbo", torch_dtype=torch.float16, variant="fp16")
pipe.to("cuda")

In [None]:
# Assuming this is your pipeline call
image = pipe(prompt="cat", num_inference_steps=1, guidance_scale=0.0).images[0]

# Check if the image is an instance of PIL's Image class
if isinstance(image, Image.Image):
    print("The output is a PIL image.")
else:
    print("The output is not a PIL image.")

In [2]:
def load_data_from_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

In [3]:
app = Flask(__name__)
CORS(app)

# Function that takes in content, preprocesses it, 
# and converts it to a list of words
def pre_process_string(content):
    # Remove \n and \t
    content = content.replace('\n', ' ')
    content = content.replace('\t', ' ')
    # Remove all non-characters
    content = re.sub(r'[^a-zA-Z\s]', ' ', content)
    # Remove multiple spaces
    content = re.sub(r'\s+', ' ', content)
    # Convert all characters to lowercase
    content = content.lower()
    # Convert the title into a list of words
    content = content.split()
    # Remove one and two character words
    content = [word for word in content if len(word) > 2]
    # Remove stop_words using nltk
    content = [word for word in content if not word in stopwords.words('english')]
    return content
    
# Function that takes in a list of words and adds them to the lexicon
def build_lexicon(words, lexicon):
    # Build the lexicon
    new_words = []
    # Look through the words
    for word in words:
        # Lemmatize the word
        word = lemmatizer.lemmatize(word)
        # if that word is not already in lexicon
        if word not in lexicon and word not in new_words:
            # Then add it
            new_words.append(word)
    lexicon.extend(new_words)
    return lexicon

def sort_lexicon():
    # Load the lexicon
    data = load_data_from_json(r"Files\lexicon.json")
    with open(r"Files\lexicon.json", "w") as file:
        # sort the data
        data = sorted(data)
        json.dump(data, file)

# Function to build forward index from raw articles
def build_forward_index(articles):
    # initialize forward_index
    forward_index = dict()

    #initialize documents
    docs = dict()

    # Load the already existing forward_index
    try:
        data = load_data_from_json(r"Files\forward_index.json")
    except:
        with open(r"Files\forward_index.json", "w") as file:
            json.dump(dict(), file)
        data = load_data_from_json(r"Files\forward_index.json")
        
    # Load the lexicon
    try:
        lexicon = load_data_from_json(r"Files\lexicon.json")
    except:
        with open(r"Files\lexicon.json", "w") as file:
            json.dump(list(), file)
        lexicon = load_data_from_json(r"Files\lexicon.json")

    # Load the documents
    try:
        documents = load_data_from_json(r"Files\documents.json")
    except:
        with open(r"Files\documents.json", "w") as file:
            json.dump(dict(), file)
        documents = load_data_from_json(r"Files\documents.json")
        
    num_articles = len(documents)
    
    # Extract all urls currently indexed
    try:
        article_urls = [article['url'] for article in documents.values()]
    except:
        article_urls = []
        
    # For each article
    for article in articles:
        # if article is not already forward indexed
        if article['url'] not in article_urls:
            # Pre-process the title and content
            title_words = pre_process_string(article['title'])
            content_words = pre_process_string(article['content'])
            # Update the lexicon
            lexicon = build_lexicon(title_words + content_words, lexicon)
            # Lemmatize the words in content and title
            content_words = [lemmatizer.lemmatize(word) for word in content_words]
            title_words = [lemmatizer.lemmatize(word) for word in title_words]
            # Convert the words in title and content to their respective indexes
            content_ids = [lexicon.index(word) for word in content_words]
            title_ids = [lexicon.index(word) for word in title_words]
            # Count the frequencies of words
            frequency = Counter((title_ids * 10) + content_ids)
            forward_index[num_articles] = frequency
            docs[num_articles] = {'title': article['title'], 'url': article['url']}
            # Add the url to the article
            article_urls.append(article['url'])
            num_articles += 1
    data.update(forward_index)
    documents.update(docs)
    # Update the lexicon json file
    with open(r"Files\lexicon.json", "w") as file:
        json.dump(lexicon, file)
    # Update the forward_index json file
    with open(r"Files\forward_index.json", "w") as file:
        json.dump(data, file)
    # Update the documents json file
    with open(r"Files\documents.json", "w") as file:
        json.dump(documents, file)

def build_inverted_index_with_barrels():
    # Load the forward index
    try:
        forward_index = load_data_from_json(r"Files\forward_index.json")
    except:
        with open(r"Files\forward_index.json", "w") as file:
            json.dump(dict(), file)
        forward_index = load_data_from_json(r"Files\forward_index.json")

    barrels = []
    barrel_files = os.listdir(r"Files\Barrels")
    # Load all barrels that currently exist
    for barrel in barrel_files:
        barrels.append(load_data_from_json(os.path.join(r"Files\Barrels", barrel)))

    # Iterate through all articles in the forward_index
    for doc_id, data in forward_index.items():
        # Look at all words in an article
        for word_id in data:
            # Calculate the barrel number for that word
            barrel_no = int(word_id) // 10000
            barrel_filename = f"barrel_{barrel_no}.json"
            
            # Check if that barrel exists, if not then create it
            barrel_path = os.path.join(r"Files\Barrels", barrel_filename)
            if not os.path.exists(barrel_path):
                with open(barrel_path, "w") as file:
                    json.dump(dict(), file)
                # Load the newly created barrel
                barrels.append(load_data_from_json(barrel_path))
                barrel_files.append(barrel_filename)
            # update the word_id
            word_id_new = int(word_id) % 10000
            # If that word is not already in that barrel
            if word_id_new not in barrels[barrel_no]:
                # Then create a dict at that word_id
                barrels[barrel_no][word_id_new] = dict()
            # And add the doc_id for that word along with frequency if it is not already there
            if doc_id not in barrels[barrel_no][word_id_new]:
                barrels[barrel_no][word_id_new].update({doc_id: data[word_id]})

    # Update all barrels
    i = 0
    for barrel in barrel_files:
        with open(os.path.join(r"Files\Barrels", barrel), "w") as file:
            json.dump(barrels[i], file)
            i += 1


def rank_results(search_result): 
     # Rank these documents
    # Sort the dictionary by values (descending order)
    sorted_tuples = sorted(search_result.items(), key=lambda x: x[1], reverse=True)
    
    # Convert the sorted list of tuples back to a dictionary
    ranked_result = dict(sorted_tuples)
    # Extract the article ids
    ranked_articles = ranked_result.keys()
    ranked_articles = list(ranked_articles)
    
    return ranked_articles

def add_content(data, new_article):
    article_id = str(len(data["index"]))
    data["index"][article_id] = len(data["index"])
    data["source"][article_id] = new_article[0]
    data["title"][article_id] = new_article[1]
    data["content"][article_id] = new_article[2]

    return data

In [None]:
# Build the data
json_file_path = r"Files\articles_sampled_10000.json"
json_data = load_data_from_json(json_file_path)

In [7]:
# Create a Files directory if it does not already exist
if "Files" not in os.listdir():
    os.mkdir("Files")
    os.mkdir(r"Files\Barrels")

In [8]:
%%time
build_forward_index(json_data)

NameError: name 'json_data' is not defined

In [4]:
%%time
build_inverted_index_with_barrels()

CPU times: total: 4min 12s
Wall time: 14min 10s


In [4]:
barrels = []
barrel_files = os.listdir(r"Files\Barrels")
# Load all barrels that currently exist
for barrel in barrel_files:
    barrels.append(load_data_from_json(os.path.join(r"Files\Barrels", barrel)))
    
# Load lexicon
lexicon = load_data_from_json(r"Files\lexicon.json")
# Load the documents
documents = load_data_from_json(r"Files\documents.json")

In [5]:
# Function for single word queries
def single_word_search(word):

    # Lemmatize the word
    word = lemmatizer.lemmatize(word)
        
    # Find the id of the word in lexicon
    try:
        word_id = lexicon.index(word)
        # Calculate the barrel of the word
        barrel_no = word_id // 10000
        # Update the word_id
        word_id = word_id % 10000
        # Find out in which documents does the word appear
        search_result = barrels[barrel_no][str(word_id)]
    except:
        search_result = None
    
    if search_result is None: 
        return []
        
    # Rank these documents
    # Sort the dictionary by values (descending order)
    sorted_tuples = sorted(search_result.items(), key=lambda x: x[1], reverse=True)
    
    # Convert the sorted list of tuples back to a dictionary
    ranked_result = dict(sorted_tuples)
    # Extract the article ids
    ranked_articles = ranked_result.keys()
    ranked_articles = list(ranked_articles)

    article_ids = ranked_articles
    titles = [documents[article]['title'] for article in article_ids]
    urls = [documents[article]['url'] for article in article_ids]

    return titles

In [6]:
@app.route("/search_1", methods=["GET"], endpoint='single_word_search')
def single_word_search():
    word = request.args.get('word')

    # Lemmatize the word
    word = lemmatizer.lemmatize(word)
    print(word)
        
    # Find the id of the word in lexicon
    try:
        word_id = lexicon.index(word)
        # Calculate the barrel of the word
        barrel_no = word_id // 10000
        # Update the word_id
        word_id = word_id % 10000
        # Find out in which documents does the word appear
        search_result = barrels[barrel_no][str(word_id)]
    except:
        search_result = None
    
    if search_result is None: 
        return jsonify(article_ids=[], titles=[], urls=[])

    # Rank documents based on frequency
    ranked_articles = rank_results(search_result)
    print(rank_results)

    article_ids = ranked_articles
    titles = [documents[article]['title'] for article in article_ids]
    urls = [documents[article]['url'] for article in article_ids]

    print(titles)
    print(urls)
    
    json_response = jsonify(article_ids=article_ids, titles=titles, urls=urls)

    return json_response

@app.route("/search_2", methods=["GET"], endpoint='multi_word_search')
def multi_word_search(): 
    query = request.args.get('word')
    result = []
    words = query.split()

    if words:
        result = inverted_index.search(words[0]) if inverted_index.search(words[0]) else []
        for word in words[1:]:
            current_result = inverted_index.search(word)
            if current_result:
                result = [d for d in result if d in current_result]

    ranked_results = rank_results(remove_duplicates(result, key="article_id"))

    article_ids = [result['article_id'] for result in ranked_results]
    titles = [result['title'] for result in ranked_results]
    urls = [result['url'] for result in ranked_results]

    json_response = jsonify(article_ids=article_ids, titles=titles, urls=urls)

    return json_response 


@app.route("/gen", methods=["GET"], endpoint='genai_tool')
def genai_tool():
    word = request.args.get('word') 
    image = pipe(prompt=word, num_inference_steps=1, guidance_scale=0.0).images[0]
    image_bytes = image.tobytes()
    image_base64 = base64.b64encode(image.tobytes()).decode('utf-8') 

    upload_result = cloudinary.uploader.upload(image_bytes, resource_type="raw") 
    cloudinary_url = upload_result.get("url")

    json_response = {
        'word': word,
        'image': image_base64,
        'image_cloudinary_url': cloudinary_url
    }

    return jsonify(json_response)


@app.route("/add", methods=["GET"])
def add_content(data, new_article):
    article_id = str(len(data["index"]))
    data["index"][article_id] = len(data["index"])
    data["source"][article_id] = new_article[0]
    data["title"][article_id] = new_article[1]
    data["content"][article_id] = new_article[2]

    return data

if __name__ == "__main__":
    app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [12/Dec/2023 23:16:17] "GET /search_1?word=car HTTP/1.1" 200 -


car
<function rank_results at 0x00000229F79B6F20>


127.0.0.1 - - [12/Dec/2023 23:24:48] "GET /search_1?word=ronaldo HTTP/1.1" 200 -


ronaldo
<function rank_results at 0x00000229F79B6F20>
['Ronaldo: The road to redemption with Brazil at the 2002 World Cup', 'Cristiano Ronaldo: Where could Manchester United striker go this summer?', 'Ronaldo’s World Cup: Sidelined then eclipsed by stand-in Ramos', 'Cristiano Ronaldo: What next for the Manchester United star?', 'World Cup 2022: Why Cristiano Ronaldo still has Portugal role as new stars emerge', "Cristiano Ronaldo: Will forward and Manchester United agree an exit strategy with 'the end in sight'?", 'Cristiano Ronaldo & Man Utd - what comes next for the Portugal forward?', 'Can Ronaldo rediscover his purpose after World Cup exit?', 'Portugal’s Ronaldo says his World Cup dream has ‘ended’', "Cristiano Ronaldo: Is Manchester United forward's time up at Old Trafford?", "Cristiano Ronaldo: Manchester United forward says he feels 'betrayed' by club", "Cristiano Ronaldo: Manchester Utd explore legal action to force player's exit", "Cristiano Ronaldo leaves Manchester United: R

[2023-12-12 23:25:05,381] ERROR in app: Exception on /search_2 [GET]
Traceback (most recent call last):
  File "c:\Users\haris\miniconda3\envs\forPyTorch\Lib\site-packages\flask\app.py", line 1455, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\haris\miniconda3\envs\forPyTorch\Lib\site-packages\flask\app.py", line 869, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\haris\miniconda3\envs\forPyTorch\Lib\site-packages\flask_cors\extension.py", line 176, in wrapped_function
    return cors_after_request(app.make_response(f(*args, **kwargs)))
                                                ^^^^^^^^^^^^^^^^^^
  File "c:\Users\haris\miniconda3\envs\forPyTorch\Lib\site-packages\flask\app.py", line 867, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\haris\miniconda3\envs\forPyTorch\Lib\site-pac

for


127.0.0.1 - - [12/Dec/2023 23:26:59] "GET /search_1?word=car HTTP/1.1" 200 -


car
<function rank_results at 0x00000229F79B6F20>


127.0.0.1 - - [12/Dec/2023 23:31:28] "GET /search_1?word=coffee HTTP/1.1" 200 -


coffee
<function rank_results at 0x00000229F79B6F20>
['https://brightside.me/inspiration-shopping/they-say-its-cheaper-to-make-coffee-at-home-so-we-compared-these-5-coffee-machines-to-find-the-best-one-810524/', 'https://www.cbsnews.com/essentials/best-coffee-maker-deals/', 'https://althealthworks.com/product-review-a-line-of-delicious-mold-free-organic-coffees-from-purity-coffee/', 'https://www.cbsnews.com/essentials/new-keurig-k-cafe-smart-hands-on-review/', 'https://althealthworks.com/holistic-doctor-warns-the-vast-majority-of-all-coffee-is-contaminated-with-mold-do-this-one-thing-if-you-enjoy-coffee-and-value-your-health/', 'https://althealthworks.com/mitigate-the-acidic-negative-effects-of-coffee-by-adding-this-one-ingredient-to-your-next-cup-3/', 'https://althealthworks.com/do-coffee-and-caffeine-dehydrate-you-holistic-doctor-shares-his-strategy-for-coffee-dehydration-and-replacing-electrolytes/', 'https://www.cbsnews.com/essentials/amazon-prime-day-2022-coffee-maker-deals/', 'ht

127.0.0.1 - - [12/Dec/2023 23:32:52] "GET /search_1?word=car HTTP/1.1" 200 -


car
<function rank_results at 0x00000229F79B6F20>


127.0.0.1 - - [12/Dec/2023 23:33:02] "GET /search_1?word=ronaldo HTTP/1.1" 200 -


ronaldo
<function rank_results at 0x00000229F79B6F20>
['Ronaldo: The road to redemption with Brazil at the 2002 World Cup', 'Cristiano Ronaldo: Where could Manchester United striker go this summer?', 'Ronaldo’s World Cup: Sidelined then eclipsed by stand-in Ramos', 'Cristiano Ronaldo: What next for the Manchester United star?', 'World Cup 2022: Why Cristiano Ronaldo still has Portugal role as new stars emerge', "Cristiano Ronaldo: Will forward and Manchester United agree an exit strategy with 'the end in sight'?", 'Cristiano Ronaldo & Man Utd - what comes next for the Portugal forward?', 'Can Ronaldo rediscover his purpose after World Cup exit?', 'Portugal’s Ronaldo says his World Cup dream has ‘ended’', "Cristiano Ronaldo: Is Manchester United forward's time up at Old Trafford?", "Cristiano Ronaldo: Manchester United forward says he feels 'betrayed' by club", "Cristiano Ronaldo: Manchester Utd explore legal action to force player's exit", "Cristiano Ronaldo leaves Manchester United: R

127.0.0.1 - - [12/Dec/2023 23:33:23] "GET /search_1?word=car HTTP/1.1" 200 -


car
<function rank_results at 0x00000229F79B6F20>


127.0.0.1 - - [12/Dec/2023 23:33:42] "GET /search_1?word=car HTTP/1.1" 200 -


car
<function rank_results at 0x00000229F79B6F20>


127.0.0.1 - - [12/Dec/2023 23:33:44] "GET /search_1?word=car HTTP/1.1" 200 -


car
<function rank_results at 0x00000229F79B6F20>


127.0.0.1 - - [12/Dec/2023 23:33:44] "GET /search_1?word=car HTTP/1.1" 200 -


car
<function rank_results at 0x00000229F79B6F20>


In [None]:
image = pipe(prompt="dog", num_inference_steps=1, guidance_scale=0.0).images[0]
print(image)