## **BITS F407 - ARTIFICIAL INTELLIGENCE**
--------------------------------------------------------------------------------


**PROJECT TITLE: ENHANCING TEXT-PATTERN SEARCH IN SQL DATABASES**
--------------------------------------------------------------------------------

***Team number: 68***


---
**Full names of all students in the team: ANIRUDH BAGALKOTKER, SAKAR HIRDE**

---
**Id number of all students in the team: 2021A7PS2682H, 2021A3PS3203H**


## ***1. Import Dependencies***

In [None]:
%pip install pandas numpy matplotlib mysql-connector scikit-learn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mysql.connector
import time
import math

## ***2. Connect to MySQL Database***

In [None]:
import os

# Connection settings are read from the environment so credentials do not have
# to live in the notebook; the defaults reproduce the original local setup.
mysqlConnector = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", "root"),
)

print(mysqlConnector)

# One shared cursor is reused by every SQL cell that follows.
mysqlCursor = mysqlConnector.cursor()

# Sanity check: list the databases visible to this account.
mysqlCursor.execute("SHOW DATABASES")

for database in mysqlCursor:
    print(database)

## ***3. Create Database***

In [None]:
# Create the project schema if it is missing, then make it the active database
# for every statement issued through the shared cursor.
for setup_statement in ("CREATE DATABASE IF NOT EXISTS AI_PROJ", "USE AI_PROJ"):
    mysqlCursor.execute(setup_statement)

## ***4. Create Tables***

In [None]:
# BOOKS: one row per title. The FULLTEXT index covers exactly the columns used
# by MATCH(B.NAME, B.AUTHOR, B.DESCRIPTION) in MatchQuery; InnoDB refuses
# boolean-mode MATCH queries when no FULLTEXT index matches the column list.
# NOTE: because of IF NOT EXISTS this is a no-op when the table already exists,
# so an existing BOOKS table must get the index separately.
mysqlCursor.execute("""
CREATE TABLE IF NOT EXISTS `BOOKS` (
    `BID` int(10) UNSIGNED NOT NULL,
    `ISBN` bigint(13) UNSIGNED DEFAULT NULL,
    `NAME` varchar(200) COLLATE utf8_unicode_ci NOT NULL,
    `MRP` int(6) UNSIGNED DEFAULT NULL,
    `DESCRIPTION` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL,
    `IMG` varchar(3000) COLLATE utf8_unicode_ci DEFAULT NULL,
    `AUTHOR` varchar(100) COLLATE utf8_unicode_ci DEFAULT NULL,
    `FORMAT` varchar(25) COLLATE utf8_unicode_ci DEFAULT 'PAPERBACK' COMMENT 'PAPERBACK OR HARDCOVER',
    `PAGES` smallint(4) DEFAULT NULL,
    `WEIGHT` smallint(5) DEFAULT NULL,
    `REVIEW` tinyint(1) UNSIGNED NOT NULL DEFAULT '0',
    FULLTEXT KEY `FT_BOOKS_SEARCH` (`NAME`, `AUTHOR`, `DESCRIPTION`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
""")

# INVENTORY: one row per stocked copy/listing, linked to BOOKS through BID.
mysqlCursor.execute("""
CREATE TABLE IF NOT EXISTS `INVENTORY` (
    `PID` bigint(20) UNSIGNED NOT NULL,
    `BID` int(10) UNSIGNED NOT NULL,
    `SID` int(10) UNSIGNED NOT NULL,
    `COND` varchar(25) COLLATE utf8_unicode_ci NOT NULL DEFAULT 'NEW',
    `QTY` smallint(5) UNSIGNED NOT NULL DEFAULT '1',
    `CP` float(7,2) UNSIGNED NOT NULL,
    `SP` float(7,2) UNSIGNED NOT NULL,
    `DISCOUNT` float(3,1) UNSIGNED DEFAULT '0.0',
    `LANG` varchar(25) COLLATE utf8_unicode_ci DEFAULT 'ENGLISH',
    `PI` int(10) UNSIGNED NOT NULL DEFAULT '0',
    `CREATED` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
    `MODIFIED` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
""")

The tables were already created through MySQL Workbench (using SQL), and generated sample data has been loaded into them.

## ***5. SQL In-Built Functions Query***

There are four built-in SQL text-matching constructs (not every database engine supports all of them):

1. **LIKE**
2. **SIMILAR TO**
3. **CONTAINS**
4. **MATCH**

Let's apply the LIKE and MATCH queries (the two that MySQL provides) and check the time taken to execute each query.

In [None]:
# Elapsed times in seconds; each search approach below appends its own
# measurement, so the order follows the order in which the cells are run.
timeArr = []

In [None]:
def MatchQuery(searchString):
    """Run the boolean-mode FULLTEXT search and record how long it takes.

    The query joins INVENTORY with BOOKS, keeps rows whose NAME, AUTHOR or
    DESCRIPTION match ``searchString`` via ``MATCH ... AGAINST`` and whose
    NAME or AUTHOR also contains it as a substring, and orders them by the
    relevance score. Every row is printed, and the elapsed time is printed
    and appended to the module-level ``timeArr``.

    Only query execution and row fetching are timed. The earlier version
    also timed the printing and a fixed five-second sleep, which swamped the
    measurement being compared.

    Args:
        searchString: Term passed to ``AGAINST`` and wrapped in ``%`` for the
            ``LIKE`` filters. It is bound as a query parameter.
    """
    sql = """
    SELECT DISTINCT
        I.PID, I.BID, I.SID, I.COND, I.QTY, I.CP, I.SP, I.DISCOUNT, I.LANG, B.ISBN, B.NAME,
        B.MRP, B.DESCRIPTION, B.IMG, B.AUTHOR, B.FORMAT, B.PAGES, B.WEIGHT, B.REVIEW,
    MATCH(B.NAME, B.AUTHOR, B.DESCRIPTION) AGAINST (%s IN BOOLEAN MODE)
    AS relevance FROM INVENTORY I JOIN BOOKS B ON I.BID = B.BID
    WHERE MATCH(B.NAME, B.AUTHOR, B.DESCRIPTION) AGAINST (%s IN BOOLEAN MODE)
    AND (B.NAME LIKE %s OR B.AUTHOR LIKE %s) ORDER BY relevance DESC
    """
    like_pattern = f"%{searchString}%"

    # perf_counter is monotonic, so it is the right clock for short intervals.
    start_time = time.perf_counter()
    mysqlCursor.execute(sql, (searchString, searchString, like_pattern, like_pattern))
    rows = mysqlCursor.fetchall()
    elapsed_time = time.perf_counter() - start_time

    for result in rows:
        print(result)

    print(f"Time taken for MatchQuery: {elapsed_time} seconds")
    timeArr.append(elapsed_time)

def LikeQuery(searchString):
    """Run the substring (LIKE) search and record how long it takes.

    The query joins INVENTORY with BOOKS and keeps rows whose NAME, AUTHOR or
    DESCRIPTION contains ``searchString``. Every row is printed, and the
    elapsed time is printed and appended to the module-level ``timeArr``.

    Only query execution and row fetching are timed. The earlier version
    also timed the printing and a fixed five-second sleep, and labelled the
    result as "MatchQuery".

    Args:
        searchString: Term wrapped in ``%`` wildcards and bound as a query
            parameter for each ``LIKE`` filter.
    """
    sql = """
        SELECT DISTINCT I.PID, I.BID, I.SID, I.COND, I.QTY, I.CP, I.SP, I.DISCOUNT, I.LANG,
            B.ISBN, B.NAME, B.MRP, B.DESCRIPTION, B.IMG, B.AUTHOR, B.FORMAT,
            B.PAGES, B.WEIGHT, B.REVIEW
        FROM INVENTORY I
        JOIN BOOKS B ON I.BID = B.BID
        WHERE (B.NAME LIKE %s OR B.AUTHOR LIKE %s OR B.DESCRIPTION LIKE %s)
    """
    like_pattern = f"%{searchString}%"

    # perf_counter is monotonic, so it is the right clock for short intervals.
    start_time = time.perf_counter()
    mysqlCursor.execute(sql, (like_pattern, like_pattern, like_pattern))
    rows = mysqlCursor.fetchall()
    elapsed_time = time.perf_counter() - start_time

    for result in rows:
        print(result)

    print(f"Time taken for LikeQuery: {elapsed_time} seconds")
    timeArr.append(elapsed_time)

# Run both SQL approaches on the same term: MATCH first, then LIKE.
search = "Rich"
for run_query in (MatchQuery, LikeQuery):
    run_query(search)

## ***6. AI Search Algorithms***

#### **6.1 Importing the Tables into a DataFrame**

In [None]:
# Columns are selected by name rather than with SELECT * so the DataFrame
# labels always line up with the data, whatever the physical column order of
# a table and even if it has extra columns.
BOOKS_COLUMNS = ["BID", "ISBN", "NAME", "MRP", "DESCRIPTION", "IMG", "AUTHOR", "FORMAT", "PAGES", "WEIGHT", "REVIEW"]
INVENTORY_COLUMNS = ["PID", "BID", "SID", "COND", "QTY", "CP", "SP", "DISCOUNT", "LANG", "PI", "CREATED", "MODIFIED"]

# Execute a SELECT query on the 'BOOKS' table
books_select_list = ", ".join(f"`{column}`" for column in BOOKS_COLUMNS)
mysqlCursor.execute(f"SELECT {books_select_list} FROM BOOKS")
books_data = mysqlCursor.fetchall()

# Create a pandas dataFrame for the 'BOOKS' table
books_df = pd.DataFrame(books_data, columns=BOOKS_COLUMNS)

# Display the 'BOOKS' dataFrame
print("BOOKS Table:")
print(books_df)

# Execute a SELECT query on the 'INVENTORY' table
inventory_select_list = ", ".join(f"`{column}`" for column in INVENTORY_COLUMNS)
mysqlCursor.execute(f"SELECT {inventory_select_list} FROM INVENTORY")
inventory_data = mysqlCursor.fetchall()

# Create a pandas dataFrame for the 'INVENTORY' table
inventory_df = pd.DataFrame(inventory_data, columns=INVENTORY_COLUMNS)

# Display the 'INVENTORY' dataFrame
print("\nINVENTORY Table:")
print(inventory_df)

#### **6.2 Full-Text Search**

This core search method enables quick and effective text-based searches. We use Python's `re` library, which supports regular expressions (RE). Regular expressions can contain both special and ordinary characters. Most ordinary characters, like "A", "a", or "0", are the simplest regular expressions; they simply match themselves.

In [None]:
import re

search_term = "Rich"
# Escape the term so any regex metacharacters in it are matched literally.
search_pattern = re.escape(search_term)


def _rows_matching_any(frame, columns, pattern):
    """Return the rows of ``frame`` where any of ``columns`` contains ``pattern``.

    Matching is case-insensitive and missing values never match. The
    per-column results are OR-ed into one mask, so a row that matches in
    several columns appears once (in its original position) instead of once
    per matching column.
    """
    matches = pd.Series(False, index=frame.index)
    for col in columns:
        matches |= frame[col].str.contains(pattern, case=False, na=False)
    return frame[matches]


# Filtered dataframes
filtered_inventory_df = _rows_matching_any(inventory_df, ['COND', 'LANG'], search_pattern)

# Only the BOOKS search is timed, since it is the one compared with the SQL
# queries and the other approaches.
start_time = time.time()
filtered_books_df = _rows_matching_any(books_df, ['NAME', 'AUTHOR', 'DESCRIPTION'], search_pattern)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Time taken for Full-Text Search: {elapsed_time} seconds")
timeArr.append(elapsed_time)

print("Filtered INVENTORY Table:")
print(filtered_inventory_df)

print("\nFiltered BOOKS Table:")
print(filtered_books_df)

#### **6.3 Vector Space Model (TF-IDF):**

TF-IDF is a statistical tool for assessing a word's significance in relation to a group of documents. It is frequently applied to text-based searches.

In [None]:
# Query whose tokens are looked up (case-sensitively) in the TF-IDF vocabulary.
search_string = "Rich"

def calculate_tfidf(corpus, search_string):
    """Build a TF-IDF matrix for ``corpus`` and a term vector for the query.

    Tokens are the ``\\b\\w+\\b`` regex matches and are case-sensitive. The
    vocabulary is sorted, so column ``j`` always refers to the same word for
    the same corpus.

    Args:
        corpus: Iterable of document strings (a list or a pandas Series).
        search_string: Query text, tokenized the same way as the documents.

    Returns:
        A ``(tfidf_matrix, search_tfidf)`` tuple:
        ``tfidf_matrix`` has shape ``(n_documents, n_vocabulary)`` and holds
        raw term count times ``log(n_documents / document_frequency)``.
        ``search_tfidf`` has length ``n_vocabulary`` and holds the raw count
        of each vocabulary word in the query (not IDF-weighted); query words
        that are not in the vocabulary are ignored.
    """
    token_pattern = re.compile(r'\b\w+\b')

    # Tokenize each document once and reuse the tokens for both the
    # vocabulary and the counts.
    documents = [token_pattern.findall(text) for text in corpus]
    vocabulary = sorted({word for words in documents for word in words})

    # Word -> column lookup; replaces a linear list.index() scan per token.
    column_of = {word: col for col, word in enumerate(vocabulary)}

    # Term frequencies: one row per document, one column per vocabulary word.
    tf_matrix = np.zeros((len(documents), len(vocabulary)))
    for row, words in enumerate(documents):
        for word in words:
            tf_matrix[row, column_of[word]] += 1

    # Every vocabulary word comes from some document, so the document
    # frequency is at least 1 and the division is safe.
    document_frequency = np.count_nonzero(tf_matrix, axis=0)
    idf = np.log(len(documents) / document_frequency)

    tfidf_matrix = tf_matrix * idf

    # Query vector over the same vocabulary.
    search_tfidf = np.zeros(len(vocabulary))
    for word in token_pattern.findall(search_string):
        col = column_of.get(word)
        if col is not None:
            search_tfidf[col] += 1

    return tfidf_matrix, search_tfidf

# Calculate TF-IDF
start_time = time.time()

corpus = books_df['NAME'].fillna('') + ' ' + books_df['AUTHOR'].fillna('') + ' ' + books_df['DESCRIPTION'].fillna('')
tfidf_matrix, search_tfidf = calculate_tfidf(corpus, search_string)

# Calculate cosine similarity.
# A document with an all-zero TF-IDF row, or a query with no known words, has
# norm 0. Dividing by it gives NaN, and NaN sorts to the end, so reversing the
# order would rank those rows first. They get a similarity of 0 instead.
dot_products = tfidf_matrix @ search_tfidf
norm_products = np.linalg.norm(tfidf_matrix, axis=1) * np.linalg.norm(search_tfidf)
cosine_similarities = np.divide(
    dot_products,
    norm_products,
    out=np.zeros_like(dot_products, dtype=float),
    where=norm_products > 0,
)

# Highest similarity first; a stable sort keeps tied rows in table order.
vsm_results = books_df.iloc[np.argsort(-cosine_similarities, kind="stable")]

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken for TF-IDF search: {elapsed_time} seconds")
timeArr.append(elapsed_time)

print("\nVector Space Model (TF-IDF) Results:")
print(vsm_results)

#### **6.4 BM25:**

BM25 is an enhanced version of TF-IDF that takes into account things like term
saturation and normalization of document length. It is well known for being efficient at retrieving information.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Query passed to calculate_bm25 further down in this cell.
search_string = "Rich"

# Function to calculate BM25 score
def calculate_bm25(corpus, search_string, k1=1.5, b=0.75):
    """Score every document in ``corpus`` against the query with Okapi BM25.

    For each query term ``t`` and document ``d``::

        idf(t) * tf(t, d) * (k1 + 1) / (tf(t, d) + k1 * (1 - b + b * len(d) / avg_len))

    with ``idf(t) = ln((N - df(t) + 0.5) / (df(t) + 0.5) + 1)``. The earlier
    version summed products of L2-normalised TF-IDF weights, which has neither
    the term-saturation nor the document-length normalisation of BM25.

    Side effects: writes the scores to the module-level ``books_df`` as
    column ``BM25_Score`` (``corpus`` is expected to have one entry per row
    of ``books_df``, in the same order), prints the elapsed time and appends
    it to ``timeArr``.

    Args:
        corpus: Iterable of document strings.
        search_string: Query text; tokenized and lower-cased by the same
            ``CountVectorizer`` as the documents.
        k1: Term-frequency saturation parameter.
        b: Strength of the document-length normalisation (0 disables it).

    Returns:
        ``books_df`` sorted by ``BM25_Score`` in descending order.
    """
    start_time = time.time()

    vectorizer = CountVectorizer()

    # Raw term counts; column-oriented so one term's column is cheap to slice.
    term_counts = vectorizer.fit_transform(corpus).tocsc()
    n_docs = term_counts.shape[0]

    # Document lengths in tokens. The vectorizer raises on an empty
    # vocabulary, so the average is positive whenever this line is reached.
    doc_lengths = np.asarray(term_counts.sum(axis=1), dtype=float).ravel()
    length_penalty = k1 * (1.0 - b + b * doc_lengths / doc_lengths.mean())

    # Vocabulary columns of the query terms; unknown words are dropped.
    query_columns = np.unique(vectorizer.transform([search_string]).indices)

    bm25_scores = np.zeros(n_docs)
    for column in query_columns:
        tf = term_counts[:, column].toarray().ravel().astype(float)
        doc_freq = np.count_nonzero(tf)
        idf = np.log((n_docs - doc_freq + 0.5) / (doc_freq + 0.5) + 1.0)
        denominator = tf + length_penalty
        # The denominator is zero only for an empty document when b == 1;
        # such a document scores 0 for the term.
        bm25_scores += idf * np.divide(
            tf * (k1 + 1.0),
            denominator,
            out=np.zeros(n_docs),
            where=denominator > 0,
        )

    # Add BM25 score as a new column in the dataframe (plain 1-D array)
    books_df['BM25_Score'] = bm25_scores

    # Sort the dataframe based on BM25 score in descending order
    bm25_results = books_df.sort_values(by='BM25_Score', ascending=False)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time taken for BM25: {elapsed_time} seconds")
    timeArr.append(elapsed_time)

    return bm25_results

# Build one text document per book: name, author and description separated by
# single spaces, with missing values treated as empty strings.
name_text = books_df['NAME'].fillna('')
author_text = books_df['AUTHOR'].fillna('')
description_text = books_df['DESCRIPTION'].fillna('')
corpus_for_bm25 = name_text + ' ' + author_text + ' ' + description_text

# Score and rank the books against the query.
bm25_results = calculate_bm25(corpus_for_bm25, search_string)

# Show only the columns relevant to the ranking.
bm25_display_columns = ['BID', 'NAME', 'AUTHOR', 'DESCRIPTION', 'BM25_Score']
print("\nBM25 Search Results:")
print(bm25_results[bm25_display_columns])

#### **6.5 TRIE:**

Trie structures can be utilised for prefix-based searches and autocomplete. For both storing and recovering words or phrases, they are effective.

In [None]:
class TrieNode:
    """A single node of a character trie."""

    def __init__(self):
        # children: maps one character to the node reached through it.
        # is_end_of_word: True when an inserted string ends at this node.
        self.children, self.is_end_of_word = {}, False

def insert_word(root, word):
    """Insert ``word`` into the trie rooted at ``root``.

    Walks the trie one character at a time, creating a node for each
    character that has no child yet, and flags the final node as the end of
    an inserted string. An empty ``word`` flags ``root`` itself.
    """
    current = root
    for symbol in word:
        child = current.children.get(symbol)
        if child is None:
            child = TrieNode()
            current.children[symbol] = child
        current = child
    current.is_end_of_word = True

def search_trie(root, query):
    """Return every inserted string that starts with ``query``.

    Args:
        root: Root node of the trie; nodes expose ``children`` (a dict keyed
            by character) and ``is_end_of_word``.
        query: Prefix to look up. An empty prefix matches every stored string.

    Returns:
        A list of the matching strings. ``query`` itself comes first when it
        was inserted; the rest follow in depth-first traversal order. The
        list is empty when no stored string starts with ``query``. The
        earlier version fell back to the longest matching part of the query,
        so a query whose first character was absent returned the whole trie.
    """
    node = root
    prefix = ""

    # Walk down the query; a missing character means nothing can match.
    for char in query:
        node = node.children.get(char)
        if node is None:
            return []
        prefix += char

    results = []
    if node.is_end_of_word:
        results.append(prefix)

    # Iterative depth-first traversal of the subtree below the prefix node.
    stack = [(node, prefix)]
    while stack:
        current_node, current_prefix = stack.pop()
        for char, child_node in current_node.children.items():
            candidate = current_prefix + char
            stack.append((child_node, candidate))
            if child_node.is_end_of_word:
                results.append(candidate)

    return results

# Example search string
search_string = "Rich"

# One trie per column. Each field value is inserted whole (lower-cased), so a
# lookup finds the names, authors or descriptions that start with the query.
trie_root_name = TrieNode()
trie_root_author = TrieNode()
trie_root_description = TrieNode()

for name, author, description in zip(books_df['NAME'].fillna(''), books_df['AUTHOR'].fillna(''), books_df['DESCRIPTION'].fillna('')):
    insert_word(trie_root_name, name.lower())
    insert_word(trie_root_author, author.lower())
    insert_word(trie_root_description, description.lower())

# The tries store lower-cased text, so the query is lower-cased once as well.
trie_query = search_string.lower()

# Search the tries; only the lookups are timed, not the construction above.
start_time = time.time()

trie_results_name = search_trie(trie_root_name, trie_query)
trie_results_author = search_trie(trie_root_author, trie_query)
trie_results_description = search_trie(trie_root_description, trie_query)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken for Trie search: {elapsed_time} seconds")
timeArr.append(elapsed_time)

# Display the Trie search results
print("\nTrie Search Results for NAME:")
for result in trie_results_name:
    print(result)

print("\nTrie Search Results for AUTHOR:")
for result in trie_results_author:
    print(result)

print("\nTrie Search Results for DESCRIPTION:")
for result in trie_results_description:
    print(result)
