## **BITS F407 - ARTIFICIAL INTELLIGENCE**
--------------------------------------------------------------------------------


**PROJECT TITLE: ENHANCING TEXT-PATTERN SEARCH IN SQL DATABASES**
--------------------------------------------------------------------------------

***Team number: 68***


---
**Full names of all students in the team: ANIRUDH BAGALKOTKER, SAKAR HIRDE**

---
**Id number of all students in the team: 2021A7PS2682H, 2021A3PS3203H**


## ***1. Import Dependencies***

In [None]:
%pip install pandas numpy matplotlib mysql-connector fuzzywuzzy scikit-learn python-Levenshtein

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mysql.connector
import time
import math
import Levenshtein

## ***2. Connect to MySQL Database***

In [None]:
import os

# Connection settings default to the local development server but can be
# overridden with the MYSQL_HOST / MYSQL_USER / MYSQL_PASSWORD environment
# variables, so real credentials never have to be written into the notebook.
mysqlConnector = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", "root"),
)

print(mysqlConnector)

# A single cursor is shared by every cell that talks to the database.
mysqlCursor = mysqlConnector.cursor()

# List the databases visible to this account as a quick connectivity check.
mysqlCursor.execute("SHOW DATABASES")

for database in mysqlCursor:
    print(database)

## ***3. Create Database***

In [None]:
# Make sure the project schema exists, then select it as the default schema
# for every statement issued through the shared cursor.
for _statement in ("CREATE DATABASE IF NOT EXISTS AI_PROJ", "USE AI_PROJ"):
    mysqlCursor.execute(_statement)

## ***4. Create Tables***

In [None]:
# BOOKS: one row per title (identifier, ISBN, name, price, description, image,
# author, format, page count, weight, review flag). The statement is a no-op
# when the table already exists.
# NOTE(review): this DDL declares no PRIMARY KEY and no FULLTEXT index.
# MatchQuery runs MATCH(NAME, AUTHOR, DESCRIPTION) AGAINST (...), which MySQL
# only accepts when a FULLTEXT index covers those columns -- confirm that the
# index is created elsewhere (e.g. in MySQL Workbench).
mysqlCursor.execute("CREATE TABLE IF NOT EXISTS `BOOKS` ( `BID` int(10) UNSIGNED NOT NULL, `ISBN` bigint(13) UNSIGNED DEFAULT NULL, `NAME` varchar(200) COLLATE utf8_unicode_ci NOT NULL, `MRP` int(6) UNSIGNED DEFAULT NULL, `DESCRIPTION` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL, `IMG` varchar(3000) COLLATE utf8_unicode_ci DEFAULT NULL, `AUTHOR` varchar(100) COLLATE utf8_unicode_ci DEFAULT NULL, `FORMAT` varchar(25) COLLATE utf8_unicode_ci DEFAULT 'PAPERBACK' COMMENT 'PAPERBACK OR HARDCOVER', `PAGES` smallint(4) DEFAULT NULL, `WEIGHT` smallint(5) DEFAULT NULL, `REVIEW` tinyint(1) UNSIGNED NOT NULL DEFAULT '0') ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;")

# INVENTORY: one row per stocked item; its BID column is what the search
# queries join against BOOKS.BID. No foreign key is declared here.
mysqlCursor.execute("CREATE TABLE IF NOT EXISTS `INVENTORY` ( `PID` bigint(20) UNSIGNED NOT NULL, `BID` int(10) UNSIGNED NOT NULL, `SID` int(10) UNSIGNED NOT NULL, `COND` varchar(25) COLLATE utf8_unicode_ci NOT NULL DEFAULT 'NEW', `QTY` smallint(5) UNSIGNED NOT NULL DEFAULT '1', `CP` float(7,2) UNSIGNED NOT NULL, `SP` float(7,2) UNSIGNED NOT NULL, `DISCOUNT` float(3,1) UNSIGNED DEFAULT '0.0', `LANG` varchar(25) COLLATE utf8_unicode_ci DEFAULT 'ENGLISH', `PI` int(10) UNSIGNED NOT NULL DEFAULT '0', `CREATED` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, `MODIFIED` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;")

The tables were already created through MySQL Workbench (using SQL), and we generated sample data and loaded it into them.

## ***5. SQL In-Built Functions Query***

SQL offers four built-in text-matching constructs (availability depends on the database engine):

1. **LIKE**
2. **SIMILAR TO**
3. **CONTAINS**
4. **MATCH**

Let's apply the LIKE and MATCH queries and measure the time taken to execute each of them.

In [None]:
# Elapsed times, in seconds, appended by the timed searches in this notebook.
timeArr = []

In [None]:
def MatchQuery(searchString):
    """Run the MATCH ... AGAINST search, print the rows and record the time.

    The query joins INVENTORY with BOOKS on BID, keeps rows whose
    NAME/AUTHOR/DESCRIPTION match ``searchString`` in boolean full-text mode
    and whose NAME or AUTHOR also contains it as a substring, and orders
    them by relevance.

    Args:
        searchString: Text to search for. It is passed as a bound parameter,
            never interpolated into the SQL.

    Side effects:
        Prints every result row and the elapsed time, and appends the elapsed
        time (seconds) to the module-level ``timeArr``.
    """
    sql = """
    SELECT DISTINCT
        I.PID, I.BID, I.SID, I.COND, I.QTY, I.CP, I.SP, I.DISCOUNT, I.LANG, B.ISBN, B.NAME,
        B.MRP, B.DESCRIPTION, B.IMG, B.AUTHOR, B.FORMAT, B.PAGES, B.WEIGHT, B.REVIEW,
    MATCH(B.NAME, B.AUTHOR, B.DESCRIPTION) AGAINST (%s IN BOOLEAN MODE)
    AS relevance FROM INVENTORY I JOIN BOOKS B ON I.BID = B.BID
    WHERE MATCH(B.NAME, B.AUTHOR, B.DESCRIPTION) AGAINST (%s IN BOOLEAN MODE)
    AND (B.NAME LIKE %s OR B.AUTHOR LIKE %s) ORDER BY relevance DESC
    """
    like_pattern = f"%{searchString}%"

    # Time only the database work (execute + fetch). Printing and artificial
    # pauses are kept out of the measurement so that the value in timeArr is
    # comparable with the other search timings.
    start_time = time.perf_counter()
    mysqlCursor.execute(sql, (searchString, searchString, like_pattern, like_pattern))
    results = mysqlCursor.fetchall()
    elapsed_time = time.perf_counter() - start_time

    for result in results:
        print(result)

    print(f"Time taken for MatchQuery: {elapsed_time} seconds")
    timeArr.append(elapsed_time)

def LikeQuery(searchString):
    """Run the LIKE-based substring search, print the rows and record the time.

    The query joins INVENTORY with BOOKS on BID and keeps rows whose NAME,
    AUTHOR or DESCRIPTION contains ``searchString``.

    Args:
        searchString: Text to search for. It is wrapped in ``%`` wildcards and
            passed as a bound parameter; ``%`` or ``_`` inside it are not
            escaped and therefore act as LIKE wildcards.

    Side effects:
        Prints every result row and the elapsed time, and appends the elapsed
        time (seconds) to the module-level ``timeArr``.
    """
    sql = """
        SELECT DISTINCT I.PID, I.BID, I.SID, I.COND, I.QTY, I.CP, I.SP, I.DISCOUNT, I.LANG,
            B.ISBN, B.NAME, B.MRP, B.DESCRIPTION, B.IMG, B.AUTHOR, B.FORMAT,
            B.PAGES, B.WEIGHT, B.REVIEW
        FROM INVENTORY I
        JOIN BOOKS B ON I.BID = B.BID
        WHERE (B.NAME LIKE %s OR B.AUTHOR LIKE %s OR B.DESCRIPTION LIKE %s)
    """
    like_pattern = f"%{searchString}%"

    # Time only the database work (execute + fetch). Printing and artificial
    # pauses are kept out of the measurement so that the value in timeArr is
    # comparable with the other search timings.
    start_time = time.perf_counter()
    mysqlCursor.execute(sql, (like_pattern, like_pattern, like_pattern))
    results = mysqlCursor.fetchall()
    elapsed_time = time.perf_counter() - start_time

    for result in results:
        print(result)

    print(f"Time taken for LikeQuery: {elapsed_time} seconds")
    timeArr.append(elapsed_time)

# Run both SQL searches with the same term so their timings can be compared.
search = "Rich"
for _run_query in (MatchQuery, LikeQuery):
    _run_query(search)

## ***6. AI Search Algorithms***

#### **6.1 Importing the Tables into a DataFrame**

In [None]:
# --- BOOKS ---
# SELECT * returns columns positionally, so the labels below are listed in the
# same order as the columns of the BOOKS table definition.
mysqlCursor.execute("SELECT * FROM BOOKS")
books_data = mysqlCursor.fetchall()
books_df = pd.DataFrame(
    books_data,
    columns=[
        "BID", "ISBN", "NAME", "MRP", "DESCRIPTION", "IMG",
        "AUTHOR", "FORMAT", "PAGES", "WEIGHT", "REVIEW",
    ],
)
print("BOOKS Table:", books_df, sep="\n")

# --- INVENTORY ---
# Same approach: fetch every row, then label the columns in table order.
mysqlCursor.execute("SELECT * FROM INVENTORY")
inventory_data = mysqlCursor.fetchall()
inventory_df = pd.DataFrame(
    inventory_data,
    columns=[
        "PID", "BID", "SID", "COND", "QTY", "CP", "SP",
        "DISCOUNT", "LANG", "PI", "CREATED", "MODIFIED",
    ],
)
print("\nINVENTORY Table:", inventory_df, sep="\n")

#### **6.2 Full-Text Search**

This core search method enables quick and effective text-based searches. We use Python's `re` module, which supports regular expressions (REs). A regular expression can contain both special and ordinary characters; most ordinary characters, such as "A", "a", or "0", are the simplest regular expressions: they simply match themselves.

In [None]:
import re

search_term = "Rich"

# Escape the term once so that characters such as "." or "+" are matched
# literally rather than interpreted as regular-expression syntax.
search_pattern = re.escape(search_term)


def _rows_containing(frame, columns, pattern):
    """Return a boolean mask of the rows where any of `columns` matches `pattern`.

    Matching is case-insensitive. Missing values count as "no match"
    (na=False), and the per-column masks are OR-ed together so a row that
    matches in several columns is still selected only once.
    """
    mask = pd.Series(False, index=frame.index)
    for col in columns:
        mask |= frame[col].str.contains(pattern, case=False, na=False).astype(bool)
    return mask


# Filtered dataframes
filtered_inventory_df = inventory_df[_rows_containing(inventory_df, ['COND', 'LANG'], search_pattern)]

# Only the BOOKS search is timed.
start_time = time.perf_counter()
filtered_books_df = books_df[_rows_containing(books_df, ['NAME', 'AUTHOR', 'DESCRIPTION'], search_pattern)]
elapsed_time = time.perf_counter() - start_time
print(f"Time taken for Full-Text Search: {elapsed_time} seconds")

print("Filtered INVENTORY Table:")
print(filtered_inventory_df)

print("\nFiltered BOOKS Table:")
print(filtered_books_df)

#### **6.3 Vector Space Model (TF-IDF):**

TF-IDF is a statistical tool for assessing a word's significance in relation to a group of documents. It is frequently applied to text-based searches.

In [None]:
# Term that the fuzzy search and the hand-rolled TF-IDF search look for.
search_string = "Rich"

# Function to calculate Levenshtein distance ratio
def levenshtein_ratio(s1, s2):
    """Return a similarity score between 0 and 1 based on edit distance.

    The score is ``1 - distance / max(len(s1), len(s2))``: 1.0 means the
    strings are identical and 0.0 means every position has to be edited.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The similarity as a float; two empty strings score 1.0.
    """
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Both strings are empty: they are identical, and dividing by the
        # length would raise ZeroDivisionError.
        return 1.0
    return 1 - (Levenshtein.distance(s1, s2) / longest)

# Fuzzy search: keep a row when at least one of its cells, compared as a whole
# lower-cased string, is more than 80% similar to the search string.
def _is_fuzzy_match(row, needle=search_string.lower()):
    for cell in row:
        if levenshtein_ratio(needle, str(cell).lower()) > 0.8:
            return True
    return False


fuzzy_results = books_df[books_df.apply(_is_fuzzy_match, axis=1)]

# Show the matching rows
print("\nFuzzy Search Results:", fuzzy_results, sep="\n")

In [None]:
# Function to calculate TF-IDF
def tfidf(term, document, corpus):
    """Return the TF-IDF weight of `term` in `document` relative to `corpus`.

    Matching is case-insensitive and substring based:

    * tf  = occurrences of `term` in `document` / whitespace-separated words
      in `document`
    * idf = ln(len(corpus) / (1 + documents in `corpus` containing `term`))

    Args:
        term: The search term.
        document: The text being scored.
        corpus: Sized iterable of every document text.

    Returns:
        ``tf * idf``, or 0.0 when `document` has no words or `corpus` is
        empty (both would otherwise divide by zero or produce -inf/NaN).
    """
    words = document.split()
    if not words or len(corpus) == 0:
        return 0.0

    needle = term.lower()
    tf = document.lower().count(needle) / len(words)
    doc_freq = sum(needle in doc.lower() for doc in corpus)
    idf = np.log(len(corpus) / (1 + doc_freq))
    return tf * idf

# Create a corpus from the specified columns
corpus = books_df[['NAME', 'AUTHOR', 'DESCRIPTION']].fillna('').apply(lambda row: ' '.join(row), axis=1)

# Score every document against each query term -> shape (n_docs, n_terms).
# The explicit reshape keeps the array 2-D even when the corpus is empty.
query_terms = search_string.split()
tfidf_matrix = np.array(
    [[tfidf(term, doc, corpus) for term in query_terms] for doc in corpus],
    dtype=float,
).reshape(len(corpus), len(query_terms))

# Represent the query in the same term space by scoring it as if it were a
# document, so each document can be compared with the QUERY rather than with
# every other document.
query_vector = np.array([tfidf(term, search_string, corpus) for term in query_terms], dtype=float)

# Cosine similarity between each document vector and the query vector.
# Documents (or a query) with a zero vector have no direction; their
# similarity is defined as 0 instead of dividing by zero and producing NaN.
denominators = np.linalg.norm(tfidf_matrix, axis=1) * np.linalg.norm(query_vector)
cosine_similarities = np.zeros(len(corpus))
has_direction = denominators > 0
cosine_similarities[has_direction] = (tfidf_matrix[has_direction] @ query_vector) / denominators[has_direction]

# Get the indices of documents with similarity above a threshold
similarity_threshold = 0.1  # Adjust the threshold as needed
similar_indices = np.where(cosine_similarities > similarity_threshold)

# Get the results dataframe (one row per matching document, in table order)
vsm_results = books_df.iloc[similar_indices[0]]

# Display the results
print("\nVector Space Model (TF-IDF) Results:")
print(vsm_results)

#### **6.4 BM25:**

BM25 is an enhanced version of TF-IDF that takes into account things like term
saturation and normalization of document length. It is well known for being efficient at retrieving information.

In [None]:
def bm25(term, document, documents, k=1.5, b=0.75):
    """Return the BM25 score of `term` for `document` within `documents`.

    `document` and every entry of `documents` must support ``count``/``in``
    and ``len`` (for example a list of tokens). Matching is exact and
    case-sensitive.

    Args:
        term: The search term.
        document: The document being scored.
        documents: The whole collection, used for the document frequency and
            the average document length.
        k: Term-frequency saturation parameter.
        b: Document-length normalisation strength (0 disables it).

    Returns:
        The score as a float; 0.0 when `term` does not occur in `document`.
    """
    tf = document.count(term)
    if tf == 0:
        # The numerator of the score is zero, so skip the scans over the
        # collection. This also covers a collection made only of empty
        # documents, where the average length would be zero.
        return 0.0

    n_docs = len(documents)
    doc_freq = sum(1 for doc in documents if term in doc)
    idf = math.log((n_docs - doc_freq + 0.5) / (doc_freq + 0.5) + 1)

    avgdl = sum(len(doc) for doc in documents) / n_docs if n_docs else 0
    # Without a usable average length, fall back to a neutral length ratio
    # instead of dividing by zero.
    length_ratio = len(document) / avgdl if avgdl else 1.0

    score = idf * (tf * (k + 1)) / (tf + k * (1 - b + b * length_ratio))
    return score

# Example Usage
search_term = 'Rich'

# Apply BM25 to relevant columns in inventory_df. Each cell is split on
# whitespace into tokens; missing values become empty documents. The scores
# are stored in a new '<column>_bm25_score' column of the dataframe.
start_time = time.perf_counter()
for col in ['COND', 'LANG']:
    documents = inventory_df[col].apply(lambda x: str(x).split() if pd.notnull(x) else []).tolist()
    scores = [bm25(search_term, doc, documents) for doc in documents]
    inventory_df[f'{col}_bm25_score'] = scores

elapsed_time = time.perf_counter() - start_time
print(f"Time taken for BM25 on INVENTORY: {elapsed_time} seconds")

# Apply BM25 to relevant columns in books_df
start_time = time.perf_counter()
for col in ['NAME', 'AUTHOR', 'DESCRIPTION']:
    documents = books_df[col].apply(lambda x: str(x).split() if pd.notnull(x) else []).tolist()
    scores = [bm25(search_term, doc, documents) for doc in documents]
    books_df[f'{col}_bm25_score'] = scores

elapsed_time = time.perf_counter() - start_time
print(f"Time taken for BM25 on BOOKS: {elapsed_time} seconds")
# Only the BOOKS timing is recorded in timeArr.
timeArr.append(elapsed_time)

# Display the dataframes with BM25 scores
print("INVENTORY Table with BM25 scores:")
print(inventory_df)

print("\nBOOKS Table with BM25 scores:")
print(books_df)

In [None]:
import pandas as pd
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Sample data retrieval
# Assuming you have already created 'books_df' and 'inventory_df' dataframes

# Example search string
search_string = "example"
search_lower = search_string.lower()

# Full-Text Search: keep a row when any cell contains the search string
# (case-insensitive substring match on the cell's string form).
full_text_results = books_df[books_df.apply(lambda row: any(search_lower in str(cell).lower() for cell in row), axis=1)]

# Fuzzy Search using Levenshtein distance (fuzz.ratio scores from 0 to 100)
fuzzy_results = books_df[books_df.apply(lambda row: any(fuzz.ratio(search_lower, str(cell).lower()) > 80 for cell in row), axis=1)]

# Vector Space Model (VSM) using TF-IDF.
# The vectorizer expects an iterable of strings, one per document. Iterating
# a DataFrame yields its column labels, so the text columns are first joined
# into a single string per row.
text_corpus = books_df[['NAME', 'AUTHOR', 'DESCRIPTION']].fillna('').apply(lambda row: ' '.join(row), axis=1)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(text_corpus)

# TfidfVectorizer L2-normalises each row by default, so the linear kernel
# (dot product) with the query vector is the cosine similarity. The result
# has one row per document, flattened here to one score per document.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_vectorizer.transform([search_string])).ravel()

# Rank documents from most to least similar and drop those that share no
# term with the query.
ranked_positions = cosine_similarities.argsort()[::-1]
ranked_positions = ranked_positions[cosine_similarities[ranked_positions] > 0]
vsm_results = books_df.iloc[ranked_positions]

# Boolean Search: currently the same substring predicate as the full-text
# search above, so its result is reused rather than recomputed.
boolean_results = full_text_results.copy()

# Display the results
print("Full-Text Search Results:")
print(full_text_results)

print("\nFuzzy Search Results:")
print(fuzzy_results)

print("\nVector Space Model (TF-IDF) Results:")
print(vsm_results)

print("\nBoolean Search Results:")
print(boolean_results)
