In [32]:
import glob
import os
import string
from nltk.stem import PorterStemmer
import math 
import json
import numpy as np


Dictionary = {} # Global dictionary
DocVectors = {} # Global dictionary for Document Vectors

def load_data():
    flag = False
    if os.path.exists('Dict.json'):
        flag = True
        print("reading Dict")
        with open('Dict.json', 'r') as f:
            Dictionary.update(json.load(f))
    if os.path.exists('TFIDFVec.json'):
        flag = True
        print("reading TFIDFVec")
        with open('TFIDFVec.json', 'r') as f:
            DocVectors.update(json.load(f))
    return flag

def FileRead(): 
    Folder = 'ResearchPapers'
    Pattern = '*.txt' 
    FList = glob.glob(os.path.join(Folder, Pattern)) #Finding all Files in the given Folder 
    for Path in FList: 
        with open(Path, 'r') as file: 
            FileContents = file.read() #Reading File text
            FileContents = FileContents.lower()
            File_name = Path.strip("ResearchPapers\\.txt")
            FileContents = PunctuationRemove(FileContents)# Removing Punctuations
            FileContents = FileContents.split() # Tokenizing string
            Stemmer = PorterStemmer()
            FileStem = []
            #Applying Stemming to all the tokens
            for words in list(FileContents):
                FileStem.append(Stemmer.stem(words))
            File_name = int(File_name)
            Dictionary = DictionaryBuilder(FileStem,File_name)
            Dictionary = sorted(Dictionary.items()) # Sorting the Dictionary by tokens
            Dictionary = dict(Dictionary)
    with open('Dict.json', 'w') as f:
        json.dump(Dictionary, f)
    # Initializing all Document Vectors with 0 for every word
    for i in range(1,27):
        if (i == 4 or i==5 or i==6 or i==10 or i==19 or i==20):
            continue
        DocVectors[i] = [0] * len(Dictionary)
    return Dictionary

def PunctuationRemove(File):
    # Function to remove punctuation marks from text
    File = File.replace('-', ' ')  # Replacing hyphens with spaces
    File = File.translate(str.maketrans("", "", string.punctuation))
    return File

def DictionaryBuilder(File,File_Name):
    Stop = open(r'Stopword-List.txt', 'r')
    StopContents = Stop.read()
    StopContents = StopContents.split()
    for words in File: # Building Dictionary
        if(words not in StopContents):
            if(words not in Dictionary): # First time a word is added to Dictionary
                Dictionary[words] = {}
                Dictionary[words][File_Name] = 1 # Setting Term Frequency for the document to 1
            else:
                if(File_Name not in Dictionary[words]):
                    Dictionary[words][File_Name] = 1 # Setting Term Frequency for the document to 1
                else:
                    Dictionary[words][File_Name] += 1 # Incrementing Term Frequency
    return Dictionary   

def BuildDocumentVectors():
    for Index, Key in enumerate(Dictionary): # Traversing through words in Dictionary
        for DocKeys in DocVectors.keys(): # Traversing through all Documents
            if(DocKeys in Dictionary[Key]):
                DocFreq = len(Dictionary[Key]) 
                InvertedDocFreq = round(math.log(len(DocVectors) / DocFreq, 10),2) # Calculating Inverted Document Frequency
                TfIdf = InvertedDocFreq * Dictionary[Key][DocKeys] 
                DocVectors[DocKeys][Index] = TfIdf
    with open('TFIDFVec.json', 'w') as f:
        json.dump(DocVectors, f)

def QueryProcessor(Query):
    Query = Query.split()
    Query = QueryStemmer(Query)
    QueryVector = [0] * len(Dictionary) # Initializing Query Vector 
    QueryDict = {}
    for words in Query: # Building Dictionary for Query
        if(words not in QueryDict): # First time a word is added to Dictionary
            QueryDict[words] = 1
        else:
            QueryDict[words] += 1
    for Index, Key in enumerate(Dictionary): # Traversing Dictionary
            if(Key in QueryDict):
                DocFreq = len(Dictionary[Key])
                InvertedDocFreq = math.log(len(DocVectors) / DocFreq, 10)
                TfIdf = InvertedDocFreq * QueryDict[Key]
                QueryVector[Index] = TfIdf
    return QueryVector

def QueryStemmer(Query):
    StemQuery = []
    Stop = open(r'Stopword-List.txt', 'r')
    StopContents = Stop.read()
    StopContents = StopContents.split()
    Stemmer = PorterStemmer()
    Query = [Val for Val in Query if Val not in StopContents]
    for words in Query:
        StemQuery.append(Stemmer.stem(words))
    return StemQuery  

Dictionary = FileRead()
BuildDocumentVectors()

# def search_query():
#     query = input("Enter your query: ")
#     query_vector = QueryProcessor(query)
#     results = Solver(query_vector)

#     if not results:
#         print("No documents match the query.")
#     else:
#         print("Search Results sorted by cosine similarity:")
#         for result in results:
#             print(f"Document ID: {result[0]}, Cosine Similarity: {result[1]}")

# # Call the search_query function to start searching
# search_query()

import json
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

doc_classes = {
    '1': "Explainable Artificial Intelligence",
    '2': "Explainable Artificial Intelligence",
    '3': "Explainable Artificial Intelligence",
    '7': "Explainable Artificial Intelligence",
    '8': "Heart Failure",
    '9': "Heart Failure",
    '11': "Heart Failure",
    '12': "Time Series Forecasting",
    '13': "Time Series Forecasting",
    '14': "Time Series Forecasting",
    '15': "Time Series Forecasting",
    '16': "Time Series Forecasting",
    '17': "Transformer Model",
    '18': "Transformer Model",
    '21': "Transformer Model",
    '22': "Feature Selection",
    '23': "Feature Selection",
    '24': "Feature Selection",
    '25': "Feature Selection",
    '26': "Feature Selection"
}

def LoadTFIDFVectors(file_path):
    with open(file_path, 'r') as f:
        tfidf_vectors = json.load(f)
    return tfidf_vectors

def PrepareData(tfidf_vectors):
    X = []  # Feature vectors
    y = []  # Class labels
    
    for doc_id, vector in tfidf_vectors.items():
        X.append(vector)
        # Assuming you have a dictionary 'doc_classes' mapping document IDs to class labels
        y.append(doc_classes[doc_id])
    
    return X, y

# Load TF-IDF vectors from JSON file
tfidf_vectors = LoadTFIDFVectors('TFIDFVec.json')

# Prepare data for classification
X, y = PrepareData(tfidf_vectors)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the k-NN classifier
# k = int(input("Enter K"))  # Number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train, y_train)

# Classify the test data
y_pred = knn_classifier.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred, output_dict=True)
print(report)
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = accuracy_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Accuracy:", accuracy)


paragraph6 = """
Transformer are translating text and speech in near real-time, opening meetings and classrooms to diverse and hearing-impaired attendees. They’re helping researchers understand the chains of genes in DNA and amino acids in proteins in ways that can speed drug design.
"""
print("\n\n//////////////////////////////////////////////////////////")
Vector_for_query=QueryProcessor(paragraph6)
Vector_for_query = np.array(Vector_for_query).reshape(1, -1)
Answer=knn_classifier.predict(Vector_for_query)
print(Answer)

{'Explainable Artificial Intelligence': {'precision': 0.6666666666666666, 'recall': 1.0, 'f1-score': 0.8, 'support': 2}, 'Feature Selection': {'precision': 1.0, 'recall': 0.5, 'f1-score': 0.6666666666666666, 'support': 2}, 'accuracy': 0.75, 'macro avg': {'precision': 0.8333333333333333, 'recall': 0.75, 'f1-score': 0.7333333333333334, 'support': 4}, 'weighted avg': {'precision': 0.8333333333333333, 'recall': 0.75, 'f1-score': 0.7333333333333334, 'support': 4}}
Precision: 0.8333333333333333
Recall: 0.75
F1 Score: 0.7333333333333334
Accuracy: 0.75


//////////////////////////////////////////////////////////
['Explainable Artificial Intelligence']


In [42]:
#######################################   CLASSIFICATION USING KNN CLASSIFICATION ALGORITHM
import glob
import os
import string
from nltk.stem import PorterStemmer
import math 
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Global dictionaries for storing TF-IDF vectors and document classes
Dictionary = {}
DocClasses = {
    '1': "Explainable Artificial Intelligence",
    '2': "Explainable Artificial Intelligence",
    '3': "Explainable Artificial Intelligence",
    '7': "Explainable Artificial Intelligence",
    '8': "Heart Failure",
    '9': "Heart Failure",
    '11': "Heart Failure",
    '12': "Time Series Forecasting",
    '13': "Time Series Forecasting",
    '14': "Time Series Forecasting",
    '15': "Time Series Forecasting",
    '16': "Time Series Forecasting",
    '17': "Transformer Model",
    '18': "Transformer Model",
    '21': "Transformer Model",
    '22': "Feature Selection",
    '23': "Feature Selection",
    '24': "Feature Selection",
    '25': "Feature Selection",
    '26': "Feature Selection"
}

def load_data():
    flag = False
    if os.path.exists('Dict.json'):
        flag = True
        print("Reading Dict.json")
        with open('Dict.json', 'r') as f:
            Dictionary.update(json.load(f))
    return flag

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def tokenize_and_stem(text):
    stemmer = PorterStemmer()
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

def build_dictionary():
    Folder = 'ResearchPapers'
    Pattern = '*.txt' 
    FList = glob.glob(os.path.join(Folder, Pattern)) 
    for Path in FList: 
        with open(Path, 'r') as file: 
            FileContents = file.read() 
            FileContents = preprocess_text(FileContents)
            FileContents = tokenize_and_stem(FileContents)
            File_name = int(os.path.basename(Path).split('.')[0])
            Dictionary = dictionary_builder(FileContents, File_name)
    with open('Dict.json', 'w') as f:
        json.dump(Dictionary, f)

def dictionary_builder(file, file_name):
    Stop = open(r'Stopword-List.txt', 'r')
    StopContents = Stop.read()
    StopContents = StopContents.split()
    for word in file: 
        if word not in StopContents:
            if word not in Dictionary:
                Dictionary[word] = {}
                Dictionary[word][file_name] = 1
            else:
                if file_name not in Dictionary[word]:
                    Dictionary[word][file_name] = 1
                else:
                    Dictionary[word][file_name] += 1
    return Dictionary   

def build_document_vectors():
    global DocVectors
    DocVectors = {doc_key: [0] * len(Dictionary) for doc_key in range(1, 27)}
    for index, key in enumerate(Dictionary):
        for doc_key in DocVectors.keys():
            if doc_key in Dictionary[key]:
                doc_freq = len(Dictionary[key])
                inverted_doc_freq = round(math.log(len(DocVectors) / doc_freq, 10), 2)
                tf_idf = inverted_doc_freq * Dictionary[key][doc_key] 
                DocVectors[doc_key][index] = tf_idf
    with open('TFIDFVec.json', 'w') as f:
        json.dump(DocVectors, f)

def prepare_data():
    X = []
    y = []
    for doc_id, vector in DocVectors.items():
        X.append(vector)
        y.append(DocClasses[str(doc_id)])
    return X, y

def train_knn_classifier(X_train, y_train, k):
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(X_train, y_train)
    return knn_classifier


def evaluate_classifier(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    f1_score = report['weighted avg']['f1-score']
    return report, accuracy, precision, recall, f1_score

def main():
    if not load_data():
        build_dictionary()
        build_document_vectors()
    X, y = prepare_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    k = int(input("Enter the value of K for KNN: "))
    knn_classifier = train_knn_classifier(X_train, y_train, k)
    report, accuracy, precision, recall, f1_score = evaluate_classifier(knn_classifier, X_test, y_test)
    print("Classification Report:")
    print(report)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

    # Classify a new query
    paragraph = """
    Transformers are translating text and speech in near real-time, opening meetings and classrooms to diverse and hearing-impaired attendees. They’re helping researchers understand the chains of genes in DNA and amino acids in proteins in ways that can speed drug design.
    """
    query_vector = np.array(QueryProcessor(paragraph)).reshape(1, -1)
    answer = knn_classifier.predict(query_vector)
    print("Predicted class for the query paragraph:", answer)

if __name__ == "__main__":
    main()


Reading Dict.json


Enter the value of K for KNN:  5


Classification Report:
{'Explainable Artificial Intelligence': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2}, 'Feature Selection': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 2}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 4}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 4}}
Accuracy: 0.5
Precision: 0.25
Recall: 0.5
F1 Score: 0.3333333333333333
Predicted class for the query paragraph: ['Feature Selection']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
#######################################   CLUSTERING USING K-MEANS CLUSTERING ALGORITHM

import json
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import rand_score
from collections import defaultdict

# Load TF-IDF vectors from JSON file
def load_tfidf_vectors(file_path):
    with open(file_path, 'r') as f:
        tfidf_vectors = json.load(f)
    return tfidf_vectors

# Prepare TF-IDF vectors for clustering
def prepare_data(tfidf_vectors):
    X = np.array(list(tfidf_vectors.values()))
    doc_ids = list(tfidf_vectors.keys())
    return X, doc_ids

# Perform k-Means clustering
def perform_clustering(X, k):
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(X)
    return cluster_labels, kmeans

# Evaluate clustering using Purity
def calculate_purity(cluster_labels, doc_ids, doc_classes):
    cluster_docs = defaultdict(list)
    for i, label in enumerate(cluster_labels):
        cluster_docs[label].append(doc_ids[i])
    
    purity_sum = 0
    for docs in cluster_docs.values():
        true_labels = [doc_classes[doc_id] for doc_id in docs]
        majority_class = max(set(true_labels), key=true_labels.count)
        purity_sum += true_labels.count(majority_class)
    
    purity = purity_sum / len(doc_ids)
    return purity

# Display documents in each cluster
def display_clusters(cluster_labels, doc_ids):
    cluster_docs = defaultdict(list)
    for i, label in enumerate(cluster_labels):
        cluster_docs[label].append(doc_ids[i])
    
    for cluster, docs in cluster_docs.items():
        print(f"Cluster {cluster + 1}:")
        for doc_id in docs:
            print(doc_id)
        print("=" * 50)

# Load document classes
doc_classes = {
    '1': "Explainable Artificial Intelligence",
    '2': "Explainable Artificial Intelligence",
    '3': "Explainable Artificial Intelligence",
    '7': "Explainable Artificial Intelligence",
    '8': "Heart Failure",
    '9': "Heart Failure",
    '11': "Heart Failure",
    '12': "Time Series Forecasting",
    '13': "Time Series Forecasting",
    '14': "Time Series Forecasting",
    '15': "Time Series Forecasting",
    '16': "Time Series Forecasting",
    '17': "Transformer Model",
    '18': "Transformer Model",
    '21': "Transformer Model",
    '22': "Feature Selection",
    '23': "Feature Selection",
    '24': "Feature Selection",
    '25': "Feature Selection",
    '26': "Feature Selection"
}

# Load TF-IDF vectors
tfidf_vectors = load_tfidf_vectors('TFIDFVec.json')

# Prepare data for clustering
X, doc_ids = prepare_data(tfidf_vectors)

# Choose K
k = int(input("Enter the number of clusters (K): "))

# Perform clustering
cluster_labels, kmeans = perform_clustering(X, k)

# Evaluate clustering using Purity
purity = calculate_purity(cluster_labels, doc_ids, doc_classes)

# Evaluate clustering using Silhouette Score
silhouette_avg = silhouette_score(X, cluster_labels)

# Evaluate clustering using Rand Index
rand_idx = rand_score([doc_classes[doc_id] for doc_id in doc_ids], cluster_labels)

# Display clustering results
print("Clustering Results:")
print("=" * 50)
print("Purity:", purity)
print("Silhouette Score:", silhouette_avg)
print("Rand Index:", rand_idx)
print("=" * 50)
display_clusters(cluster_labels, doc_ids)

Enter the number of clusters (K):  5


Clustering Results:
Purity: 0.4
Silhouette Score: 0.20873610487625385
Rand Index: 0.3894736842105263
Cluster 2:
1
2
3
8
9
11
13
14
15
16
17
18
21
23
24
25
Cluster 1:
7
Cluster 3:
12
Cluster 5:
22
Cluster 4:
26


