<a href="https://colab.research.google.com/github/Avinavshrestha/Avinav_Tech400/blob/main/Avinavshrestha_Week7_TECH400_Precision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import math
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt

def load_documents(directory):
    """Read every regular file directly inside *directory* as UTF-8 text.

    Entries are visited in sorted filename order so that document ids and the
    order of ``filenames`` are reproducible across runs and platforms
    (``os.listdir`` returns entries in arbitrary order). Sub-directories are
    skipped and do not consume a document id.

    Args:
        directory: Path of the folder holding the corpus files.

    Returns:
        A ``(documents, filenames)`` tuple where ``documents`` maps a 1-based
        document id to the file's text and ``filenames[i]`` is the name of the
        file whose id is ``i + 1``.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = {}
    filenames = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        with open(filepath, 'r', encoding='utf-8') as file:
            # Ids are derived from the number of files kept so far, which keeps
            # them consecutive and aligned with positions in ``filenames``.
            documents[len(filenames) + 1] = file.read()
        filenames.append(filename)
    return documents, filenames

def tokenize(text):
    """Lower-case *text* and split it on runs of whitespace.

    Returns the resulting list of tokens; an empty or all-whitespace string
    yields an empty list.
    """
    lowered = text.lower()
    return lowered.split()

# Read the corpus from disk
docs, filenames = load_documents('/content/directory')

# Split every document into lower-cased tokens
tokenized_docs = list(map(tokenize, docs.values()))

# Search queries to evaluate
queries = ['shooting']
tokenized_queries = list(map(tokenize, queries))

print("Queries:", queries)

# Vocabulary: every distinct token in the corpus, in sorted order
vocab = sorted({word for doc in tokenized_docs for word in doc})

def term_frequency(term, document):
    """Return the share of tokens in *document* that are equal to *term*.

    *document* is a sequence of tokens. An empty document yields ``0`` instead
    of dividing by zero.
    """
    total_tokens = len(document)
    if total_tokens <= 0:
        return 0
    return document.count(term) / total_tokens

def inverse_document_frequency(term, docs):
    """Return the inverse document frequency ``log(N / df)`` of *term*.

    ``N`` is the number of documents in *docs* and ``df`` is how many of them
    contain *term*.

    The unseen-term case is handled explicitly, so no ``+ 1`` smoothing of the
    denominator is needed. Dividing by ``1 + df`` would make the weight
    negative for a term present in every document and zero for a term present
    in all but one, which distorts the TF-IDF vectors built from it. With the
    plain ratio the result is never negative, and is exactly zero only for
    terms that occur in every document.

    Args:
        term: Token to look up.
        docs: Collection of tokenised documents (each a container of tokens).

    Returns:
        ``log(N / df)`` as a float, or ``0.0`` when no document contains
        *term* (which also covers an empty *docs*).
    """
    num_docs_containing_term = sum(1 for doc in docs if term in doc)
    if num_docs_containing_term == 0:
        return 0.0
    return math.log(len(docs) / num_docs_containing_term)

def compute_tfidf_vector(document, vocab, docs):
    """Build the TF-IDF weight vector of *document* over *vocab*.

    Args:
        document: Token list to vectorise.
        vocab: Ordered terms; entry ``i`` of the result belongs to ``vocab[i]``.
        docs: Tokenised corpus passed on to the IDF computation.

    Returns:
        A list with one ``tf * idf`` weight per vocabulary term.
    """
    return [
        term_frequency(term, document) * inverse_document_frequency(term, docs)
        for term in vocab
    ]

# TF-IDF representation of each document over the shared vocabulary
tfidf_docs = [
    compute_tfidf_vector(doc_tokens, vocab, tokenized_docs)
    for doc_tokens in tokenized_docs
]

# Queries are embedded in the same space, with IDF taken from the corpus
tfidf_queries = [
    compute_tfidf_vector(query_tokens, vocab, tokenized_docs)
    for query_tokens in tokenized_queries
]

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between *vec1* and *vec2*.

    When either vector has zero length the cosine is undefined, so ``0`` is
    returned instead of dividing by zero.
    """
    numerator = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return numerator / denominator if denominator != 0 else 0

# similarities[q][d] is the cosine similarity between query q and document d
similarities = [
    [cosine_similarity(query_vector, doc_vector) for doc_vector in tfidf_docs]
    for query_vector in tfidf_queries
]

def rank_documents(similarities):
    """Pair each document's filename with its score, best match first.

    *similarities* holds one score per document, indexed like the module-level
    ``filenames`` list. Returns ``(filename, score)`` tuples ordered by
    descending score.
    """
    indexed_scores = list(enumerate(similarities))
    indexed_scores.sort(key=lambda pair: pair[1], reverse=True)
    ranking = []
    for doc_index, score in indexed_scores:
        ranking.append((filenames[doc_index], score))
    return ranking

# Relevance labels, one per document: 1 = relevant, 0 = not relevant
labels = np.zeros(len(tfidf_docs), dtype=int)

# Pseudo-labelling: for each query, the N most similar documents count as relevant
N_per_query = max(2, int(0.1 * len(tfidf_docs)))  # top 10% of the corpus, but at least 2

for query in queries:
    query_vec = compute_tfidf_vector(tokenize(query), vocab, tokenized_docs)
    similarities_query = [cosine_similarity(query_vec, doc_vector) for doc_vector in tfidf_docs]
    # argsort is ascending, so the last N positions hold the N highest scores
    top_n_indices = np.argsort(similarities_query)[-N_per_query:]
    labels[top_n_indices] = 1

# Check label distribution
num_relevant = int(labels.sum())
num_non_relevant = len(labels) - num_relevant
print(f"Number of Relevant Documents (1): {num_relevant}")
print(f"Number of Non-Relevant Documents (0): {num_non_relevant}")

# A stratified split needs at least two samples of each class
if num_relevant < 2 or num_non_relevant < 2:
    raise ValueError("Insufficient samples in one of the classes after labeling. Adjust N or labeling strategy.")

# Feature matrix (documents x vocabulary) and target vector
X = np.array(tfidf_docs)
y = np.array(labels)

# Hold out 20% of the documents for testing; stratify keeps the class ratio
# the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# class_weight='balanced' compensates for relevant documents being rare
clf = LogisticRegression(max_iter=1000, solver='liblinear', class_weight='balanced')
clf.fit(X_train, y_train)

# Predict relevance for the held-out documents
y_pred = clf.predict(X_test)

# zero_division=0 reports 0 rather than warning when the denominator of a
# metric is zero (for example, no document was predicted relevant)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_pred)

# Print logistic regression results
print(f"Logistic Regression Precision: {precision:.4f}")
print(f"Logistic Regression Recall: {recall:.4f}")
print(f"Logistic Regression Accuracy: {accuracy:.4f}")

# Save the results (precision, recall, accuracy) to the result_Avinav.txt file
with open("result_Avinav.txt", 'w', encoding='utf-8') as file:
    file.write(f"Logistic Regression Precision: {precision:.4f}\n")
    file.write(f"Logistic Regression Recall: {recall:.4f}\n")
    file.write(f"Logistic Regression Accuracy: {accuracy:.4f}\n")

print("Results saved to result_Avinav.txt")



Queries: ['shooting']
Number of Relevant Documents (1): 3
Number of Non-Relevant Documents (0): 35
Logistic Regression Precision: 0.0000
Logistic Regression Recall: 0.0000
Logistic Regression Accuracy: 0.8750
Results saved to result_Avinav.txt
