# Libraries

In [23]:
import os
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import math
from collections import defaultdict
import pandas as pd

# Task 1: Preproccessing 

In [24]:
def preprocessing(doc):
    stop_words = set(stopwords.words('english'))
    translator = str.maketrans('', '', string.punctuation)
    words = doc.lower().translate(translator).split()
    return [word for word in words if word not in stop_words]

def preprocess_data(docList):
    return [preprocessing(doc) for doc in docList]

# Task 2: Count Lines & Count Words

In [25]:
def count_lines(file_path):
    with open(file_path, 'r') as file:
        return len(file.readlines())

def count_words(file_path):
    with open(file_path, 'r') as file:
        return len(file.read().split())

# Task 3: Generate DataFrame with stop words and their count

In [26]:
def stop_words_count_df(docList):
    stop_words = set(stopwords.words('english'))
    stop_words_count = {word: sum(doc.lower().count(word) for doc in docList) for word in stop_words}
    stop_words_df = pd.DataFrame({'Stop Word': list(stop_words_count.keys()), 'Count': list(stop_words_count.values())})
    return stop_words_df

# Task 4: Generate .txt file without stopwords

In [27]:
def remove_stop_words(docList):
    stop_words = set(stopwords.words('english'))
    for idx, doc in enumerate(docList):
        filename = f"BC190406208_exclude_stopwords_{os.path.splitext(os.path.basename(doc_names[idx]))[0]}.txt"
        cleaned_content = " ".join(word for word in preprocessing(doc) if word not in stop_words)
        output_file_path = os.path.join(directory_path, filename)  # Path for the new file
        with open(output_file_path, 'w') as output_file:
            output_file.write(cleaned_content)

# Task 5: Count of Lowercase Words

In [28]:
def lowercase_words_count(docList):
    lowercase_count = sum(sum(1 for word in preprocessing(doc) if word.islower()) for doc in docList)
    print(f"Count of lowercase words: {lowercase_count}")

# Task 6: Perform Lemmatization & generate .txt files

In [29]:
def perform_lemmatization(docList):
    lemmatizer = WordNetLemmatizer()
    for idx, doc in enumerate(docList):
        filename = f"BC190406208_lemmatized_{os.path.splitext(os.path.basename(doc_names[idx]))[0]}.txt"
        words = word_tokenize(doc)
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        lemmatized_content = ' '.join(lemmatized_words)
        output_file_path = os.path.join(directory_path, filename)  # Path for the new file
        with open(output_file_path, 'w') as output_file:
            output_file.write(lemmatized_content)

#  Task 7: Perform Stemming & generate .txt files

In [30]:
def perform_stemming(docList):
    stemmer = PorterStemmer()
    for idx, doc in enumerate(docList):
        filename = f"BC190406208_stemming_{os.path.splitext(os.path.basename(doc_names[idx]))[0]}.txt"
        words = word_tokenize(doc)
        stemmed_words = [stemmer.stem(word) for word in words]
        stemmed_content = ' '.join(stemmed_words)
        output_file_path = os.path.join(directory_path, filename)  # Path for the new file
        with open(output_file_path, 'w') as output_file:
            output_file.write(stemmed_content)

# Task 8: Read All .txt files & Creating List of Words for every document

In [31]:
def read_text_files_from_directory(directory_path):
    docList = []
    doc_names = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt') and os.path.isfile(os.path.join(directory_path, filename)):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r') as file:
                docList.append(file.read())
                doc_names.append(filename)
    return docList, doc_names

def wordList(doc):
    return words_list(doc)

# Task 9: Remove Punctuations

In [32]:
def removePuncs(wordList):
    stop_words = set(stopwords.words('english'))
    cleaned_words = [word for word in wordList if word not in stop_words]
    # Remove punctuation from each word
    cleaned_words = [''.join(char for char in word if char not in string.punctuation) for word in cleaned_words]
    cleaned_words = [word for word in cleaned_words if word.isalpha()]
    return cleaned_words

# Task 10: Term Frequency in Doc

In [33]:
def termFrequencyInDoc(wordList):
    total_words = len(wordList)
    term_freq_dict = {}
    for word in wordList:
        term_freq_dict[word] = term_freq_dict.get(word, 0) + 1
    term_frequency_dict = {word: freq / total_words for word, freq in term_freq_dict.items()}
    return term_frequency_dict

#  Task 11: Word Doc Frequency

In [34]:
def wordDocFrequency(dicList):
    word_doc_freq_dict = {}
    for doc_dict in dicList:
        words_in_doc = set(doc_dict.keys())
        for word in words_in_doc:
            word_doc_freq_dict[word] = word_doc_freq_dict.get(word, 0) + 1
    return word_doc_freq_dict

# Task 12: Inverse Doc Frequency

In [35]:
def inverseDocFrequency(dicList):
    total_docs = len(dicList)
    word_idf_dict = {}
    for word, doc_freq in dicList.items():
        idf = 1 + math.log(total_docs / (1 + doc_freq))
        word_idf_dict[word] = idf
    return word_idf_dict

# Task 13: TF-IDF

In [36]:
def tfidf(docList):
    tfidf_list = []
    dicList = [termFrequencyInDoc(removePuncs(preprocessing(doc))) for doc in docList]
    vocabulary = wordDocFrequency(dicList)
    idf = inverseDocFrequency(vocabulary)
    for doc in docList:
        words = removePuncs(preprocessing(doc))
        term_freq_dict = termFrequencyInDoc(words)
        tfidf_dict = {word: tf * idf[word] for word, tf in term_freq_dict.items()}
        tfidf_list.append(tfidf_dict)
    return tfidf_list

# Task 14: Vector Space Model

In [37]:
def vectorSpaceModel(query, tfidf_list, docList):
    query_words = removePuncs(preprocessing(query))
    query_term_freq = termFrequencyInDoc(query_words)

    query_idf = {}
    for word in query_term_freq.keys():
        if word in idf:
            query_idf[word] = idf[word]
        else:
            query_idf[word] = 1

    query_vector = {word: tf * query_idf[word] for word, tf in query_term_freq.items()}

    similarity_scores = defaultdict(float)
    for doc_idx, doc_vector in enumerate(tfidf_list):
        for word, tfidf in doc_vector.items():
            if word in query_vector:
                similarity_scores[doc_idx] += query_vector[word] * tfidf

    sorted_documents = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)

    return sorted_documents

Example Queries

In [38]:
queries = [
    "Computer Graphics",
    "natural settings",
    "natural language system"
]

Reading all text files from the current working directory

In [39]:
folder_name = "ACL_DATASET"
directory_path = os.path.join(os.getcwd(), folder_name)

Calling All Functions:

In [41]:
try:
    # Read all text files from the ACL_DATASET folder
    docList, doc_names = read_text_files_from_directory(directory_path)
except FileNotFoundError:
    print(f"Error: The folder '{folder_name}' was not found in the current directory.")
    exit()

# Task 1: Preprocessing of data
preprocessed_docList = preprocess_data(docList)

# Task 2: Count number of lines and words in each text file
line_counts = [count_lines(os.path.join(directory_path, file)) for file in doc_names]
word_counts = [count_words(os.path.join(directory_path, file)) for file in doc_names]

# Task 3: Generate DataFrame with stop words and their count
stop_words_df = stop_words_count_df(docList)

# Task 4: Generate .txt file excluding stop words
remove_stop_words(docList)

# Task 5: Print count of lowercase words in the file
lowercase_words_count(docList)

# Task 6: Perform lemmatization on each document and generate corresponding files
perform_lemmatization(docList)

# Task 7: Perform stemming on each document and generate corresponding files
perform_stemming(docList)

# Calculating TF-IDF scores for documents
tfidf_list = tfidf(docList)

# Calculating IDF scores for words in the vocabulary
vocabulary = wordDocFrequency([termFrequencyInDoc(removePuncs(preprocessing(doc))) for doc in docList])
idf = inverseDocFrequency(vocabulary)

Count of lowercase words: 179856


Retrieving relevant documents for each query

In [42]:
for query in queries:
    print(f"Query: {query}")
    top_5_documents = vectorSpaceModel(query, tfidf_list, docList)[:5]
    for rank, (doc_idx, score) in enumerate(top_5_documents):
        doc_filename = doc_names[doc_idx]
        print(f"Rank {rank+1}: {doc_filename} (Score: {score:.4f})")
        with open(os.path.join(directory_path, doc_filename), 'r') as file:
            print(file.read()[:500])  # Display the first 500 characters of the document
        print()
    print("-" * 50)

Query: Computer Graphics
Rank 1: A00-1007.pdf.txt (Score: 0.1250)
Disti l l ing dialogues - A method using natural dialogue 
dialogue systems development 
Arne  JSnsson  and  N i l s  Dah lb~ick  
Depar tment  of Computer  and  In format ion  Sc ience 
L inkSp ing  Un ivers i ty  
S-581 83, L INKOPING 
SWEDEN 
nilda@ida.liu.se, arnjo@ida.liu.se 
corpora for 
Abst ract  
We report on a method for utilising corpora col- 
lected in natural settings. It is based on distilling 
(re-writing) natural dialogues to elicit the type of 
dialogue that would occur if one t

Rank 2: A00-1046.pdf.txt (Score: 0.1171)
The Efficiency of Multimodal Interaction for a Map-based Task 
Philip COHEN, David McGEE, Josh CLOW 
Center for Human-Computer Communication 
Oregon Graduate Institute of Science & Technology 
20000 N.W. Walker Road 
Beaverton, Oregon 97006 
{ pcohen, dmcgee } @cse.ogi.edu 
Abstract 
This paper compares the efficiency of using a 
standard direct-manipulation graphical user 
interface (GUI