In [1]:
import glob
import gzip
import re

doc_pattern = re.compile(r'<DOC>(.*?)</DOC>', re.DOTALL)
docno_pattern = re.compile(r'<DOCNO>\s*(.*?)\s*</DOCNO>')
head_pattern = re.compile(r'<HEAD>\s*(.*?)\s*</HEAD>')
text_pattern = re.compile(r'<TEXT>\s*(.*?)\s*</TEXT>', re.DOTALL)

def get_documents() :
    documents = []
    # Get a list of all .gz files in the "Ap" directory
    file_list = glob.glob('TREC AP 88-90/TREC AP 88-90/collection de documents/AP/*.gz') # start from ../src

    # Loop over the list of files
    for filename in file_list:
        
        # Open the .gz file
        with gzip.open(filename, 'rt', encoding='latin1') as file:  # 'rt' mode for text reading
            # Read the contents of the file
            content = file.read()
            for doc in doc_pattern.finditer(content):
                doc_content = doc.group(1)

                # Extracting individual elements
                doc_id = docno_pattern.search(doc_content).group(1)
                head = head_pattern.search(doc_content)
                text = text_pattern.search(doc_content)
                
                documents.append([doc_id, head.group(1) if head else '',text.group(1) if text else ''])

    return documents

documents = get_documents()
print(documents[0]) 

['AP880212-0001', 'Reports Former Saigon Officials Released from Re-education Camp', "More than 150 former officers of the\noverthrown South Vietnamese government have been released from a\nre-education camp after 13 years of detention, the official Vietnam\nNews Agency reported Saturday.\n   The report from Hanoi, monitored in Bangkok, did not give\nspecific figures, but said those freed Friday included an\nex-Cabinet minister, a deputy minister, 10 generals, 115\nfield-grade officers and 25 chaplains.\n   It quoted Col. Luu Van Ham, director of the Nam Ha camp south of\nHanoi, as saying all 700 former South Vietnamese officials who had\nbeen held at the camp now have been released.\n   They were among 1,014 South Vietnamese who were to be released\nfrom re-education camps under an amnesty announced by the Communist\ngovernment to mark Tet, the lunar new year that begins Feb. 17.\n   The Vietnam News Agency report said many foreign journalists and\na delegation from the Australia-Viet

## co_occurrences analysis

In [2]:
import nltk
from nltk import bigrams
from collections import Counter, defaultdict

# tokenise documents
words = [nltk.word_tokenize(doc[2]) for doc in documents]

# Define window size for co-occurrence
window_size = 2

# Create co-occurrence matrix
co_occurrences = defaultdict(Counter)

In [4]:
for i in range(len(words)):
    for j in range(i+1, min(i+1+window_size, len(words))):
        w1, w2 = sorted([tuple(words[i]), tuple(words[j])])  # Convert lists to tuples and sort
        if w1 != w2:  # Avoid self co-occurrences
            co_occurrences[w1][w2] += 1

In [None]:
# Print co-occurrence counts
for w1, counts in co_occurrences.items():
    for w2, count in counts.items():
        if count > 1:
            print(f"Co-occurrence for ({w1}, {w2}): {count}")

##  Mutual Information analysis

In [2]:
!python3 -m pip install numpy scikit-learn



In [5]:
print(len(documents))

242918


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mutual_info_score
import numpy as np

# Vectorize the corpus
# Convert list of lists to list of strings
documents = [' '.join(doc) for doc in documents]

# Vectorize the corpus
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)  # Keep as sparse matrix

# Get feature names
features = vectorizer.get_feature_names_out()

# Calculate MI for each pair of features (words)
n_features = len(features)
MI_matrix = np.zeros((n_features, n_features))
for i in range(n_features):
    for j in range(n_features):
        MI_matrix[i, j] = mutual_info_score(X[:, i].toarray().ravel(), X[:, j].toarray().ravel())

# Print MI values
print("Mutual Information Matrix:")
print(MI_matrix)

ValueError: empty vocabulary; perhaps the documents only contain stop words

## Relevant info 

In [8]:
!python3 -m pip install chardet
!python3 -m pip install nltk

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0


In [10]:
import chardet
file_path = 'AP/AP880213'
with open(file_path, 'rb') as file:
    rawdata = file.read()
    result = chardet.detect(rawdata)
    encoding = result['encoding']

with open(file_path, 'r', encoding=encoding) as file:
    content = file.read()

ascii


In [20]:
import os
from collections import Counter
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))  # Use 'english' or another language

# Path to the TREC AP 88-90 dataset directory
dataset_path = 'AP/'

# Initialize variables to store data
doc_count = 0
word_count = 0
word_freq = Counter()

# Iterate over the files in the dataset directory
for filename in os.listdir(dataset_path):
    file_path = os.path.join(dataset_path, filename)
    with open(file_path, 'rb') as file:
        rawdata = file.read()
        result = chardet.detect(rawdata)
        encoding = result['encoding']

    with open(file_path, 'r', encoding=encoding,  errors='ignore') as file:
        content = file.read()
        filtered_words = [word for word in word_tokenize(content) if word.lower() not in stop_words]
        word_freq.update(filtered_words)
       

# Calculate additional statistics
# unique_word_count = len(word_freq)
# average_words_per_doc = word_count / doc_count if doc_count else 0

# Print the analysis
# print(f"Number of documents: {doc_count}")
# print(f"Total word count: {word_count}")
# print(f"Unique word count: {unique_word_count}")
# print(f"Average words per document: {average_words_per_doc:.2f}")
print(f"Top 10 most common words: {word_freq.most_common(10)}")


[nltk_data] Downloading package stopwords to /home/frank/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/frank/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 10 most common words: [(',', 6508223), ('.', 5188065), ('<', 4915756), ('>', 4915756), ('said', 1446450), ('``', 1376639), ("''", 1349608), ("'s", 984940), ('$', 415036), (')', 364081)]


In [18]:
print(type(stop_words))
stop_words.update([',','.','<','>','``',"''","'s",'$',')','(',])

<class 'set'>
