# Step 1: Environment Setup
In this step, we import the necessary Python libraries for our analysis.
We import the pandas library for data manipulation, TfidfVectorizer from sklearn for creating the Document-Term Matrix (DTM), and numpy for numerical operations.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Step 2: Data Preparation
Here, we load the pre-processed patient notes from `processed_patient_notes.csv` into a dataframe (df). Its `processed_pn_history` column holds the text that will serve as the basis for our DTM creation.

In [None]:

# Read the pre-processed patient notes, then keep a handle on the text column
notes_csv = 'processed_patient_notes.csv'
df = pd.read_csv(notes_csv)
processed_notes = df['processed_pn_history']

# Step 3: Document-Term Matrix (DTM) Creation
In this step, we configure the TfidfVectorizer with appropriate hyperparameters and create the DTM.
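We use TfidfVectorizer to convert the text data into a DTM, considering n-grams of size 1 to 2. To keep the matrix manageable, the vocabulary is capped at 45,000 features, and terms that appear in fewer than 5 documents or in more than 50% of the documents are dropped.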

In [None]:
# TfidfVectorizer settings, gathered in one place; each one trims the vocabulary
tfidf_settings = {
    'ngram_range': (1, 2),  # Unigrams and bigrams
    'max_features': 45000,  # Limit the number of features
    'min_df': 5,  # Terms must appear in at least 5 documents
    'max_df': 0.5,  # Terms must not appear in more than 50% of the documents
}
vectorizer = TfidfVectorizer(**tfidf_settings)
# Learn the vocabulary from the notes and build the TF-IDF weighted DTM
dtm_df = vectorizer.fit_transform(df['processed_pn_history'])

# Avoid converting the entire DTM to a dense array; work with it sparsely
# If you need to display or analyze the DTM, consider using sparse matrix operations or analyzing subsets



# Step 4: Describe the DTM
Now, let's describe the DTM in terms of its size and memory usage.

DTM Shape: (42146, 45000)  # Number of documents x Number of terms

Memory Usage: 72313488 bytes  # Memory usage in bytes
We display the shape of the DTM, which tells us the number of documents and terms in the matrix. Additionally, we calculate and display the memory usage of the DTM.

In [None]:
# Helper: approximate the in-memory size of a CSR matrix
def estimate_csr_memory_usage(csr_matrix):
    """Return the combined byte size of the matrix's data, indptr and indices arrays."""
    backing_arrays = (csr_matrix.data, csr_matrix.indptr, csr_matrix.indices)
    return sum(buffer.nbytes for buffer in backing_arrays)

# Report the DTM's dimensions (documents x terms) and its estimated sparse footprint
dtm_shape = dtm_df.shape
print("DTM Shape:", dtm_shape)
memory_usage_bytes = estimate_csr_memory_usage(dtm_df)
print("Memory Usage:", memory_usage_bytes, "bytes")


DTM Shape: (42146, 45000)
Memory Usage: 72313488 bytes


# Step 5: Display Top Terms by Frequency
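In this step, we find and display the top terms in the DTM. Because the DTM holds TF-IDF weights rather than raw counts, the "frequency" reported for each term is its TF-IDF weight summed over all documents.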

In [None]:
# Sum each term's TF-IDF weight across all documents. The sum is taken on the
# sparse matrix itself: calling .toarray() first would allocate a dense
# documents x terms array, which for a large DTM can exhaust memory.
# np.asarray(...).ravel() flattens the 1 x n_terms result into a plain 1-D array.
term_frequencies = np.asarray(dtm_df.sum(axis=0)).ravel()
# Create a DataFrame pairing each term with its summed weight
terms_df = pd.DataFrame({'term': vectorizer.get_feature_names_out(), 'frequency': term_frequencies})
# Sort by summed weight (descending) and keep the ten highest-ranked terms
top_terms = terms_df.sort_values(by='frequency', ascending=False).head(10)
print("\nTop Terms by Frequency:")
print(top_terms)


Top Terms by Frequency:
          term   frequency
21758     last  907.408518
24594    month  905.346355
13052  episode  867.432639
29698  patient  835.917855
43849     work  820.696479
29938   period  785.172123
14464     feel  767.219982
33520   report  700.880291
31913       pt  679.312567
29572     past  670.249942


# Step 6: Display a Sample of the Document-Term Matrix
We can convert a small slice of the sparse DTM (its first few documents and terms) into a dense DataFrame to understand its structure better.

In [None]:
from scipy.sparse import csr_matrix
import pandas as pd

# Assuming 'dtm_df' is your Document-Term Matrix in sparse format (csr_matrix)
# and 'vectorizer' is an instance of TfidfVectorizer

# Function to extract a small, dense preview of the DTM
def display_sparse_matrix_sample(sparse_matrix, vectorizer, num_docs=5, num_terms=10):
    """
    Returns the top-left corner of the DTM as a dense DataFrame.

    Only the requested slice is densified, so this is safe to call on a
    large sparse matrix. Nothing is printed; the caller decides how to
    display the returned DataFrame.

    Parameters:
    - sparse_matrix: The sparse DTM (csr_matrix).
    - vectorizer: The TfidfVectorizer instance used to generate the DTM.
    - num_docs: Number of documents to include (rows).
    - num_terms: Number of terms to include (columns).

    Returns:
    - A DataFrame with one row per document (labelled "Doc_<i>") and one
      column per term. Requested sizes are clamped to the range
      [0, actual size], so oversized or negative requests do not fail.
    """
    total_docs, total_terms = sparse_matrix.shape
    # Clamp on both ends: without the lower bound a negative count would make
    # the label slices below disagree with the selected data.
    num_docs = max(0, min(num_docs, total_docs))
    num_terms = max(0, min(num_terms, total_terms))

    feature_names = vectorizer.get_feature_names_out()

    # A single slice selects the first rows and columns; only this small
    # corner is converted to a dense array.
    dense_subset = sparse_matrix[:num_docs, :num_terms].toarray()
    df_subset = pd.DataFrame(
        dense_subset,
        columns=feature_names[:num_terms],
        index=[f"Doc_{i}" for i in range(num_docs)],
    )

    return df_subset

# Build a 5-document x 10-term dense preview of the DTM and print it
dtm_df_sample = display_sparse_matrix_sample(
    dtm_df,
    vectorizer,
    num_docs=5,
    num_terms=10,
)
print("\nDocument-Term Matrix (Sample):")
print(dtm_df_sample)


def summarize_sparse_matrix(sparse_matrix):
    """
    Prints the dimensions, non-zero count and density of a sparse matrix.

    Parameters:
    - sparse_matrix: A 2-D sparse matrix exposing `shape` and `nnz`.

    Density is the number of stored non-zero elements divided by the total
    number of cells. A matrix with no rows or no columns has no cells, so
    its density is reported as 0 instead of raising ZeroDivisionError.
    """
    # Basic dimensions
    num_documents, num_features = sparse_matrix.shape
    # Number of non-zero elements
    non_zeros = sparse_matrix.nnz
    # Density of the matrix (guarded against an empty matrix)
    total_cells = num_documents * num_features
    density = non_zeros / total_cells if total_cells else 0.0

    print(f"Matrix Dimensions: {num_documents} documents x {num_features} features")
    print(f"Number of Non-Zero Elements: {non_zeros}")
    print(f"Density: {density:.4f}")

# Summarize the full DTM: prints its dimensions, non-zero count and density
summarize_sparse_matrix(dtm_df)




Document-Term Matrix (Sample):
        00   02  02 03   03  03 04  03 10   04  04 05   05  05 06
Doc_0  0.0  0.0    0.0  0.0    0.0    0.0  0.0    0.0  0.0    0.0
Doc_1  0.0  0.0    0.0  0.0    0.0    0.0  0.0    0.0  0.0    0.0
Doc_2  0.0  0.0    0.0  0.0    0.0    0.0  0.0    0.0  0.0    0.0
Doc_3  0.0  0.0    0.0  0.0    0.0    0.0  0.0    0.0  0.0    0.0
Doc_4  0.0  0.0    0.0  0.0    0.0    0.0  0.0    0.0  0.0    0.0
Matrix Dimensions: 42146 documents x 45000 features
Number of Non-Zero Elements: 6012075
Density: 0.0032
