In [1]:
import time
import numpy as np
from numba import jit
from joblib import Parallel, delayed
from functools import lru_cache

# Sample text data
text_data = "This is a sample text for text processing benchmarking."

# Task 1: Text Tokenization
def tokenize(text):
    return text.split()

@jit
def tokenize_numba(text):
    return text.split()

def tokenize_joblib(text):
    return Parallel(n_jobs=-1)(delayed(tokenize)(word) for word in text.split())

@lru_cache(maxsize=None)
def tokenize_memoized(text):
    return text.split()

def tokenize_standard(text):
    return text.split()

# Task 2: Text Normalization
def normalize(text):
    return text.lower()

@jit
def normalize_numba(text):
    return text.lower()

def normalize_joblib(text):
    return Parallel(n_jobs=-1)(delayed(normalize)(word) for word in text.split())

@lru_cache(maxsize=None)
def normalize_memoized(text):
    return text.lower()

def normalize_standard(text):
    return text.lower()

# Task 3: Feature Extraction
def extract_features(text):
    return [len(word) for word in text.split()]

@jit
def extract_features_numba(text):
    return [len(word) for word in text.split()]

def extract_features_joblib(text):
    return Parallel(n_jobs=-1)(delayed(len)(word) for word in text.split())

@lru_cache(maxsize=None)
def extract_features_memoized(text):
    return [len(word) for word in text.split()]

def extract_features_standard(text):
    return [len(word) for word in text.split()]

# Measure execution time for each technique on each task
tasks = {
    "Tokenization": (tokenize, tokenize_numba, tokenize_joblib, tokenize_memoized, tokenize_standard),
    "Normalization": (normalize, normalize_numba, normalize_joblib, normalize_memoized, normalize_standard),
    "Feature Extraction": (extract_features, extract_features_numba, extract_features_joblib, extract_features_memoized, extract_features_standard)
}

for task, functions in tasks.items():
    print(f"Task: {task}")
    for func in functions:
        start_time = time.time()
        result = func(text_data)
        execution_time = time.time() - start_time
        print(f"{func.__name__}: {execution_time:.6f} seconds")
    print()


Task: Tokenization
tokenize: 0.000001 seconds
tokenize_numba: 2.502573 seconds
tokenize_joblib: 0.212139 seconds
tokenize_memoized: 0.000010 seconds
tokenize_standard: 0.000003 seconds

Task: Normalization
normalize: 0.000003 seconds
normalize_numba: 1.953045 seconds
normalize_joblib: 0.011228 seconds
normalize_memoized: 0.000005 seconds
normalize_standard: 0.000001 seconds

Task: Feature Extraction
extract_features: 0.000004 seconds
extract_features_numba: 0.980209 seconds
extract_features_joblib: 0.011143 seconds
extract_features_memoized: 0.000006 seconds
extract_features_standard: 0.000003 seconds



In [11]:
import time
from joblib import Parallel, delayed
from functools import lru_cache
from numba import jit

# Task 1: Text Tokenization
def tokenize(text):
    return text.split()

def tokenize_joblib(text):
    return Parallel(n_jobs=-1)(delayed(tokenize)(word) for word in text.split())

@lru_cache(maxsize=None)
def tokenize_memoized(text):
    return text.split()

@jit
def tokenize_numba(text):
    return text.split()

def tokenize_standard(text):
    return text.split()

# Task 2: Text Normalization
def normalize(text):
    return text.lower()

def normalize_joblib(text):
    return Parallel(n_jobs=-1)(delayed(normalize)(word) for word in text.split())

@lru_cache(maxsize=None)
def normalize_memoized(text):
    return text.lower()

@jit
def normalize_numba(text):
    return text.lower()

def normalize_standard(text):
    return text.lower()

# Task 3: Feature Extraction
def extract_features(text):
    return [len(word) for word in text.split()]

def extract_features_joblib(text):
    return Parallel(n_jobs=-1)(delayed(len)(word) for word in text.split())

@lru_cache(maxsize=None)
def extract_features_memoized(text):
    return [len(word) for word in text.split()]

@jit
def extract_features_numba(text):
    return [len(word) for word in text.split()]

def extract_features_standard(text):
    return [len(word) for word in text.split()]

# Read text from file
def read_text_from_file(filename):
    with open(filename, 'r') as file:
        return file.read()

# Measure execution time for each technique on each task
tasks = {
    "Tokenization": (tokenize_joblib, tokenize_memoized, tokenize_numba, tokenize_standard),
    "Normalization": (normalize_joblib, normalize_memoized, normalize_numba, normalize_standard),
    "Feature Extraction": (extract_features_joblib, extract_features_memoized, extract_features_numba, extract_features_standard)
}

# Filename of the text document
filename = "/home/jas/Documents/example txt"

# Read text from file
text_data = read_text_from_file(filename)

# Measure and print execution times for each task and each technique
for task, functions in tasks.items():
    print(f"Task: {task}")
    for func in functions:
        start_time = time.time()
        result = func(text_data)
        execution_time = time.time() - start_time
        print(f"{func.__name__}: {execution_time:.6f} seconds")
    print()


Task: Tokenization
tokenize_joblib: 0.082089 seconds
tokenize_memoized: 0.000886 seconds
tokenize_numba: 0.955818 seconds
tokenize_standard: 0.000385 seconds

Task: Normalization
normalize_joblib: 0.068397 seconds
normalize_memoized: 0.000218 seconds
normalize_numba: 0.744705 seconds
normalize_standard: 0.000072 seconds

Task: Feature Extraction
extract_features_joblib: 0.061974 seconds
extract_features_memoized: 0.000749 seconds
extract_features_numba: 1.073678 seconds
extract_features_standard: 0.000670 seconds



In [16]:
import time
from joblib import Parallel, delayed
from functools import lru_cache
from numba import jit

# Task 1: Text Tokenization
def tokenize(text):
    return text.split()

def tokenize_joblib(text):
    return Parallel(n_jobs=-1)(delayed(tokenize)(word) for word in text.split())

@lru_cache(maxsize=None)
def tokenize_memoized(text):
    return text.split()

@jit
def tokenize_numba(text):
    return text.split()

def tokenize_standard(text):
    return text.split()

# Task 2: Text Normalization
def normalize(text):
    return text.lower()

def normalize_joblib(text):
    return Parallel(n_jobs=-1)(delayed(normalize)(word) for word in text.split())

@lru_cache(maxsize=None)
def normalize_memoized(text):
    return text.lower()

@jit
def normalize_numba(text):
    return text.lower()

def normalize_standard(text):
    return text.lower()

# Task 3: Feature Extraction
def extract_features(text):
    return [len(word) for word in text.split()]

def extract_features_joblib(text):
    return Parallel(n_jobs=-1)(delayed(len)(word) for word in text.split())

@lru_cache(maxsize=None)
def extract_features_memoized(text):
    return [len(word) for word in text.split()]

@jit
def extract_features_numba(text):
    return [len(word) for word in text.split()]

def extract_features_standard(text):
    return [len(word) for word in text.split()]

# Additional functionalities for each task

# Task 1: Text Tokenization
def tokenize_with_index(text):
    return [(i, word) for i, word in enumerate(text.split())]

# Task 2: Text Normalization
def normalize_with_index(text):
    return [(i, normalize(word)) for i, word in enumerate(text.split())]

# Task 3: Feature Extraction
def extract_features_with_index(text):
    return [(i, len(word)) for i, word in enumerate(text.split())]

# Read text from file
def read_text_from_file(filename):
    with open(filename, 'r') as file:
        return file.read()

# Measure execution time for each technique on each task
tasks = {
    "Tokenization": (tokenize_joblib, tokenize_memoized, tokenize_numba, tokenize_standard, tokenize_with_index),
    "Normalization": (normalize_joblib, normalize_memoized, normalize_numba, normalize_standard, normalize_with_index),
    "Feature Extraction": (extract_features_joblib, extract_features_memoized, extract_features_numba, extract_features_standard, extract_features_with_index)
}

# Filename of the text document
filename = "/home/jas/Documents/example txt"

# Read text from file
text_data = read_text_from_file(filename)

# Measure and print execution times for each task and each technique
for task, functions in tasks.items():
    print(f"Task: {task}")
    for func in functions:
        start_time = time.time()
        result = func(text_data)
        execution_time = time.time() - start_time
        print(f"{func.__name__}: {execution_time:.6f} seconds")
    print()




Task: Tokenization
tokenize_joblib: 0.078413 seconds
tokenize_memoized: 0.000624 seconds
tokenize_numba: 0.960065 seconds
tokenize_standard: 0.000260 seconds
tokenize_with_index: 0.000888 seconds

Task: Normalization
normalize_joblib: 0.074537 seconds
normalize_memoized: 0.000215 seconds
normalize_numba: 0.732997 seconds
normalize_standard: 0.000073 seconds
normalize_with_index: 0.002141 seconds

Task: Feature Extraction
extract_features_joblib: 0.065248 seconds
extract_features_memoized: 0.000611 seconds
extract_features_numba: 0.960474 seconds
extract_features_standard: 0.000547 seconds
extract_features_with_index: 0.000944 seconds



In [24]:
import time
import numpy as np
from numba import jit
import spacy

# Sample text document
text = "Apple Inc. is an American multinational technology company headquartered in Cupertino, California."

# Standard Python Tokenization
def tokenize_python(text):
    return text.split()

# Numpy Tokenization
def tokenize_numpy(text):
    return np.array(text.split())

# Numba Tokenization
@jit(nopython=True)
def tokenize_numba(text):
    return text.split()

# Standard Python Vectorization (Bag-of-Words)
def vectorize_python(tokens):
    word_dict = {}
    for token in tokens:
        word_dict[token] = word_dict.get(token, 0) + 1
    return word_dict

# Numpy Vectorization (Bag-of-Words)
def vectorize_numpy(tokens):
    unique_tokens, counts = np.unique(tokens, return_counts=True)
    return dict(zip(unique_tokens, counts))

# Standard Python Named Entity Recognition (NER)
def ner_python(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities


# Measure execution time for each approach
start_time = time.time()
tokens_py = tokenize_python(text)
print("Standard Python Tokenization:", tokens_py)
print("Time taken:", time.time() - start_time)

start_time = time.time()
tokens_np = tokenize_numpy(text)
print("Numpy Tokenization:", tokens_np)
print("Time taken:", time.time() - start_time)

start_time = time.time()
tokens_nb = tokenize_numba(text)
print("Numba Tokenization:", tokens_nb)
print("Time taken:", time.time() - start_time)

start_time = time.time()
bow_py = vectorize_python(tokens_py)
print("Standard Python Bag-of-Words:", bow_py)
print("Time taken:", time.time() - start_time)

start_time = time.time()
bow_np = vectorize_numpy(tokens_np)
print("Numpy Bag-of-Words:", bow_np)
print("Time taken:", time.time() - start_time)

start_time = time.time()
entities_py = ner_python(text)
print("Standard Python Named Entity Recognition:", entities_py)
print("Time taken:", time.time() - start_time)



Standard Python Tokenization: ['Apple', 'Inc.', 'is', 'an', 'American', 'multinational', 'technology', 'company', 'headquartered', 'in', 'Cupertino,', 'California.']
Time taken: 0.00011801719665527344
Numpy Tokenization: ['Apple' 'Inc.' 'is' 'an' 'American' 'multinational' 'technology'
 'company' 'headquartered' 'in' 'Cupertino,' 'California.']
Time taken: 0.0001537799835205078
Numba Tokenization: ['Apple', 'Inc.', 'is', 'an', 'American', 'multinational', 'technology', 'company', 'headquartered', 'in', 'Cupertino,', 'California.']
Time taken: 0.9556922912597656
Standard Python Bag-of-Words: {'Apple': 1, 'Inc.': 1, 'is': 1, 'an': 1, 'American': 1, 'multinational': 1, 'technology': 1, 'company': 1, 'headquartered': 1, 'in': 1, 'Cupertino,': 1, 'California.': 1}
Time taken: 9.989738464355469e-05
Numpy Bag-of-Words: {'American': 1, 'Apple': 1, 'California.': 1, 'Cupertino,': 1, 'Inc.': 1, 'an': 1, 'company': 1, 'headquartered': 1, 'in': 1, 'is': 1, 'multinational': 1, 'technology': 1}
Time

In [56]:
import time
import numpy as np
from gensim.models import Word2Vec
from joblib import Parallel, delayed
import joblib
from functools import lru_cache

# Function to read a large text file
def read_large_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        corpus = file.readlines()
    return corpus

# Load the large text file as the corpus
large_corpus_file = "/home/jas/Documents/example txt"

corpus = read_large_text_file(large_corpus_file)

# Function to train Word2Vec model (mock implementation)
def train_word2vec(corpus):
    sentences = [sentence.split() for sentence in corpus]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    return model

# Standard Python training of Word2Vec model
def train_word2vec_python(corpus):
    return train_word2vec(corpus)

# Numpy training of Word2Vec model
def train_word2vec_numpy(corpus):
    return train_word2vec(corpus)

# Memoized training of Word2Vec model
@lru_cache(maxsize=None)
def train_word2vec_memoized(corpus):
    return train_word2vec(corpus)

# Joblib training of Word2Vec model
def train_word2vec_joblib(corpus):
    return train_word2vec(corpus)

# Measure execution time for each approach
start_time = time.time()
model_py = train_word2vec_python(corpus)
print("Standard Python Word2Vec Training:")
print("Time taken:", time.time() - start_time)

start_time = time.time()
model_np = train_word2vec_numpy(corpus)
print("Numpy Word2Vec Training:")
print("Time taken:", time.time() - start_time)

start_time = time.time()
model_mm = train_word2vec_memoized(tuple(corpus))
print("Memoized Word2Vec Training:")
print("Time taken:", time.time() - start_time)

start_time = time.time()
model_joblib = Parallel(n_jobs=-1)(delayed(train_word2vec_joblib)(corpus) for _ in range(10))
print("Joblib Word2Vec Training:")
print("Time taken:", time.time() - start_time)



Standard Python Word2Vec Training:
Time taken: 0.012315988540649414
Numpy Word2Vec Training:
Time taken: 0.010944843292236328
Memoized Word2Vec Training:
Time taken: 0.009955406188964844
Joblib Word2Vec Training:
Time taken: 0.17759943008422852


In [86]:
import time
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from functools import lru_cache
from joblib import Parallel, delayed
import joblib

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load stop words
stop_words = set(stopwords.words('english'))

# Function to read a large text file
def read_large_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Load the large text file
large_text_file = "/home/jas/Documents/example txt"
text = read_large_text_file(large_text_file)

# Function to preprocess text
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Remove stop words and punctuation
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)

# Preprocess text
preprocessed_text = preprocess_text(text)

# Task: Cosine Similarity Calculation
def cosine_similarity(documents):
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    # Calculate TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(documents)
    # Compute cosine similarity matrix
    similarity_matrix = np.dot(tfidf_matrix, tfidf_matrix.T)
    return similarity_matrix

# Standard Python implementation
start_time = time.time()
similarity_py = cosine_similarity([preprocessed_text])
print("Standard Python Cosine Similarity Calculation:")
print("Time taken:", time.time() - start_time)

# Numpy implementation
def calculate_tfidf_numpy(documents):
    # Tokenize documents
    tokens = [word_tokenize(doc) for doc in documents]
    # Create vocabulary
    vocabulary = set()
    for token_list in tokens:
        vocabulary.update(token_list)
    vocabulary = sorted(vocabulary)
    # Compute term frequency
    tf_matrix = np.zeros((len(documents), len(vocabulary)))
    for i, token_list in enumerate(tokens):
        for token in token_list:
            if token in vocabulary:
                tf_matrix[i, vocabulary.index(token)] += 1
    # Compute inverse document frequency
    df = np.sum(tf_matrix > 0, axis=0)
    idf = np.log(tf_matrix.shape[0] / df)
    # Compute TF-IDF matrix
    tfidf_matrix = tf_matrix * idf
    # Normalize TF-IDF matrix
    tfidf_norm = np.linalg.norm(tfidf_matrix, axis=1, keepdims=True)
    tfidf_matrix /= tfidf_norm
    return tfidf_matrix

start_time = time.time()
tfidf_matrix_np = calculate_tfidf_numpy([preprocessed_text])
similarity_np = np.dot(tfidf_matrix_np, tfidf_matrix_np.T)
print("Optimized Numpy Cosine Similarity Calculation:")
print("Time taken:", time.time() - start_time)

# Modified cosine similarity function with memoization
@lru_cache(maxsize=None)
def cosine_similarity_memoized(documents):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    similarity_matrix = np.dot(tfidf_matrix, tfidf_matrix.T).toarray()
    return similarity_matrix

# Test and measure the performance of the memoized version
start_time = time.time()
similarity_memoized = cosine_similarity_memoized(tuple([preprocessed_text]))  # Convert list to tuple for hashability
print("Memoized Cosine Similarity Calculation:")
print("Time taken:", time.time() - start_time)

# Preprocess text outside parallelized function
preprocessed_text = preprocess_text(text)

# Function for cosine similarity calculation without vectorization
from scipy.sparse import vstack

def calculate_tfidf_matrix(text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text])
    tfidf_matrix_dense = tfidf_matrix.toarray()  # Convert sparse matrix to dense array
    return tfidf_matrix_dense.reshape(1, -1)  # Reshape to ensure 2-D array

# Joblib parallelization for TF-IDF matrix calculation
start_time = time.time()
tfidf_matrices = Parallel(n_jobs=-1)(delayed(calculate_tfidf_matrix)(preprocessed_text) for _ in range(10))

# Combine TF-IDF matrices
combined_tfidf_matrix = np.concatenate(tfidf_matrices)

# Reshape the combined matrix to 2-D
combined_tfidf_matrix = combined_tfidf_matrix.reshape(len(tfidf_matrices), -1)

# Compute cosine similarity matrix
similarity_matrix_joblib = np.dot(combined_tfidf_matrix, combined_tfidf_matrix.T)
print("Joblib Optimized Cosine Similarity Calculation:")
print("Time taken:", time.time() - start_time)




Standard Python Cosine Similarity Calculation:
Time taken: 0.0021979808807373047
Optimized Numpy Cosine Similarity Calculation:
Time taken: 0.001977205276489258
Memoized Cosine Similarity Calculation:
Time taken: 0.0020465850830078125
Joblib Optimized Cosine Similarity Calculation:
Time taken: 0.011260271072387695


[nltk_data] Downloading package punkt to /home/jas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  tfidf_matrix /= tfidf_norm
