In [1]:
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Tokenization
text = "She is a good dancer."
tokens = word_tokenize(text)
print("Tokens:", tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Tokens: ['She', 'is', 'a', 'good', 'dancer', '.']


In [2]:
import nltk
from nltk.tokenize import sent_tokenize

# Sentence Segmentation
paragraph = "She is a good dancer. He plays the guitar well. They enjoy singing together."
sentences = sent_tokenize(paragraph)
print("Sentences:", sentences)


Sentences: ['She is a good dancer.', 'He plays the guitar well.', 'They enjoy singing together.']


In [3]:
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))

[nltk_data] Downloading package wordnet to /root/nltk_data...


rocks : rock
corpora : corpus


In [4]:
import re

# Updated regular expressions:

# Regular expression for all alphabetic strings (letters only)
regex_a = r'\b[a-zA-Z]+\b'

# Regular expression for lowercase alphabetic strings ending in a 'b'
regex_b = r'\b[a-z]+b\b'

# Regular expression for strings with three consecutive repeated words
regex_c = r'\b(\w+) \1 \1\b'

# Regular expression for strings with each 'a' immediately preceded and followed by a 'b'
regex_d = r'\b\w?ab+\w?\b'

# Regular expression for strings starting with an integer and ending with a word
regex_e = r'^\d+\s.*\b\w+$'

# Regular expression for strings containing both 'grotto' and 'raven'
regex_f = r'\bgrotto\b.*\braven\b|\braven\b.*\bgrotto\b'

# Regular expression for the first word of an English sentence
regex_g = r'^[A-Z][^.!?]*'
# Example text (from the provided draft)
text = """
Up to the 1980s, most natural language processing systems were based on complex sets of hand-written rules. Starting in the late 1980s, however, there was a revolution in natural language processing with the introduction of machine learning algorithms for language processing..
"""

# Function to find matches using a given regex pattern
def find_matches(pattern, text):
    return re.findall(pattern, text, flags=re.MULTILINE | re.IGNORECASE | re.DOTALL)

    # Test the regular expressions on the example text
matches_a = find_matches(regex_a, text)
matches_b = find_matches(regex_b, text)
matches_c = find_matches(regex_c, text)
matches_d = find_matches(regex_d, text)
matches_e = find_matches(regex_e, text)
matches_f = find_matches(regex_f, text)
matches_g = find_matches(regex_g, text)

# Print the results
print("Matches for regex_a:")
print(matches_a)

print("\nMatches for regex_b:")
print(matches_b)

Matches for regex_a:
['Up', 'to', 'the', 'most', 'natural', 'language', 'processing', 'systems', 'were', 'based', 'on', 'complex', 'sets', 'of', 'hand', 'written', 'rules', 'Starting', 'in', 'the', 'late', 'however', 'there', 'was', 'a', 'revolution', 'in', 'natural', 'language', 'processing', 'with', 'the', 'introduction', 'of', 'machine', 'learning', 'algorithms', 'for', 'language', 'processing']

Matches for regex_b:
[]


In [5]:
!pip install tokenizers python-Levenshtein
from Levenshtein import distance

# Levenshtein Distance
def levenshtein_distance(str1, str2):
    # Create a matrix to store distances
    matrix = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)]

    # Initialize the matrix
    for i in range(len(str1) + 1):
        matrix[i][0] = i
    for j in range(len(str2) + 1):
        matrix[0][j] = j

    # Fill in the matrix using dynamic programming
    for i in range(1, len(str1) + 1):
        for j in range(1, len(str2) + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            matrix[i][j] = min(
                matrix[i - 1][j] + 1,
                matrix[i][j - 1] + 1,
                matrix[i - 1][j - 1] + cost
            )

    # Return the Levenshtein distance
    return matrix[len(str1)][len(str2)]

# Example Levenshtein Distance
str1 = "NaturalLanguageProcessing"
str2 = "NaturalLanguageUnderstanding"
print(f"Levenshtein Distance between '{str1}' and '{str2}': {levenshtein_distance(str1, str2)}")

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.23.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.23.0 (from python-Levenshtein)
  Downloading Levenshtein-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.23.0->python-Levenshtein)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.23.0 python-Levenshtein-0.23.0 rapidfuzz-3.5.2
Levenshtein Distance between 'NaturalLanguageProcessing' and 'NaturalLanguageUnderstanding': 9


In [7]:
import re
from collections import defaultdict

def get_stats(vocab):
	"""
	Given a vocabulary (dictionary mapping words to frequency counts), returns a
	dictionary of tuples representing the frequency count of pairs of characters
	in the vocabulary.
	"""
	pairs = defaultdict(int)
	for word, freq in vocab.items():
		symbols = word.split()
		for i in range(len(symbols)-1):
			pairs[symbols[i],symbols[i+1]] += freq
	return pairs

def merge_vocab(pair, v_in):
	"""
	Given a pair of characters and a vocabulary, returns a new vocabulary with the
	pair of characters merged together wherever they appear.
	"""
	v_out = {}
	bigram = re.escape(' '.join(pair))
	p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
	for word in v_in:
		w_out = p.sub(''.join(pair), word)
		v_out[w_out] = v_in[word]
	return v_out

def get_vocab(data):
	"""
	Given a list of strings, returns a dictionary of words mapping to their frequency
	count in the data.
	"""
	vocab = defaultdict(int)
	for line in data:
		for word in line.split():
			vocab[' '.join(list(word)) + ' </w>'] += 1
	return vocab

def byte_pair_encoding(data, n):
	"""
	Given a list of strings and an integer n, returns a list of n merged pairs
	of characters found in the vocabulary of the input data.
	"""
	vocab = get_vocab(data)
	for i in range(n):
		pairs = get_stats(vocab)
		best = max(pairs, key=pairs.get)
		vocab = merge_vocab(best, vocab)
	return vocab

# Example usage:
corpus = '''Tokenization is the process of breaking down
a sequence of text into smaller units called tokens,
which can be words, phrases, or even individual characters.
Tokenization is often the first step in natural languages processing tasks
such as text classification, named entity recognition, and sentiment analysis.
The resulting tokens are typically used as input to further processing steps,
such as vectorization, where the tokens are converted
into numerical representations for machine learning models to use.'''
data = corpus.split('.')

n = 230
bpe_pairs = byte_pair_encoding(data, n)
bpe_pairs


{'Tokenization</w>': 2,
 'is</w>': 2,
 'the</w>': 3,
 'process</w>': 1,
 'of</w>': 2,
 'breaking</w>': 1,
 'down</w>': 1,
 'a</w>': 1,
 'sequence</w>': 1,
 'text</w>': 2,
 'into</w>': 2,
 'smaller</w>': 1,
 'units</w>': 1,
 'called</w>': 1,
 'tokens,</w>': 1,
 'which</w>': 1,
 'can</w>': 1,
 'be</w>': 1,
 'words,</w>': 1,
 'phrases,</w>': 1,
 'or</w>': 1,
 'even</w>': 1,
 'individual</w>': 1,
 'characters</w>': 1,
 'often</w>': 1,
 'first</w>': 1,
 'step</w>': 1,
 'in</w>': 1,
 'natural</w>': 1,
 'languages</w>': 1,
 'processing</w>': 2,
 'tasks</w>': 1,
 'such</w>': 2,
 'as</w>': 3,
 'classification,</w>': 1,
 'named</w>': 1,
 'entity</w>': 1,
 'recognition,</w>': 1,
 'and</w>': 1,
 'sentiment</w>': 1,
 'analysis</w>': 1,
 'The</w>': 1,
 'resulting</w>': 1,
 'tokens</w>': 2,
 'are</w>': 2,
 'typically</w>': 1,
 'used</w>': 1,
 'input</w>': 1,
 'to</w>': 2,
 'further</w>': 1,
 'steps,</w>': 1,
 'vectorization,</w>': 1,
 'where</w>': 1,
 'converted</w>': 1,
 'numerical</w>': 1,
 'repres

In [15]:
!pip install nltk

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Sample text
text = "Natural language processing (NLP) is a branch of artificial intelligence (AI) that enables computers to comprehend, generate, and manipulate human language."

# Tokenize the text into words
tokens = word_tokenize(text)

# Part of Speech Tagging
pos_tags = pos_tag(tokens)
print("Part of Speech Tags:", pos_tags)


Part of Speech Tags: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('branch', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('(', '('), ('AI', 'NNP'), (')', ')'), ('that', 'IN'), ('enables', 'VBZ'), ('computers', 'NNS'), ('to', 'TO'), ('comprehend', 'VB'), (',', ','), ('generate', 'NN'), (',', ','), ('and', 'CC'), ('manipulate', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.util import ngrams

# Sample text for the language model
text = "This is a sample text for building n-gram language models. We'll use NLTK for this demonstration."

# Tokenize the text into words
tokens = word_tokenize(text.lower())  # Convert text to lowercase for simplicity
token_count = len(tokens)

# Create unigrams, bigrams, and trigrams
unigrams = ngrams(tokens, 1)
bigrams = ngrams(tokens, 2)
trigrams = ngrams(tokens, 3)

# Frequency distribution of n-grams
unigram_freq = FreqDist(unigrams)
bigram_freq = FreqDist(bigrams)
trigram_freq = FreqDist(trigrams)

# Display results
print("Unigrams:")
print(unigram_freq.most_common())

print("\nBigrams:")
print(bigram_freq.most_common())

print("\nTrigrams:")
print(trigram_freq.most_common())


Unigrams:
[(('this',), 2), (('for',), 2), (('.',), 2), (('is',), 1), (('a',), 1), (('sample',), 1), (('text',), 1), (('building',), 1), (('n-gram',), 1), (('language',), 1), (('models',), 1), (('we',), 1), (("'ll",), 1), (('use',), 1), (('nltk',), 1), (('demonstration',), 1)]

Bigrams:
[(('this', 'is'), 1), (('is', 'a'), 1), (('a', 'sample'), 1), (('sample', 'text'), 1), (('text', 'for'), 1), (('for', 'building'), 1), (('building', 'n-gram'), 1), (('n-gram', 'language'), 1), (('language', 'models'), 1), (('models', '.'), 1), (('.', 'we'), 1), (('we', "'ll"), 1), (("'ll", 'use'), 1), (('use', 'nltk'), 1), (('nltk', 'for'), 1), (('for', 'this'), 1), (('this', 'demonstration'), 1), (('demonstration', '.'), 1)]

Trigrams:
[(('this', 'is', 'a'), 1), (('is', 'a', 'sample'), 1), (('a', 'sample', 'text'), 1), (('sample', 'text', 'for'), 1), (('text', 'for', 'building'), 1), (('for', 'building', 'n-gram'), 1), (('building', 'n-gram', 'language'), 1), (('n-gram', 'language', 'models'), 1), (('la

In [18]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
import math

# Sample text for the language model
text = "This is a sample text for building n-gram language models. We'll use NLTK for this demonstration."

# Tokenize the text into words
tokens = word_tokenize(text.lower())  # Convert text to lowercase for simplicity

# Create n-grams
n = 3  # You can change 'n' to calculate perplexity for different n-gram models
ngrams_list = list(ngrams(tokens, n))

# Calculate frequencies of n-grams
ngrams_freq = Counter(ngrams_list)

# Total number of n-grams
total_ngrams = len(ngrams_list)

# Calculate perplexity
perplexity = math.pow(2, -sum(math.log2(ngrams_freq[gram] / (total_ngrams - (n - 1))) for gram in ngrams_list) / total_ngrams)
print(f"Perplexity of {n}-gram model:", perplexity)


Perplexity of 3-gram model: 14.999999999999993
