In [1]:
#Read in the extracted brown files
import glob

tagged_files = glob.glob("_extracted_brown/*.txt") #Read in the files and creates a list
print(type(tagged_files))
print(len(tagged_files)) #Should be 500

<class 'list'>
500


In [2]:
'''
Make the files into a list of a list of tuples
The tuple contains a str(word) and a set(tag(s)) 
Tag(s) because some words in the file contain more than one tag
'''
#I got help from the website where we download the extarcted brown text files
#https://kristopherkyle.github.io/Corpus-Linguistics-Working-Group/pos_tagging_1.html

#divide into sentences
full_data: list = []
for file in tagged_files:
    with open(file, 'r') as x:
        text = x.read().split("\n\n")
        for sent in text:
            sentence = []
            for word_line in sent.split("\n"):
                #Strip leading/trailing whitespace
                word_line = word_line.strip()
                
                #Skip empty lines
                if not word_line:
                    continue
                    
                # Check if split will work
                parts = word_line.split(" ", 1)
                if len(parts) != 2:
                    continue
                
                #Continue getting the word and tag(s)
                word_, pos = parts
                pos_set:set = set(pos.split("|"))
                sentence.append((word_, pos_set))
            
            if sentence:
                full_data.append(sentence)

In [3]:
#Better Sanity Check so I can see the structure
print(f"full_data type: {type(full_data)}")
print(f"Number of sentences: {len(full_data)}")

if full_data:
    first_sentence = full_data[0]
    print(f"First sentence type: {type(first_sentence)}")
    print(f"First sentence length: {len(first_sentence)}")
    
    if first_sentence:
        first_item = first_sentence[0]
        print(f"First item type: {type(first_item)}")
        print(f"First item: {first_item}")

full_data type: <class 'list'>
Number of sentences: 52108
First sentence type: <class 'list'>
First sentence length: 17
First item type: <class 'tuple'>
First item: ('In', {'IN'})


In [None]:
#HMM Model
import numpy as np
class HiddenMarkovModel:
    def __init__(self):
        #Initialize everything when I first create the Hidden Markov Model
        self.states = None
        self.observations = None
        
        #I need these states/observations to index
        #Because I need a way to calculate the probs (numpy understands integer indices, NOT strings!!!)
        self.states_to_idx = None
        self.states_to_idx = None
        
        #Make empty initial/tranmission/emission probabilities 
        #Since it's all learned during training
        self.initial_probs = None
        self.transition_probs = None
        self.emission_probs = None
        
    def train_HMM(self, training_data: list):
        """
        Trains the HMM on tagged data
        Calculates the initial, transmission, and emission probabilities
        Args:
            training_data (list): a list of a list of tuples with the words and POS tags
        """
        #Build the states and observations from the training data
        #Make them sets, since they don't allow duplication
        all_states = set()
        all_observations = set()
        for sentence in training_data:
            for word,tags in sentence:
                #Observations are based on the words
                all_observations.add(word)
                #The states are the tags
                all_states.update(tags)
        
        #Make the states and observations into lists
        self.states = list(all_states)
        self.observations = list(all_observations)
        
        #Make my state/observation index
        self.state_to_idx: dict = {state: i for i, state in enumerate(self.states)}
        self.obs_to_idx: dict = {obs: i for i, obs in enumerate(self.observations)}
        
        #initialize the empty matrices
        n_states = len(self.states)
        n_observations = len(self.observations)
        self.initial_probs = np.zeros(n_states)
        self.transition_probs = np.zeros((n_states, n_states))
        self.emission_probs = np.zeros((n_states, n_observations))
        
        #Now calculate the all the probabilities
        self.calculate_initial_probabilities(training_data)
        self.calculate_transition_probabilities(training_data)
        self.calculate_emission_probabilities(training_data)
        
        #DEBUGGING TO SEE IF IT WORKS PROPERLY
        #print("Sample transition probabilities:")
        #print(f"DT -> NN: {self.transition_probs[self.state_to_idx['DT']][self.state_to_idx['NN']]}")
        #print(f"NN -> VB: {self.transition_probs[self.state_to_idx['NN']][self.state_to_idx['VB']]}")

        #print("\nSample emission probabilities:")
        #print(f"P('The'|'DT'): {self.emission_probs[self.state_to_idx['DT']][self.obs_to_idx['The']]}")
        #print(f"P('cat'|'NN'): {self.emission_probs[self.state_to_idx['NN']][self.obs_to_idx['cat']]}")
        
    def calculate_initial_probabilities(self,training_data: list) -> np.ndarray:
        """
        Calculate the intial state probabilities P(tag|start)
        Args:
            training_data (list): a list of a list of tuples with the words and POS tags
        """
        for sentence in training_data:
            #Check to see if the sentence is empty
            if sentence:
                #Get the first words and tag(s) in the sentence
                first_word,first_tags = sentence[0]
                #Handle if the word has multiple tags
                for tag in first_tags:
                    #If the tag is in the state indec dictionary
                    if tag in self.state_to_idx:
                        tag_idx = self.state_to_idx[tag] #Forgot to add this and it lead to an error
                        #Fractional count if there's multiple tags
                        self.initial_probs[tag_idx] = self.initial_probs[tag_idx] + 1 / (len(first_tags))
    
    def calculate_transition_probabilities(self, training_data:list) -> np.ndarray:
        """
        Create the transition probability of current tag and previous tag
        P(tag i | tag i-1)
        Args:
            training_data (list): a list of a list of tuples with the words and POS tags
        """
        #Create a temporary matrix that will do all the calculations
        #Then store that into the self.transition_probability matrix
        transition_counts = np.zeros((len(self.states), len(self.states)))
        
        for sentence in training_data:
            #i in range of the entire sentence
            for i in range(1, len(sentence)):
                #Previous word and tags
                prev_word, prev_tags = sentence[i-1]
                #Current word and current tags
                current_word, current_tags = sentence[i]
                for previous_tag in prev_tags:
                    for current_tag in current_tags:
                        #If both the previous tag and the current tag are in the state index dicitonary
                        if previous_tag in self.state_to_idx and current_tag in self.state_to_idx:
                            prev_idx = self.state_to_idx[previous_tag]
                            curr_idx = self.state_to_idx[current_tag]
                            #Accidentally used + instead of *
                            transition_counts[prev_idx][curr_idx] +=  1 / (len(prev_tags) * len(current_tags))
                            
        #I need to normalize the transition matrix so it's between 0-1
        row_sums = transition_counts.sum(axis=1, keepdims=True)
        self.transition_probs = np.divide(transition_counts, row_sums, 
                                    out=np.zeros_like(transition_counts), 
                                    where=row_sums!=0)
    
    def calculate_emission_probabilities(self, training_data:list) -> np.ndarray:
        """
        Create the emission probability of the word and tag
        P(word | tag)
        Args:
            training_data (list): 
        """
        #Need a temporary matrix that does all the calculations
        #Then put it into the emission porbability matrix
        emission_counts = np.zeros((len(self.states), len(self.observations)))
        
        for sentence in training_data:
            for word, tags in sentence:
                if word in self.obs_to_idx:
                    word_idx = self.obs_to_idx[word]
                    for tag in tags:
                        if tag in self.state_to_idx:
                            tag_idx = self.state_to_idx[tag]
                            emission_counts[tag_idx][word_idx] += 1 / len(tags)
            
        #Normalize the counts into probabilities (I forgot this, which caused an issue in the code (It was more than 1))
        row_sums = emission_counts.sum(axis=1, keepdims=True)
        self.emission_probs = np.divide(emission_counts, row_sums,
                                    out=np.zeros_like(emission_counts),
                                    where=row_sums!=0)
        
    def viterbi(self, sentence: list) -> np.ndarray:
        """ My implementation of the viterbi algorithm from the textbook
        It returns the best path from the end of the sentence to the beginning
        Args:
            Sentence (list): a list of words
        """
        #Debug to see how the input is
        print(f"Input sentence: {sentence}")
     
        #Intialize the viterbi matrix and the bacpointer matrix
        viterbi = np.zeros((len(sentence), len(self.states)))
        backpointer = np.empty((len(sentence), len(self.states)))
       
        #for each state s from 1 to s
        first_word = sentence[0]
        for state_idx in range(len(self.states)):
            #make a viterbi matrix where viterbi[s][1] <- init_prob of that state * emission[state][observation[0]]
            #This is if the word is known
            if first_word in self.obs_to_idx:
                word_idx = self.obs_to_idx[first_word]
                #viterbi[first word][state] = initial prob of that state * emission[first word in the sentence]
                viterbi[0][state_idx] = self.initial_probs[state_idx] * self.emission_probs[state_idx][word_idx]
            
            #I need a way to handle unknown words
            else:
                #If the word is not known, make it 0
                viterbi[0][state_idx] = 0
            
            #Backpointer for the first word. There's no previous word so make it something to denote that
            backpointer[0][state_idx] = -1
            
            #Debugging statement to see what the initial viterbi row looks like
            #print(f"Initial viterbi row: {viterbi[0]}")
            
        #Going through my sentence (after the first word)
        for t in range(1, len(sentence)):
            #Get the index of the current word
            current_word = sentence[t]
            #See if the current word's index exists
            current_word_idx = self.obs_to_idx.get(current_word)
            
            #Go through every state besides the first word
            for current_state in range(len(self.states)):
                #Need variables to find which previous states gives us the max probability
                max_prob = -1
                best_prev_state = -1
                #Need to go through the previous states
                for prev_state in range(len(self.states)):
                    #The probability of the viterbi[previous word][previous state] * transition probability matrix[previous state][current state] * emission probability matrix[current state][word index]
                    prob = viterbi[t-1][prev_state] * self.transition_probs[prev_state][current_state] * self.emission_probs[current_state][current_word_idx]
                    
                    if prob > max_prob:
                        max_prob = prob #make the current probability the new max probability
                        best_prev_state = prev_state #make the current previous state the best previous state
                
                #After checking all the previous states, store the max probability adn the best previous state
                #Into the viterbi and the backpointer prespectively        
                viterbi[t][current_state] = max_prob
                
                # Debug statement to see what viterbi looks like after each time step
                #print(f"Viterbi at time {t}: {viterbi[t]}")
                
                backpointer[t][current_state] = best_prev_state
                         
        #Backtracking now
        #Get the last word of the sentence
        last_word = len(sentence) - 1
        #Get the best state for the last word with the argmax of the viterbi matrix
        best_last_state = np.argmax(viterbi[last_word])
        #Make a best path array with type int
        bestpath = np.zeros(len(sentence), dtype=int)
        #Make the best path of the last word the best last state
        bestpath[last_word] = best_last_state
        #Start from the second to last word and end at the beginning of the sentence
        #n-2, n-3, ..., 0
        for t in range(len(sentence)-2, -1, -1):
            bestpath[t] = backpointer[t+1][bestpath[t+1]]
            
        #Return the best path and the best path's probability
        return bestpath
    
    def predict(self, sentence: list) -> list:
        """
        Predict the part-of-speech tags for each word in the sentence
        Args:
            sentence (list): a list of words the HMM predicts
        Returns:
            a list of tuples (word, and predicted tag)
        """
        #Use the viterbi function
        tag_indices = self.viterbi(sentence)
        
        #Convert indices to actual tag names
        predicted_tags = [self.states[idx] for idx in tag_indices]
        
        #Pair words with predicted tags
        return list(zip(sentence, predicted_tags))
        

In [5]:
#Send in my list to train the model
hmm = HiddenMarkovModel()
hmm.train_HMM(full_data)

In [6]:
#A sample test Set for the HMM
#A few short sentences
test_sentence1 = ["The", "cat", "sat"]
test_sentence2 = ["Mark", "will", "pay", "the", "bill", "soon"]
test_sentence3 = ["I", "know", "how", "watch", "after", "a", "dog"]
test_sentence4 = ["I", "am", "so", "tired", "."]

possible_unknown = ["The", "hidden", "markov", "model", "is", "working", "well", "."]
possible_unknown = ["Computer", "science", "is", "cool", "but", "very", "hard", "."]
#A two long ones
test_sentence_long = ["The", "police", "department", "said", "that", "the", "suspect", "has", "been", "apprehended", "today", ",", "they", "hope", "justice", "will", "be", "served", "."]
test_sentence_long2 = ["Today", "the", "studio", "announced", "that", "the", "new", "film", "will", "be", "about", "a", "girl", "who", "is", "transported", "to", "another", "world", "."]

predicted_tags1 = hmm.predict(test_sentence1)
print("HMM prediction of first sentence: ", predicted_tags1)
#Originally: HMM prediction of first sentence:  [('The', 'NPS'), ('cat', 'NPS'), ('sat', 'NPS')] - predicted it as NPs for some reason (Error with probability matrices)
#Fixed it issue: HMM prediction of first sentence:  [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]

predicted_tags2 = hmm.predict(test_sentence2)
print("HMM prediction of second sentence: ", predicted_tags2)
#HMM prediction of second sentence:  [('Mark', 'NNP'), ('will', 'MD'), ('pay', 'VB'), ('the', 'DT'), ('bill', 'NN'), ('soon', 'RB')]

predicted_tags3 = hmm.predict(test_sentence3)
print("HMM prediction of third sentence: ", predicted_tags3)
#HMM prediction of third sentence:  [('I', 'PRP'), ('know', 'VBP'), ('how', 'WRB'), ('watch', 'NN'), ('after', 'IN'), ('a', 'DT'), ('dog', 'NN')]

predicted_tags4 = hmm.predict(test_sentence4)
print("HMM prediction of fourth sentence: ", predicted_tags4)
#HMM prediction of fourth sentence:  [('I', 'PRP'), ('am', 'VBP'), ('so', 'RB'), ('tired', 'VBN'), ('.', '.')]

predicted_long_tags1 = hmm.predict(test_sentence_long)
print("HMM prediction of first long sentence: ", predicted_long_tags1)
#HMM prediction of first long sentence:  [('The', 'DT'), ('police', 'NN'), ('department', 'NN'), ('said', 'VBD'), ('that', 'IN'), ('the', 'DT'), ('suspect', 'NN'), ('has', 'VBZ'), ('been', 'VBN'), ('apprehended', 'VBN'), ('today', 'RB'), (',', ','), ('they', 'PRP'), ('hope', 'VBP'), ('justice', 'NN'), ('will', 'MD'), ('be', 'VB'), ('served', 'VBN'), ('.', '.')]

predicted_long_tags2 = hmm.predict(test_sentence_long2)
print("HMM prediction of second long sentence: ", predicted_long_tags2)
#HMM prediction of second long sentence:  [('Today', 'RB'), ('the', 'DT'), ('studio', 'NN'), ('announced', 'VBD'), ('that', 'IN'), ('the', 'DT'), ('new', 'JJ'), ('film', 'NN'), ('will', 'MD'), ('be', 'VB'), ('about', 'IN'), ('a', 'DT'), ('girl', 'NN'), ('who', 'WP'), ('is', 'VBZ'), ('transported', 'VBN'), ('to', 'TO'), ('another', 'DT'), ('world', 'NN'), ('.', '.')]

Input sentence: ['The', 'cat', 'sat']
HMM prediction of first sentence:  [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]
Input sentence: ['Mark', 'will', 'pay', 'the', 'bill', 'soon']
HMM prediction of second sentence:  [('Mark', 'NNP'), ('will', 'MD'), ('pay', 'VB'), ('the', 'DT'), ('bill', 'NN'), ('soon', 'RB')]
Input sentence: ['I', 'know', 'how', 'watch', 'after', 'a', 'dog']
HMM prediction of third sentence:  [('I', 'PRP'), ('know', 'VBP'), ('how', 'WRB'), ('watch', 'NN'), ('after', 'IN'), ('a', 'DT'), ('dog', 'NN')]
Input sentence: ['I', 'am', 'so', 'tired', '.']
HMM prediction of fourth sentence:  [('I', 'PRP'), ('am', 'VBP'), ('so', 'RB'), ('tired', 'VBN'), ('.', '.')]
Input sentence: ['The', 'police', 'department', 'said', 'that', 'the', 'suspect', 'has', 'been', 'apprehended', 'today', ',', 'they', 'hope', 'justice', 'will', 'be', 'served', '.']
HMM prediction of first long sentence:  [('The', 'DT'), ('police', 'NN'), ('department', 'NN'), ('said', 'VBD'), ('that', 'IN'), ('th

In [7]:
#Feature engineering for the extracted brown files
#I got help from the geeks2geeks website
#https://www.geeksforgeeks.org/nlp/conditional-random-fields-crfs-for-pos-tagging-in-nlp/
def word_features(sentence, i):
    word = sentence[i][0]
    features = {
        'word': word,
        'is_first': i == 0, #if the word is a first word
        'is_last': i == len(sentence) - 1,  #if the word is a last word
        'is_capitalized': word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,      #word is in uppercase
        'is_all_lower': word.lower() == word,      #word is in lowercase
         #prefix of the word
        'prefix-1': word[0],   
        'prefix-2': word[:2],
        'prefix-3': word[:3],
         #suffix of the word
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
         #extracting previous word
        'prev_word': '' if i == 0 else sentence[i-1][0],
         #extracting next word
        'next_word': '' if i == len(sentence)-1 else sentence[i+1][0],
        'has_hyphen': '-' in word,    #if word has hypen
        'is_numeric': word.isdigit(),  #if word is in numeric
        'capitals_inside': word[1:].lower() != word[1:]
    }
    return features

In [None]:
X = []
y = []
for sentence in full_data:
    X_sentence = []
    y_sentence = []
    #Go through every sentence in the full data list
    for i in range(len(sentence)):
        #Append the word features into the X_sentence
        #print(f"Sentence[i][0]: {sentence[i][0]}") 
        X_sentence.append(word_features(sentence,i))
        #print(f"Sentence[i][1] is: {sentence[i][1]}")
        y_sentence.append(sentence[i][1])
        
    #Append the sentences into the original list
    X.append(X_sentence)
    y.append(y_sentence)
    
#Split the extracted files (80% training, 20% testing)
split = int(0.8 * len(X))
#Get every word,tag up to 80% of the orignal X and y
X_train = X[:split]
y_train = y[:split]
#Get the remaining 20% of the original X and y
X_test = X[split:]
y_test = y[split:]

In [None]:
#CRF
import numpy as np
import math
from collections import defaultdict

#Need to modify to handle more than one tag
class ConditionalRandomField():
    '''
    My implementation of linear-chain Conditional Random Field
    '''
    def __init__(self):
        
        self.weights = defaultdict(float) #Use the defaultdict to make a dictionary fo floats for the weights
        self.labels = set() #A set for all the possible POS tags
        
    def feature_extraction(self, feature_dict, current_tags, prev_tags=None):
        '''Extract all the features for the states and transitions'''
        features = []
        
        #Get the state features for the current tag
        for current_tag in current_tags:
            for key,value in feature_dict.items():
                #Hanld if the value is a boolean item
                if isinstance(value, bool):
                    if value:
                        features.append(f"{key}=True{current_tag}")
                #Handle if the value is a string
                elif isinstance(value, str):
                    #Skip empty strings
                    if value:
                        features.append(f"{key}={value}_{current_tag}")
                #If it's not a boolean value or a string, append it as what is
                else:
                    features.append(f"{key}={value}_{current_tag}")
                    
        
        #get the transition features (previous tag -> current tag)
        if prev_tags is not None:
            for prev_tag in prev_tags:
                for current_tag in current_tags:
                    features.append(f"trans_{prev_tag}_{current_tag}")
            
        return features
    
    def compute_score(self, features:list):
        return sum(self.weights[f] for f in features)
    
    def compute_log_likelihood(self, sentence_features:list, true_tags, Z:float):
        """Compute log-likelihood for a sentence
        """
        log_likelihood = 0.0
        
        # Add state feature scores
        for i, true_label in enumerate(true_tags):
            features = self.feature_extraction(sentence_features[i], true_label)
            log_likelihood += self.compute_score(features)
        
        # Add transition feature scores
        for i in range(1, len(true_tags)):
            prev_label = true_tags[i-1]
            current_label = true_tags[i]
            trans_feat = f"trans_{prev_label}_{current_label}"
            log_likelihood += self.weights[trans_feat]
        
        # Subtract log partition function
        log_likelihood -= math.log(Z) if Z > 0 else 0
        
        return log_likelihood
        
    def train(self,X_train:list, y_train:list, learning_rate:float=0.01,iterations:int=10):
        """
        Train the linear-chain CRF using the X_train and the y_train
        Args:
            X_train (list): a list of features
            y_train (list): a list of words that are associated with the feature
            learning_rate (float, optional): _description_. Defaults to 0.01.
            iterations (int, optional): _description_. Defaults to 10.
        """
        #Collect all the labels in the y_train
        for sentence_tags in y_train:
            for tag_set in sentence_tags:
                for tag in tag_set:
                    self.labels.add(tag)
                
        #Debugging statement to see if tags were added        
        print(f"Training with {len(self.labels)} labels individual labels: {list(self.labels)}")
        
        #Convert the y-train to a sequence of possible tags
        
        #had an error here because it couldn't iterate over an integer
        for iteration in range(iterations):
            total_loss = 0.0
            #Make it so the model can see the sentence feature and the tag for that feature
            for sentence_features, actual_tags in zip(X_train, y_train):
                n = len(sentence_features)
                
                #Forward-backward algorithm using the features in X_train
                alpha = self.forward_algorithm(sentence_features)
                beta = self.backward_algorithm(sentence_features)
                
                #Compute the partition function
                Z = sum(alpha[n-1].values())
                
                #Compute the expected and actual feature counts
                expected_feature_counts = defaultdict(float)
                actual_feature_counts = defaultdict(float)
                
                #State feature extraction
                for i in range(n):
                    for label in self.labels:
                        prob = (alpha[i][label] * beta[i][label]) / Z
                        features = self.feature_extraction(sentence_features[i], label)
                        for feat in features:
                            expected_feature_counts[feat] += prob             
                #Transition feature extraction
                for i in range(1,n):
                    for prev_label in self.labels:
                        for current_label in self.labels:
                            prob = (alpha[i-1][prev_label] * math.exp(self.compute_score(self.feature_extraction(sentence_features[i], current_label)))
                                    * math.exp(self.compute_score([f"trans_{prev_label}_{current_label}"])) * beta[i][current_label]) / Z
                            expected_feature_counts[f"trans_{prev_label}_{current_label}"] += prob
                            
                #Actual counts from the features
                for i, true_label in enumerate(actual_tags):
                    features = self.feature_extraction(sentence_features[i], true_label)
                    for feat in features:
                        actual_feature_counts[feat] += 1.0
                    if i > 0:
                        prev_label = true_label[i-1]
                        actual_feature_counts[f"trans_{prev_label}_{current_label}"] += 1.0
                
                #Update the weights
                all_features = set(expected_feature_counts.keys()) | set(actual_feature_counts.keys())
                for feat in all_features:
                    #Gradient = actual feature - expected feature
                    gradient = actual_feature_counts[feat] - expected_feature_counts[feat]
                    
                    #Update the weights of that feature with the learning rate and gradient
                    self.weights[feat] += learning_rate * gradient
                #Calculate the log likelihood for this sentence
                log_likelihood = self.compute_log_likelihood(sentence_features, actual_tags, Z)
                
                total_loss -= log_likelihood
            print(f"Iteration {iteration + 1}, Loss: {total_loss}")
    
    def forward_algorithm(self, sentence_features, possible_tag_sequence):
        n = len(sentence_features)
        alpha = [defaultdict(float) for _ in range(n)]
        
        #Initialize the first position in the sentence
        for tag in possible_tag_sequence:
            features = self.feature_extraction(sentence_features[0], {tag})
            alpha[0][frozenset({tag})] = math.exp(self.compute_score(features))
        
        #Forward pass
        #Go through every other word in the sentence (1-n)
        for i in range(1,n):
            #Look at the current tag
            for current_tag in possible_tag_sequence[i]:
                current_tag_set = frozenset({current_tag})
                total = 0.0
                #get the current features and the score of that state
                current_features = self.feature_extraction(sentence_features[i], {current_tag})
                state_score = math.exp(self.compute_score(current_features))
                
                #Look at the previous labels
                #Get the transition features and the score
                for prev_tag in possible_tag_sequence[i-1]:
                    prev_tag_set = frozenset({prev_tag})
                    transition_features = [f"trans_{prev_tag}_{current_tag}"]
                    transition_score = math.exp(self.compute_score(transition_features))
                    
                    total += alpha[i-1][prev_tag_set] * state_score * transition_score
                    
                alpha[i][current_tag_set] = total
                
        return alpha
    
    def backward_algorithm(self, sentence_features, possible_tag_set):
        n = len(sentence_features)
        beta = [defaultdict(float) for _ in range(n)]
        
        #Initialize the last position in the sentence
        for label in self.labels:
            beta[n-1][label] = 1.0
            
        #Backward pass
        #Go all the way back to the first word in the sentence
        for i in range(n-2, -1, -1):
            for current_label in self.labels:
                total = 0.0
                #For the next label in the sentence
                for next_label in self.labels:
                    next_features = self.feature_extraction(sentence_features[i+1], next_label)
                    state_score = math.exp(self.compute_score(next_features))
                    transition_features = [f"trans_{current_label}_{next_label}"]
                    transition_score = math.exp(self.compute_score(transition_features))
                    total += beta[i+1][next_label] * state_score * transition_score
                
                beta[i][current_label] = total
        return beta
    def crf_viterbi(self, sentence_features):
        """
        This is the viterbi algorithm for the linear-chain CRF
        """
        n = len(sentence_features)
        #viterbi matrix for the algorithm
        dp = [defaultdict(float) for _ in range(n)]
        #This is the backpointer matrix for the viterbi algorithm
        backpointer = [defaultdict(str) for _ in range(n)]
        
        #Initialize the first position like in the HMM
        for label in self.labels:
            #Get the features for the first word in the sentence and the label
            features = self.feature_extraction(sentence_features[0], label)
            dp[0][label] = self.compute_score(features)
        
        #Now go through every word in the sentence
        #Fill in the DP table
        for i in range(1,n):
            for current_label in self.labels:
                #Set the best score and prev label to infinity and none
                #because they're not known yet
                best_score = -float('inf')
                best_prev_label = None
                
                current_features = self.feature_extraction(sentence_features[i], current_label)
                state_score = self.compute_score(current_features)
                
                for prev_label in self.labels:
                    trans_features = [f"trans_{prev_label}_{current_label}"]
                    transition_score = self.compute_score(trans_features)
                    #Calculate with the previous word and previous label, state score, and transition score
                    score = dp[i-1][prev_label] + state_score + transition_score
                    
                    #Same as the HMM, check to see if the score beats the best score
                    if score > best_score:
                        best_score = score #Replace the best score with the new score
                        best_prev_label = prev_label #Make the best_prev_label the current previous label
                
                #Put the best score and best_prev_label into the viterbi matrix and backpointer matrix respectively        
                dp[i][current_label] = best_score
                backpointer[i][current_label] = best_prev_label
            
        #Backtracking
        bestpath = []
        best_final_label = max(dp[n-1].items(), key= lambda x: x[1])[0]
        
        current_label = best_final_label
        for i in range(n-1, -1, -1):
            bestpath.append(current_label)
            if i > 0:
                current_label = backpointer[i][current_label]
        return bestpath[::1]
    
    def predict(self, X_test:list):
        """
        A prediction function that predicts the parts of speech for a given sentence
        Args:
            X_test (list): a list containing the words for a sentence

        Returns:
            list: a list that contains the viterbi path from the last word to the first word
        """
        prediction_result = []
        for sentence_features in X_test:
            pred_tags = self.crf_viterbi(sentence_features)
            prediction_result.append(pred_tags)
        return prediction_result

In [17]:
crf = ConditionalRandomField()
crf.train(X_train, y_train, learning_rate=0.1, iterations=10)

# Predict
y_pred = crf.predict(X_test)

# Evaluate (simple accuracy)
correct = 0
total = 0
for true_tags, pred_tags in zip(y_test, y_pred):
    for t, p in zip(true_tags, pred_tags):
        if t == p:
            correct += 1
        total += 1

accuracy = correct / total
print(f"Accuracy: {accuracy:.4f}")

Training with 48 labels


TypeError: 'set' object is not subscriptable

When I made my learning rate and iterattion the default, 0.01 and 10:
Accuracy = 

In [None]:
#Now I need to read in the GMB dataset
with open('GMB_dataset.txt', 'r') as gmb_file:
    #Need a way to read in the text file with columns
    line = gmb_file.readlines()
    sentence = gmb_file.strip()

In [None]:
#Feature engineering necessary for the CRF
#Same as the one for CRF 
def word_feature(sentence, i):
    word = sentence[i][0]
    features = {
        'word': word,
        'is_first': i == 0, #if the word is a first word
        'is_last': i == len(sentence) - 1,  #if the word is a last word
        'is_capitalized': word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,      #word is in uppercase
        'is_all_lower': word.lower() == word,      #word is in lowercase
         #prefix of the word
        'prefix-1': word[0],   
        'prefix-2': word[:2],
        'prefix-3': word[:3],
         #suffix of the word
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
         #extracting previous word
        'prev_word': '' if i == 0 else sentence[i-1][0],
         #extracting next word
        'next_word': '' if i == len(sentence)-1 else sentence[i+1][0],
        'has_hyphen': '-' in word,    #if word has hypen
        'is_numeric': word.isdigit(),  #if word is in numeric
        'capitals_inside': word[1:].lower() != word[1:]
    }
    return features

In [None]:
#test sentences for the CRFs
CRF_test_sentence1 = ["She", "likes", "to", "read","books"]
CRF_test_sentence2 = []
CRF_test_sentence3 = []
CRF_test_sentence4 = []