In [2]:
import ast

import numpy as np
import pandas as pd

In [3]:
# Read the training data from the CSV file.
train_data = pd.read_csv('train.csv')

# Each 'tagged_sentence' cell holds the string form of a list of (word, tag)
# pairs. It is parsed with ast.literal_eval instead of eval so that only Python
# literals are accepted and a crafted CSV cell cannot execute arbitrary code.
tagged_sentences = train_data['tagged_sentence'].apply(ast.literal_eval).tolist()

def get_states_obs(data):
    """Collect the unique words (observations) and tags (states) found in `data`.

    Parameters
    ----------
    data : pandas.DataFrame or iterable
        Either a DataFrame that has a 'tagged_sentence' column, or an iterable
        of tagged sentences. A tagged sentence is a sequence of (word, tag)
        pairs, or the string form of such a sequence (as stored in the CSV).

    Returns
    -------
    tuple of (list, list)
        ``(observations, states)``, each sorted so that the position of a tag
        in ``states`` is the same on every run (plain set iteration order for
        strings varies between interpreter sessions).
    """
    # Use the argument that was passed in rather than a module-level global.
    rows = data['tagged_sentence'] if hasattr(data, 'columns') else data

    observations = set()
    states = set()
    for row in rows:
        if isinstance(row, str):
            # Unparsed CSV cell: accept Python literals only.
            row = ast.literal_eval(row)
        for word, tag in row:
            observations.add(word)
            states.add(tag)

    return sorted(observations), sorted(states)


# Unique words and unique tags seen in the training data.
obs, states = get_states_obs(train_data)

# Word endings used as features; each one is mapped to its own slot in the
# feature vector, numbered consecutively from 49 in the order listed here.
suffixes = {
    ending: 49 + offset
    for offset, ending in enumerate((
        'ed', 'ing', 'ly', 'tion', 'er', 'or', 'sion', 'ful',
        'less', 'ment', 'able', 'ible', 'ous', 'est', 'ify', 'ize',
    ))
}


feature_matrix = []

def extract_suffix(word, suffixes):
    """Return the feature index of the suffix that `word` ends with, or 0.

    Endings of length 2, 3 and 4 are tried in that order. An ending is only
    considered when at least three characters of the word remain in front of
    it (word length >= 5, 6 and 7 respectively).

    Parameters
    ----------
    word : str
        The token to inspect.
    suffixes : dict
        Maps a suffix string to its feature index.

    Returns
    -------
    The value stored in `suffixes` for the first matching ending, or 0 when no
    ending matches.
    """
    word_length = len(word)
    for ending_length in (2, 3, 4):
        # The minimum word length grows with the ending length, so once the
        # word is too short for this ending it is too short for longer ones.
        if word_length < ending_length + 3:
            break
        ending = word[-ending_length:]
        if ending in suffixes:
            return suffixes[ending]
    return 0


# Feature vector layout (67 entries):
#   0-48  : one-hot of the previous word's tag (position of the tag in `states`)
#   49-64 : one-hot of the word's suffix (indices taken from `suffixes`)
#   65    : set when the word is the first in its sentence
#   66    : set when the word is the last in its sentence
# One vector is built per token of the training set, using the gold previous tag.
overall_feature = []
for sentence in tagged_sentences:
    final_position = len(sentence) - 1
    for position, pair in enumerate(sentence):
        vector = np.zeros((67))
        token = pair[0]

        if position == 0:
            vector[65] = 1
        else:
            previous_tag = sentence[position - 1][1]
            vector[states.index(previous_tag)] = 1

        if position == final_position:
            vector[66] = 1

        suffix_slot = extract_suffix(token, suffixes)
        if suffix_slot != 0:
            vector[suffix_slot] = 1

        overall_feature.append(vector)

# Training set as a single array, one row per token.
overall_feature = np.array(overall_feature)

In [4]:
# Making the model for multinomial logistic regression

class MaximumEntropyMarkovModel:
    """Softmax (multinomial logistic regression) classifier.

    Trained with mini-batch gradient descent on the cross-entropy loss. Each
    input row is a feature vector and each target row is a one-hot label.

    Parameters
    ----------
    learning_rate : float
        Step size of every gradient update.
    num_iterations : int
        Number of full passes over the training data.
    batch_size : int
        Number of rows per mini-batch.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1, batch_size=32):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size=batch_size

    def softmax(self, z):
        """Return the row-wise softmax of `z` as a 2-D array.

        The row maximum is subtracted before exponentiating. This leaves the
        result unchanged mathematically but prevents `exp` from overflowing
        to inf (and the division from producing NaN) for large logits.
        """
        z = np.atleast_2d(np.asarray(z, dtype=float))
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def fit(self, X, y):
        """Learn `self.weights` and `self.bias` from the training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples, n_classes)
            One-hot encoded targets.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        np.random.seed(42)  # Set seed for reproducibility
        self.weights = np.random.randn(X.shape[1], y.shape[1])
        self.bias = np.random.randn(1, y.shape[1])

        for _ in range(self.num_iterations):
            for start in range(0, X.shape[0], self.batch_size):
                X_batch = X[start:start + self.batch_size]
                y_batch = y[start:start + self.batch_size]

                # The forward pass must include the bias so that training
                # optimises the same function that predict() evaluates.
                linear_model = np.dot(X_batch, self.weights) + self.bias
                y_predicted = self.softmax(linear_model)

                error = y_predicted - y_batch
                dw = (1 / X_batch.shape[0]) * np.dot(X_batch.T, error)
                # Per-class bias gradient: sum over the batch axis only.
                # Summing over every axis gives a scalar that is ~0 because
                # each row of both y_predicted and y_batch sums to 1.
                db = (1 / X_batch.shape[0]) * np.sum(error, axis=0, keepdims=True)

                self.weights -= self.learning_rate * dw
                self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return the index of the most probable class for each row of `X`.

        A single 1-D feature vector is accepted and treated as one row, so the
        result is always a 1-D array with one entry per input row.
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.softmax(linear_model)
        return np.argmax(y_predicted, axis=1)



In [5]:
# Integer label for every training token: the position of its tag in `states`.
# A dict lookup replaces the repeated linear scan done by states.index().
tag_to_index = {tag: index for index, tag in enumerate(states)}
pos_labels = np.array([
    tag_to_index[tag]
    for row in tagged_sentences
    for word, tag in row
])

from sklearn.preprocessing import OneHotEncoder

# One hot encoding the target labels.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back for older
# installations.
try:
    one_hot = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(sparse=False)
one_hot_output = one_hot.fit_transform(pos_labels.reshape(-1, 1))




In [6]:
# Training inputs: one feature row per token and the matching one-hot tag rows.
X = overall_feature
y = one_hot_output

# Build the classifier and train it on the whole training set.
model = MaximumEntropyMarkovModel(num_iterations=1000, learning_rate=0.01)
model.fit(X, y)


# sample_test=['For', 'you', 'have', 'been', 'reborn', ',', 'not', 'from', 'corruptible', 'seed', 'but', 'from', 'incorruptible', ',', 'through', 'the', 'word', 'of', 'God', '.']

In [7]:
output = []


def make_feature_sentence(sentence, suffixes, word_index, prev_tag):
    """Build the 67-entry feature vector for one word of an untagged sentence.

    Parameters
    ----------
    sentence : sequence of str
        The words of the sentence.
    suffixes : dict
        Maps a suffix string to its feature index (passed to extract_suffix).
    word_index : int
        Position in `sentence` of the word to featurise.
    prev_tag
        Tag assigned to the previous word; looked up in the module-level
        `states` list. It is not used when `word_index` is 0.

    Returns
    -------
    numpy.ndarray
        Vector of zeros with a 1 at: the previous tag's position in `states`
        (or index 65 for the first word), index 66 for the last word, and the
        suffix index when extract_suffix returns a non-zero value.
    """
    suffix_slot = extract_suffix(sentence[word_index], suffixes)
    vector = np.zeros(67)

    # Sentence-start marker, otherwise the one-hot of the previous tag.
    if word_index == 0:
        vector[65] = 1
    else:
        vector[states.index(prev_tag)] = 1

    # Sentence-end marker.
    if word_index == len(sentence) - 1:
        vector[66] = 1

    if suffix_slot != 0:
        vector[suffix_slot] = 1

    return vector



In [8]:
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs elsewhere after it is pointed at a local copy of the test file.
test_files=pd.read_csv(r"C:\Users\2828a\OneDrive\Desktop\ELL884 ASS1\test_small.csv")

# Each 'untagged_sentence' cell holds the string form of a list of words. It
# is parsed with ast.literal_eval instead of eval so that only Python literals
# are accepted and a crafted CSV cell cannot execute arbitrary code.
test_data = test_files['untagged_sentence'].apply(ast.literal_eval).tolist()

# Tag every sentence greedily from left to right: the tag predicted for one
# word becomes the previous-tag feature of the next word.
total_output = []
for sentence in test_data:
    sent_output = []
    for i in range(len(sentence)):
        # The first word has no predecessor; make_feature_sentence ignores
        # prev_tag when the index is 0, so 0 is only a placeholder.
        prev_tag = sent_output[i - 1][1] if i != 0 else 0
        feature = make_feature_sentence(sentence, suffixes, i, prev_tag)
        # Pass an explicit (1, n_features) batch instead of relying on
        # broadcasting to turn the single vector into one row.
        y_pred = model.predict(feature.reshape(1, -1))
        sent_output.append((sentence[i], states[y_pred[0]]))
    total_output.append(sent_output)


In [10]:
# Sentence ids from the test file, kept as Python objects.
ids = np.array(test_files['id'].tolist(), dtype=object)

def save_tagged_sentences_to_csv(tagged_sentences, ids, filename):
    """Write ids and their tagged sentences to `filename` as a two-column CSV.

    The columns are named 'id' and 'tagged_sentence'; the DataFrame index is
    not written.
    """
    columns = {'id': ids, 'tagged_sentence': tagged_sentences}
    pd.DataFrame(columns).to_csv(filename, index=False)

# Write the predictions, then read the file back into `df`.
output_path = r'C:\Users\2828a\OneDrive\Desktop\ELL884 ASS1\output_memm.csv'
save_tagged_sentences_to_csv(
    tagged_sentences=total_output,
    ids=ids,
    filename=output_path,
)
df = pd.read_csv(output_path)
# print(df)