## N-gram Models (without using any libraries)

In [46]:
class Ngram:
    def __init__(self, text, n=None):
        self.text = text
        self.n = n
        self.split_text = self.process()

    def process(self):
        text = self.text.lower()
        text = text.replace('.', ' ')
        text = text.replace(',', ' ')
        split_text = text.split()
        return split_text

    def ngram_context(self):
        ngram_list, context_list  = [], []
        vocab = set()
        n = self.n
        for i in range(0, len(self.split_text) - n + 1):
            ngram = self.split_text[i:i + n]
            context = self.split_text[i:i + n - 1]
            context_list.append(context)
            ngram_list.append(ngram)
        for i in self.split_text:
             vocab.update([i])
        vocab = list(vocab)
            
        return ngram_list, context_list, vocab

    def counts(self, ingram):
        ngramcount = 0
        contextcount = 0
        ngram_list = self.ngram_context()[0]
        context_list = self.ngram_context()[1]

        for i in ngram_list:
            if i == ingram:
                ngramcount += 1

        for i in context_list:
            if i == ingram[:-1]:
                contextcount += 1

        return ngramcount, contextcount

    def probability(self, ingram): #calculates the normal proability of the ngram
        ngram_list, context_list,vocab = self.ngram_context()
        ncount, ccount = self.counts(ingram)

        # Check for division by zero
        if ccount == 0:
            return 0.0

        prob = ncount / ccount
        return prob
    
    def probability_laplace(self, ingram): #calculates the laplace proability of the ngram
        ngram_list, context_list, vocab = self.ngram_context()
        ncount, ccount = self.counts(ingram)
        
        lprob = (ncount +1)/ (ccount + len(vocab))
        return lprob
    
    def probability_addk(self, ingram, k): #calculates the add-k proability of the ngram
        ngram_list, context_list, vocab = self.ngram_context()
        ncount, ccount = self.counts(ingram)
        
        kprob = (ncount + k)/ (ccount + k*len(vocab))
        return kprob
        

    def perplexity(self): #calculates perplexity using normal probability
        ngram_list = self.ngram_context()
        prob = 1
        for i in ngram_list:
            prob_i = self.probability(i)

            # Check if prob_i is 0.0, and if so, assign a small positive value (e.g., 1e-10)
            if prob_i == 0.0:
                prob_i = 1e-10

            prob = prob * prob_i

        # Check if prob is still 0.0 after the loop
        if prob == 0.0:
            return float('inf')  # Return infinity to indicate undefined perplexity
        else:
            perp = prob ** (-1 / self.n)
            return perp

In [47]:
text = "This is a test. It has some punctuation, this like commas and  this periods."
ngram = Ngram(text, n=2)
ngram_list, context_list, vocab = ngram.ngram_context()
print(ngram_list)
print(context_list)
print(vocab)
ngramcount  = ngram.counts([ 'this', 'is'])[0]
contextcount = ngram.counts([ 'this', 'is'])[1]
print(ngramcount) 
print(contextcount)
prob = ngram.probability(['this', 'is'])
lprob = ngram.probability_laplace(['this', 'is'])
kprob = ngram.probability_addk(['this', 'is'], 0.5)
print(prob)
print(lprob)
print(kprob)

[['this', 'is'], ['is', 'a'], ['a', 'test'], ['test', 'it'], ['it', 'has'], ['has', 'some'], ['some', 'punctuation'], ['punctuation', 'this'], ['this', 'like'], ['like', 'commas'], ['commas', 'and'], ['and', 'this'], ['this', 'periods']]
[['this'], ['is'], ['a'], ['test'], ['it'], ['has'], ['some'], ['punctuation'], ['this'], ['like'], ['commas'], ['and'], ['this']]
['some', 'like', 'commas', 'a', 'it', 'periods', 'is', 'test', 'has', 'punctuation', 'and', 'this']
1
3
0.3333333333333333
0.13333333333333333
0.16666666666666666


## Naive-Bayes (with laplace smoothing)

In [51]:
import math

# Sample text data (replace with your own dataset)
texts = ["Very Powerful", "the most fun film of the summer", "no surprises and very few laughs", "entirely predictable and lacks energy","just plain boring"]
print(texts)
labels = [1, 1, 0, 0,0]  # 1 for positive, 0 for negative

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.replace('.', '')  # Remove periods
    text = text.replace(',', '')  # Remove commas
    text = text.replace('!', '')  # Remove exclamation marks
    text = text.replace('?', '')  # Remove question marks
    text = text.replace('"', '')  # Remove double quotes
    text = text.replace("'", '')  # Remove single quotes
    text = text.replace('(', '')  # Remove opening parentheses
    text = text.replace(')', '')  # Remove closing parentheses
    text = text.split()  # Tokenize by whitespace
    return text

# Calculate the prior probabilities
total_samples = len(texts)
positive_samples = sum(labels)
negative_samples = total_samples - positive_samples
prior_positive = positive_samples / total_samples
prior_negative = negative_samples / total_samples
print(f"Prior positive: {prior_positive}")
print(f"Prior negative: {prior_negative}")

# Create a vocabulary of unique words
vocabulary = set()
for text in texts:
    words = preprocess_text(text)
    vocabulary.update(words)
vocabulary = list(vocabulary)

print(vocabulary)

# Calculate word frequencies in positive and negative classes
positive_word_counts = {word: 0 for word in vocabulary}
negative_word_counts = {word: 0 for word in vocabulary}
print(f"positive_word_counts is {positive_word_counts}")
print(f"negative_word_counts is {negative_word_counts}")

for i, text in enumerate(texts): #enumerate generates index and value
    words = preprocess_text(text)
    for word in words:
        if labels[i] == 1:
            positive_word_counts[word] += 1
        else:
            negative_word_counts[word] += 1

# Calculate conditional probabilities (likelihoods)
smooth_factor = 1  # Laplace smoothing to avoid zero probabilities
positive_likelihoods = {}
negative_likelihoods = {}

for word in vocabulary:
    positive_likelihoods[word] = (positive_word_counts[word] + smooth_factor) / (positive_samples + smooth_factor * len(vocabulary))
    negative_likelihoods[word] = (negative_word_counts[word] + smooth_factor) / (negative_samples + smooth_factor * len(vocabulary))

# Classify new text
def classify_text(text):
    words = preprocess_text(text)
    prob_positive = prior_positive
    prob_negative = prior_negative
    
    for word in words:
        if word in vocabulary:
            prob_positive *= positive_likelihoods[word]
            prob_negative *= negative_likelihoods[word]
    
    if prob_positive > prob_negative:
        return 1,prob_negative,prob_positive  # Positive class
    else:
        return 0,prob_negative,prob_positive  # Negative class

# Test the classifier
new_text = "predictable with no fun"
predicted_label,prob_negative,prob_positive = classify_text(new_text)
print(f"The positve probability is {prob_positive}")
print(f"The negative probability is {prob_negative}")
if predicted_label == 1:
    print("Positive sentiment")
else:
    print("Negative sentiment")


['Very Powerful', 'the most fun film of the summer', 'no surprises and very few laughs', 'entirely predictable and lacks energy', 'just plain boring']
Prior positive: 0.4
Prior negative: 0.6
['summer', 'very', 'laughs', 'most', 'the', 'plain', 'few', 'powerful', 'lacks', 'boring', 'and', 'no', 'energy', 'predictable', 'film', 'surprises', 'entirely', 'fun', 'just', 'of']
positive_word_counts is {'summer': 0, 'very': 0, 'laughs': 0, 'most': 0, 'the': 0, 'plain': 0, 'few': 0, 'powerful': 0, 'lacks': 0, 'boring': 0, 'and': 0, 'no': 0, 'energy': 0, 'predictable': 0, 'film': 0, 'surprises': 0, 'entirely': 0, 'fun': 0, 'just': 0, 'of': 0}
negative_word_counts is {'summer': 0, 'very': 0, 'laughs': 0, 'most': 0, 'the': 0, 'plain': 0, 'few': 0, 'powerful': 0, 'lacks': 0, 'boring': 0, 'and': 0, 'no': 0, 'energy': 0, 'predictable': 0, 'film': 0, 'surprises': 0, 'entirely': 0, 'fun': 0, 'just': 0, 'of': 0}
The positve probability is 7.513148009015778e-05
The negative probability is 0.0001972548697

Evaluation

## Logistic Regression (sigmoid activation function)

In [6]:
import numpy as np

# Sample text data (replace with your own dataset)
texts = ["This is a positive sentence.", "Negative sentiment here.", "Another positive example.", "More negative text."]
labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.replace('.', '')  # Remove periods
    text = text.replace(',', '')  # Remove commas
    text = text.replace('!', '')  # Remove exclamation marks
    text = text.replace('?', '')  # Remove question marks
    text = text.replace('"', '')  # Remove double quotes
    text = text.replace("'", '')  # Remove single quotes
    text = text.replace('(', '')  # Remove opening parentheses
    text = text.replace(')', '')  # Remove closing parentheses
    text = text.split()  # Tokenize by whitespace
    return text

# Create a vocabulary of unique words
vocabulary = set()
for text in texts:
    words = preprocess_text(text)
    vocabulary.update(words)
vocabulary = list(vocabulary)

# Create a feature matrix (X) and target vector (y)
X = np.zeros((len(texts), len(vocabulary)))
y = np.array(labels)

# Convert text data into a binary bag-of-words representation
for i, text in enumerate(texts):
    words = preprocess_text(text)
    for j, word in enumerate(vocabulary):
        if word in words:
            X[i][j] = 1

# Initialize weights and bias
num_features = len(vocabulary)
weights = np.zeros(num_features)
bias = 0

# Sigmoid activation function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Train the logistic regression model using gradient descent
learning_rate = 0.01
num_epochs = 1000

for epoch in range(num_epochs):
    # Compute predictions
    predictions = sigmoid(np.dot(X, weights) + bias)
    
    # Compute gradients
    dw = (1 / len(texts)) * np.dot(X.T, (predictions - y))
    db = (1 / len(texts)) * np.sum(predictions - y)
    
    # Update weights and bias
    weights -= learning_rate * dw
    bias -= learning_rate * db

# Classify new text
def classify_text(text):
    words = preprocess_text(text)
    input_features = np.zeros(num_features)
    
    for j, word in enumerate(vocabulary):
        if word in words:
            input_features[j] = 1
    
    prediction = sigmoid(np.dot(input_features, weights) + bias)
    return prediction

# Test the classifier
new_text = "This is a test of the Logistic Regression classifier."
predicted_prob = classify_text(new_text)
if predicted_prob > 0.5:
    print("Positive sentiment")
else:
    print("Negative sentiment")

Positive sentiment


## Logistic Regression (softmax activation function)

In [5]:
import numpy as np

# Sample text data (replace with your own dataset)
texts = ["This is a positive sentence.", "Negative sentiment here.", "Another positive example.", "More negative text."]
labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.replace('.', '')  # Remove periods
    text = text.replace(',', '')  # Remove commas
    text = text.replace('!', '')  # Remove exclamation marks
    text = text.replace('?', '')  # Remove question marks
    text = text.replace('"', '')  # Remove double quotes
    text = text.replace("'", '')  # Remove single quotes
    text = text.replace('(', '')  # Remove opening parentheses
    text = text.replace(')', '')  # Remove closing parentheses
    text = text.split()  # Tokenize by whitespace
    return text

# Create a vocabulary of unique words
vocabulary = set()
for text in texts:
    words = preprocess_text(text)
    vocabulary.update(words)
vocabulary = list(vocabulary)

# Create a feature matrix (X) and target vector (y)
X = np.zeros((len(texts), len(vocabulary)))
y = np.array(labels)

# Convert text data into a binary bag-of-words representation
for i, text in enumerate(texts):
    words = preprocess_text(text)
    for j, word in enumerate(vocabulary):
        if word in words:
            X[i][j] = 1

# Initialize weights and bias
num_features = len(vocabulary)
weights = np.zeros(num_features)
bias = 0

# Softmax activation function
def softmax(z):
    exp_z = np.exp(z - np.max(z))  # Subtract max(z) to prevent overflow
    return exp_z / exp_z.sum(axis=0, keepdims=True)

# Define the cross-entropy loss
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15  # Small constant to avoid division by zero
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Clip predicted values to prevent log(0)
    return -np.sum(y_true * np.log(y_pred))

# Train the logistic regression model using gradient descent
learning_rate = 0.01
num_epochs = 1000

for epoch in range(num_epochs):
    # Compute predictions
    z = np.dot(X, weights) + bias
    predictions = softmax(z)
    
    # Compute gradients
    dw = (1 / len(texts)) * np.dot(X.T, (predictions - y))
    db = (1 / len(texts)) * np.sum(predictions - y)
    
    # Update weights and bias
    weights -= learning_rate * dw
    bias -= learning_rate * db

# Classify new text
def classify_text(text):
    words = preprocess_text(text)
    input_features = np.zeros(num_features)
    
    for j, word in enumerate(vocabulary):
        if word in words:
            input_features[j] = 1
    
    prediction = softmax(np.dot(input_features, weights) + bias)
    return prediction

# Test the classifier
new_text = "This is a test of the Logistic Regression classifier."
predicted_prob = classify_text(new_text)
if predicted_prob > 0.5:
    print("Positive sentiment")
else:
    print("Negative sentiment")


Positive sentiment


## Logistic Regression (using scikit-learn)

In [15]:
#https://bookdown.org/f_lennert/book-toolbox_css/text-mining.html#tf-idf

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

In [14]:
df = pd.read_csv("imdb_reviews.csv")

ParserError: Error tokenizing data. C error: Expected 5 fields in line 18, saw 7


## TF-IDF Vectorisation

## word2vec