## N-gram Models (without using any libraries)

In [46]:
class Ngram:
    def __init__(self, text, n=None):
        self.text = text
        self.n = n
        self.split_text = self.process()

    def process(self):
        text = self.text.lower()
        text = text.replace('.', ' ')
        text = text.replace(',', ' ')
        split_text = text.split()
        return split_text

    def ngram_context(self):
        ngram_list, context_list  = [], []
        vocab = set()
        n = self.n
        for i in range(0, len(self.split_text) - n + 1):
            ngram = self.split_text[i:i + n]
            context = self.split_text[i:i + n - 1]
            context_list.append(context)
            ngram_list.append(ngram)
        for i in self.split_text:
             vocab.update([i])
        vocab = list(vocab)
            
        return ngram_list, context_list, vocab

    def counts(self, ingram):
        ngramcount = 0
        contextcount = 0
        ngram_list = self.ngram_context()[0]
        context_list = self.ngram_context()[1]

        for i in ngram_list:
            if i == ingram:
                ngramcount += 1

        for i in context_list:
            if i == ingram[:-1]:
                contextcount += 1

        return ngramcount, contextcount

    def probability(self, ingram): #calculates the normal proability of the ngram
        ngram_list, context_list,vocab = self.ngram_context()
        ncount, ccount = self.counts(ingram)

        # Check for division by zero
        if ccount == 0:
            return 0.0

        prob = ncount / ccount
        return prob
    
    def probability_laplace(self, ingram): #calculates the laplace proability of the ngram
        ngram_list, context_list, vocab = self.ngram_context()
        ncount, ccount = self.counts(ingram)
        
        lprob = (ncount +1)/ (ccount + len(vocab))
        return lprob
    
    def probability_addk(self, ingram, k): #calculates the add-k proability of the ngram
        ngram_list, context_list, vocab = self.ngram_context()
        ncount, ccount = self.counts(ingram)
        
        kprob = (ncount + k)/ (ccount + k*len(vocab))
        return kprob
        

    def perplexity(self): #calculates perplexity using normal probability
        ngram_list = self.ngram_context()
        prob = 1
        for i in ngram_list:
            prob_i = self.probability(i)

            # Check if prob_i is 0.0, and if so, assign a small positive value (e.g., 1e-10)
            if prob_i == 0.0:
                prob_i = 1e-10

            prob = prob * prob_i

        # Check if prob is still 0.0 after the loop
        if prob == 0.0:
            return float('inf')  # Return infinity to indicate undefined perplexity
        else:
            perp = prob ** (-1 / self.n)
            return perp

In [47]:
text = "This is a test. It has some punctuation, this like commas and  this periods."
ngram = Ngram(text, n=2)
ngram_list, context_list, vocab = ngram.ngram_context()
print(ngram_list)
print(context_list)
print(vocab)
ngramcount  = ngram.counts([ 'this', 'is'])[0]
contextcount = ngram.counts([ 'this', 'is'])[1]
print(ngramcount) 
print(contextcount)
prob = ngram.probability(['this', 'is'])
lprob = ngram.probability_laplace(['this', 'is'])
kprob = ngram.probability_addk(['this', 'is'], 0.5)
print(prob)
print(lprob)
print(kprob)

[['this', 'is'], ['is', 'a'], ['a', 'test'], ['test', 'it'], ['it', 'has'], ['has', 'some'], ['some', 'punctuation'], ['punctuation', 'this'], ['this', 'like'], ['like', 'commas'], ['commas', 'and'], ['and', 'this'], ['this', 'periods']]
[['this'], ['is'], ['a'], ['test'], ['it'], ['has'], ['some'], ['punctuation'], ['this'], ['like'], ['commas'], ['and'], ['this']]
['some', 'like', 'commas', 'a', 'it', 'periods', 'is', 'test', 'has', 'punctuation', 'and', 'this']
1
3
0.3333333333333333
0.13333333333333333
0.16666666666666666


## Naive-Bayes (with laplace smoothing)

In [51]:
import math

# Sample text data (replace with your own dataset)
texts = ["Very Powerful", "the most fun film of the summer", "no surprises and very few laughs", "entirely predictable and lacks energy","just plain boring"]
print(texts)
labels = [1, 1, 0, 0,0]  # 1 for positive, 0 for negative

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.replace('.', '')  # Remove periods
    text = text.replace(',', '')  # Remove commas
    text = text.replace('!', '')  # Remove exclamation marks
    text = text.replace('?', '')  # Remove question marks
    text = text.replace('"', '')  # Remove double quotes
    text = text.replace("'", '')  # Remove single quotes
    text = text.replace('(', '')  # Remove opening parentheses
    text = text.replace(')', '')  # Remove closing parentheses
    text = text.split()  # Tokenize by whitespace
    return text

# Calculate the prior probabilities
total_samples = len(texts)
positive_samples = sum(labels)
negative_samples = total_samples - positive_samples
prior_positive = positive_samples / total_samples
prior_negative = negative_samples / total_samples
print(f"Prior positive: {prior_positive}")
print(f"Prior negative: {prior_negative}")

# Create a vocabulary of unique words
vocabulary = set()
for text in texts:
    words = preprocess_text(text)
    vocabulary.update(words)
vocabulary = list(vocabulary)

print(vocabulary)

# Calculate word frequencies in positive and negative classes
positive_word_counts = {word: 0 for word in vocabulary}
negative_word_counts = {word: 0 for word in vocabulary}
print(f"positive_word_counts is {positive_word_counts}")
print(f"negative_word_counts is {negative_word_counts}")

for i, text in enumerate(texts): #enumerate generates index and value
    words = preprocess_text(text)
    for word in words:
        if labels[i] == 1:
            positive_word_counts[word] += 1
        else:
            negative_word_counts[word] += 1

# Calculate conditional probabilities (likelihoods)
smooth_factor = 1  # Laplace smoothing to avoid zero probabilities
positive_likelihoods = {}
negative_likelihoods = {}

for word in vocabulary:
    positive_likelihoods[word] = (positive_word_counts[word] + smooth_factor) / (positive_samples + smooth_factor * len(vocabulary))
    negative_likelihoods[word] = (negative_word_counts[word] + smooth_factor) / (negative_samples + smooth_factor * len(vocabulary))

# Classify new text
def classify_text(text):
    words = preprocess_text(text)
    prob_positive = prior_positive
    prob_negative = prior_negative
    
    for word in words:
        if word in vocabulary:
            prob_positive *= positive_likelihoods[word]
            prob_negative *= negative_likelihoods[word]
    
    if prob_positive > prob_negative:
        return 1,prob_negative,prob_positive  # Positive class
    else:
        return 0,prob_negative,prob_positive  # Negative class

# Test the classifier
new_text = "predictable with no fun"
predicted_label,prob_negative,prob_positive = classify_text(new_text)
print(f"The positve probability is {prob_positive}")
print(f"The negative probability is {prob_negative}")
if predicted_label == 1:
    print("Positive sentiment")
else:
    print("Negative sentiment")


['Very Powerful', 'the most fun film of the summer', 'no surprises and very few laughs', 'entirely predictable and lacks energy', 'just plain boring']
Prior positive: 0.4
Prior negative: 0.6
['summer', 'very', 'laughs', 'most', 'the', 'plain', 'few', 'powerful', 'lacks', 'boring', 'and', 'no', 'energy', 'predictable', 'film', 'surprises', 'entirely', 'fun', 'just', 'of']
positive_word_counts is {'summer': 0, 'very': 0, 'laughs': 0, 'most': 0, 'the': 0, 'plain': 0, 'few': 0, 'powerful': 0, 'lacks': 0, 'boring': 0, 'and': 0, 'no': 0, 'energy': 0, 'predictable': 0, 'film': 0, 'surprises': 0, 'entirely': 0, 'fun': 0, 'just': 0, 'of': 0}
negative_word_counts is {'summer': 0, 'very': 0, 'laughs': 0, 'most': 0, 'the': 0, 'plain': 0, 'few': 0, 'powerful': 0, 'lacks': 0, 'boring': 0, 'and': 0, 'no': 0, 'energy': 0, 'predictable': 0, 'film': 0, 'surprises': 0, 'entirely': 0, 'fun': 0, 'just': 0, 'of': 0}
The positve probability is 7.513148009015778e-05
The negative probability is 0.0001972548697

## Logistic Regression (sigmoid activation function, gradient descent)

In [55]:
import numpy as np

# Sample text data (replace with your own dataset)
texts = ["This is a positive sentence.", "Negative sentiment here.", "Another positive example.", "More negative text."]
labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.replace('.', '')  # Remove periods
    text = text.replace(',', '')  # Remove commas
    text = text.replace('!', '')  # Remove exclamation marks
    text = text.replace('?', '')  # Remove question marks
    text = text.replace('"', '')  # Remove double quotes
    text = text.replace("'", '')  # Remove single quotes
    text = text.replace('(', '')  # Remove opening parentheses
    text = text.replace(')', '')  # Remove closing parentheses
    text = text.split()  # Tokenize by whitespace
    return text

# Create a vocabulary of unique words
vocabulary = set()
for text in texts:
    words = preprocess_text(text)
    vocabulary.update(words)
vocabulary = list(vocabulary)
print(vocabulary)

# Create a feature matrix (X) and target vector (y)
X = np.zeros((len(texts), len(vocabulary))) #text is number of rows and vocabulary is number of columns
y = np.array(labels)

print(X)
print(y)

# Convert text data into a binary bag-of-words representation
for i, text in enumerate(texts): #gives index and value
    words = preprocess_text(text)
    for j, word in enumerate(vocabulary):
        if word in words:
            X[i][j] = 1

print(X)

# Initialize weights and bias
num_features = len(vocabulary) #we create number of features equal to length of vocabulary for simplicity
weights = np.zeros(num_features)
bias = 0

# Sigmoid activation function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Train the logistic regression model using gradient descent
learning_rate = 0.01 #The learning rate determines how much weights and bias are updated during each iteration of gradient descent.
num_epochs = 1000 #The number of epochs determines how many times the model will loop through the entire training dataset.

for epoch in range(num_epochs):
    # Compute predictions
    predictions = sigmoid(np.dot(X, weights) + bias) #sigma(w.x+b)
    
    # Compute gradients
    dw = (1 / len(texts)) * np.dot(X.T, (predictions - y))
    db = (1 / len(texts)) * np.sum(predictions - y)
    
    # Update weights and bias
    weights -= learning_rate * dw
    bias -= learning_rate * db

# Classify new text
def classify_text(text):
    words = preprocess_text(text)
    input_features = np.zeros(num_features)
    
    for j, word in enumerate(vocabulary):
        if word in words:
            input_features[j] = 1
    
    prediction = sigmoid(np.dot(input_features, weights) + bias)
    return prediction

# Test the classifier
new_text = "This is a test of the Logistic Regression classifier."
predicted_prob = classify_text(new_text)
if predicted_prob > 0.5:
    print("Positive sentiment")
else:
    print("Negative sentiment")

['text', 'a', 'sentiment', 'example', 'positive', 'negative', 'here', 'another', 'is', 'sentence', 'more', 'this']
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[1 0 1 0]
[[0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1.]
 [0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.]]
Positive sentiment


## Logistic Regression (softmax activation function)

In [56]:
import numpy as np

# Sample text data (replace with your own dataset)
texts = ["This is a positive sentence.", "Negative sentiment here.", "Another positive example.", "More negative text."]
labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.replace('.', '')  # Remove periods
    text = text.replace(',', '')  # Remove commas
    text = text.replace('!', '')  # Remove exclamation marks
    text = text.replace('?', '')  # Remove question marks
    text = text.replace('"', '')  # Remove double quotes
    text = text.replace("'", '')  # Remove single quotes
    text = text.replace('(', '')  # Remove opening parentheses
    text = text.replace(')', '')  # Remove closing parentheses
    text = text.split()  # Tokenize by whitespace
    return text

# Create a vocabulary of unique words
vocabulary = set()
for text in texts:
    words = preprocess_text(text)
    vocabulary.update(words)
vocabulary = list(vocabulary)

# Create a feature matrix (X) and target vector (y)
X = np.zeros((len(texts), len(vocabulary)))
y = np.array(labels)

# Convert text data into a binary bag-of-words representation
for i, text in enumerate(texts):
    words = preprocess_text(text)
    for j, word in enumerate(vocabulary):
        if word in words:
            X[i][j] = 1

# Initialize weights and bias
num_features = len(vocabulary)
weights = np.zeros(num_features)
bias = 0

# Softmax activation function
def softmax(z):
    exp_z = np.exp(z) 
    return exp_z / exp_z.sum(axis=0, keepdims=True)

# Define the cross-entropy loss
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15  # Small constant to avoid division by zero
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Clip predicted values to prevent log(0)
    return -np.sum(y_true * np.log(y_pred))

# Train the logistic regression model using gradient descent
learning_rate = 0.01
num_epochs = 1000

for epoch in range(num_epochs):
    # Compute predictions
    z = np.dot(X, weights) + bias
    predictions = softmax(z)
    
    # Compute gradients
    dw = (1 / len(texts)) * np.dot(X.T, (predictions - y))
    db = (1 / len(texts)) * np.sum(predictions - y)
    
    # Update weights and bias
    weights -= learning_rate * dw
    bias -= learning_rate * db

# Classify new text
def classify_text(text):
    words = preprocess_text(text)
    input_features = np.zeros(num_features)
    
    for j, word in enumerate(vocabulary):
        if word in words:
            input_features[j] = 1
    
    prediction = softmax(np.dot(input_features, weights) + bias)
    return prediction

# Test the classifier
new_text = "This is a test of the Logistic Regression classifier."
predicted_prob = classify_text(new_text)
if predicted_prob > 0.5:
    print("Positive sentiment")
else:
    print("Negative sentiment")


Positive sentiment


## TF-IDF Vectorization
the tf-idf
 model, an important baseline, the meaning of a word is defifined by a simple function
 of the counts of nearby words

In [1]:
text_1 = [
    "Natural language processing has its roots in the 1950s.",
    "Already in 1950, Alan Turing published an article titled 'Computing Machinery and Intelligence'",
    "It proposed what is now called the Turing test as a criterion of intelligence",
    "though at the time that was not articulated as a problem separate from artificial intelligence."
    "The proposed test includes a task that involves the automated interpretation and generation of natural language."
    ]

In [2]:
def vocab(corpus):
    vocab = []
    for sentence in corpus:
        for word in sentence.split(' '):
            if len(word)>1 and word not in vocab:
                vocab.append(word)

    word_dimension = {j:i for i,j in enumerate(vocab)}
    return word_dimension

In [3]:
word_dimension = vocab(text_1)

In [4]:
word_dimension

{'Natural': 0,
 'language': 1,
 'processing': 2,
 'has': 3,
 'its': 4,
 'roots': 5,
 'in': 6,
 'the': 7,
 '1950s.': 8,
 'Already': 9,
 '1950,': 10,
 'Alan': 11,
 'Turing': 12,
 'published': 13,
 'an': 14,
 'article': 15,
 'titled': 16,
 "'Computing": 17,
 'Machinery': 18,
 'and': 19,
 "Intelligence'": 20,
 'It': 21,
 'proposed': 22,
 'what': 23,
 'is': 24,
 'now': 25,
 'called': 26,
 'test': 27,
 'as': 28,
 'criterion': 29,
 'of': 30,
 'intelligence': 31,
 'though': 32,
 'at': 33,
 'time': 34,
 'that': 35,
 'was': 36,
 'not': 37,
 'articulated': 38,
 'problem': 39,
 'separate': 40,
 'from': 41,
 'artificial': 42,
 'intelligence.The': 43,
 'includes': 44,
 'task': 45,
 'involves': 46,
 'automated': 47,
 'interpretation': 48,
 'generation': 49,
 'natural': 50,
 'language.': 51}

In [5]:
def word_count(corpus, word):
    count = 0
    for sentence in corpus:
        if word in sentence:
            count = count+1
    return count

In [10]:
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
import numpy as np

def transform(corpus, word_dimension):
    # Initialize lists to store document indices, column indices, and TF-IDF values
    documents = []
    columns = []
    tf_idf_values = []

    # Iterate over each document in the corpus
    for index, document in enumerate(corpus):
        # Calculate word frequencies in the current document
        word_frequency = Counter(document.split())

        # Iterate over unique words and their frequencies in the document
        for word, freq in word_frequency.items():
            # Skip short words
            if len(word) < 2:
                continue

            # Get the index of the word from word_dimension, default to -1 if not found
            dimension_index = word_dimension.get(word, -1) #how many times the word is in a document

            # If the word is in word_dimension, calculate its TF-IDF and store information
            if dimension_index != -1:
                documents.append(index)  # Document index
                columns.append(dimension_index)  # Word index in vocabulary

                # Calculate TF-IDF using the formula
                tf_idf_value = (freq / len(document.split())) * (1 + (np.log((1 + len(corpus)) / (1 + word_dimension[word]))))

                tf_idf_values.append(tf_idf_value)

    # Create a sparse matrix using CSR format
    # number of rows is equal to the number of documents in the corpus, and the number of columns is equal to the number of unique words in the vocabulary defined by the word_dimension dictionary.
    sparse_matrix = csr_matrix((tf_idf_values, (documents, columns)), shape=(len(corpus), len(word_dimension)))

    # Normalize the TF-IDF matrix
    final_normalized = normalize(sparse_matrix)
    print(word_frequency)
    print(documents)
    print(columns)

    return final_normalized


In [11]:
vals = transform(text_1, word_dimension)

Counter({'the': 2, 'that': 2, 'a': 2, 'though': 1, 'at': 1, 'time': 1, 'was': 1, 'not': 1, 'articulated': 1, 'as': 1, 'problem': 1, 'separate': 1, 'from': 1, 'artificial': 1, 'intelligence.The': 1, 'proposed': 1, 'test': 1, 'includes': 1, 'task': 1, 'involves': 1, 'automated': 1, 'interpretation': 1, 'and': 1, 'generation': 1, 'of': 1, 'natural': 1, 'language.': 1})
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 7, 12, 27, 28, 29, 30, 31, 32, 33, 7, 34, 35, 36, 37, 38, 28, 39, 40, 41, 42, 43, 22, 27, 44, 45, 46, 47, 48, 19, 49, 30, 50, 51]


In [22]:
print(vals)

  (0, 0)	0.6362648423548146
  (0, 1)	0.4672532788046143
  (0, 2)	0.3683878519394966
  (0, 3)	0.29824171525441406
  (0, 4)	0.24383214458676375
  (0, 5)	0.19937628838929639
  (0, 6)	0.16178939753750837
  (0, 7)	0.12923015170421384
  (0, 8)	0.10051086152417864
  (1, 6)	0.6023037709341442
  (1, 9)	0.27853937765828185
  (1, 10)	0.1920235091854428
  (1, 11)	0.1130407094180897
  (1, 12)	0.040383576848297284
  (1, 13)	-0.026886420803614346
  (1, 14)	-0.08951329032986134
  (1, 15)	-0.14809681433152597
  (1, 16)	-0.20312757618775612
  (1, 17)	-0.2550119585700533
  (1, 18)	-0.3040903740521614
  (1, 19)	-0.35065081407947685
  (1, 20)	-0.39493908879175754
  (2, 7)	0.22568480161358573
  (2, 12)	0.018944263125366588
  (2, 21)	-0.20507843312209903
  :	:
  (3, 19)	-0.06815960845379375
  (3, 22)	-0.09281986816200104
  (3, 27)	-0.12752836505999324
  (3, 28)	-0.13372004389146525
  (3, 30)	-0.14548738618426157
  (3, 32)	-0.1565187743719772
  (3, 33)	-0.16178617271962303
  (3, 34)	-0.16690087115216784
  (3,

## Logistic Regression (using scikit-learn, needs TF-IDF vecotrization definitions)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

In [66]:
file_path = 'imdb_reviews.csv'
df = pd.read_csv(file_path, encoding='utf-8', delimiter='\t', quotechar="'", escapechar='\\', header=None, names=['review', 'label'])

In [67]:
df1 = df[:500]
df1.head()

Unnamed: 0,review,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [69]:
def vocab1(corpus):
    vocab = []
    for sentence in corpus:
        for word in sentence.split(' '):
            if len(word)>1 and word not in vocab:
                vocab.append(word)

    word_dimension = {j:i for i,j in enumerate(vocab)}
    return word_dimension

In [70]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df1['review'])
word_dimesn = vocab1(df1['review'])
X_ = transform(df1['review'], word_dimesn)

NameError: name 'transform' is not defined

## word2vec