In [39]:
import pandas as pd
import math

# Sentences
Sentence1 = "The cat sat on the mat."
Sentence2 = "The dog barked at the cat."
Sentence3 = "The bird flew over the cat."
Sentence4 = "The dog chased the bird."
Sentence5 = "The mat is on the floor."

# Convert sentences to lowercase and split into words
def tokenize(sentence):
    sentence = sentence.replace(".", "")
    return sentence.lower().split()

# Tokenize all sentences
words_1, words_2, words_3, words_4, words_5 = tokenize(Sentence1), tokenize(Sentence2), tokenize(Sentence3), tokenize(Sentence4), tokenize(Sentence5)

# Vocabulary (unique words across all sentences)
vocabulary = list(set(words_1 + words_2 + words_3 + words_4 + words_5))

# Initialize a dictionary to hold term frequencies (TF) for all sentences
tf_data = {'Word': vocabulary}

# Calculate TF for each sentence
def calculate_tf(sentence_words, vocab):
    tf = []
    sentence_len = len(sentence_words)
    for word in vocab:
        tf.append(f"{sentence_words.count(word)}/{sentence_len}")  # Format as 'count/total'
    return tf

# Calculate raw TF for numerical TF*IDF calculation
def calculate_raw_tf(sentence_words, vocab):
    tf = []
    sentence_len = len(sentence_words)
    for word in vocab:
        tf.append(sentence_words.count(word) / sentence_len)
    return tf

# Calculate TF for all sentences
tf_data['TF 1'] = calculate_tf(words_1, vocabulary)
tf_data['TF 2'] = calculate_tf(words_2, vocabulary)
tf_data['TF 3'] = calculate_tf(words_3, vocabulary)
tf_data['TF 4'] = calculate_tf(words_4, vocabulary)
tf_data['TF 5'] = calculate_tf(words_5, vocabulary)

# Calculate raw TF for TF-IDF
raw_tf_1 = calculate_raw_tf(words_1, vocabulary)
raw_tf_2 = calculate_raw_tf(words_2, vocabulary)
raw_tf_3 = calculate_raw_tf(words_3, vocabulary)
raw_tf_4 = calculate_raw_tf(words_4, vocabulary)
raw_tf_5 = calculate_raw_tf(words_5, vocabulary)

# Calculate Document Frequency (DF)
def calculate_df(vocab, sentences):
    df = []
    num_sentences = len(sentences)
    for word in vocab:
        count = sum([1 for sentence in sentences if word in sentence])
        df.append(count)
    return df

# Calculate Inverse Document Frequency (IDF) with log transformation
def calculate_idf(df, num_docs):
    return [f"log({num_docs}/{count}) = {0 if count == 0 else round(math.log10(num_docs / count), 3)}" for count in df]

# Calculate raw IDF for numerical TF*IDF calculation
def calculate_raw_idf(df, num_docs):
    return [0 if count == 0 else math.log10(num_docs / count) for count in df]

# Calculate DF and IDF
sentences = [words_1, words_2, words_3, words_4, words_5]
df = calculate_df(vocabulary, sentences)
idf = calculate_idf(df, len(sentences))
raw_idf = calculate_raw_idf(df, len(sentences))

# Add IDF to the dataframe
tf_data['IDF'] = idf

# Calculate TF * IDF for all sentences
tf_data['TF*IDF 1'] = [round(a * i, 3) for a, i in zip(raw_tf_1, raw_idf)]
tf_data['TF*IDF 2'] = [round(b * i, 3) for b, i in zip(raw_tf_2, raw_idf)]
tf_data['TF*IDF 3'] = [round(c * i, 3) for c, i in zip(raw_tf_3, raw_idf)]
tf_data['TF*IDF 4'] = [round(d * i, 3) for d, i in zip(raw_tf_4, raw_idf)]
tf_data['TF*IDF 5'] = [round(e * i, 3) for e, i in zip(raw_tf_5, raw_idf)]

# Convert to DataFrame
df = pd.DataFrame(tf_data)

df

Unnamed: 0,Word,TF 1,TF 2,TF 3,TF 4,TF 5,IDF,TF*IDF 1,TF*IDF 2,TF*IDF 3,TF*IDF 4,TF*IDF 5
0,the,2/6,2/6,2/6,2/5,2/6,log(5/5) = 0.0,0.0,0.0,0.0,0.0,0.0
1,mat,1/6,0/6,0/6,0/5,1/6,log(5/2) = 0.398,0.066,0.0,0.0,0.0,0.066
2,cat,1/6,1/6,1/6,0/5,0/6,log(5/3) = 0.222,0.037,0.037,0.037,0.0,0.0
3,is,0/6,0/6,0/6,0/5,1/6,log(5/1) = 0.699,0.0,0.0,0.0,0.0,0.116
4,floor,0/6,0/6,0/6,0/5,1/6,log(5/1) = 0.699,0.0,0.0,0.0,0.0,0.116
5,on,1/6,0/6,0/6,0/5,1/6,log(5/2) = 0.398,0.066,0.0,0.0,0.0,0.066
6,chased,0/6,0/6,0/6,1/5,0/6,log(5/1) = 0.699,0.0,0.0,0.0,0.14,0.0
7,flew,0/6,0/6,1/6,0/5,0/6,log(5/1) = 0.699,0.0,0.0,0.116,0.0,0.0
8,over,0/6,0/6,1/6,0/5,0/6,log(5/1) = 0.699,0.0,0.0,0.116,0.0,0.0
9,sat,1/6,0/6,0/6,0/5,0/6,log(5/1) = 0.699,0.116,0.0,0.0,0.0,0.0


In [45]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sentences
Sentence1 = 'The sun rises in the east.'
Sentence2 = 'The sun sets in the west.'
Sentence3 = 'The earth revolves around the sun.'
Sentence4 = 'The moon revolves around the earth.'
Sentence5 = 'The stars are visible at night.'

# Convert sentences to lowercase and split into words (remove punctuation as well)
def tokenize(sentence):
    sentence = sentence.replace(".", "")  # Remove period
    return sentence.lower().split()

# Tokenize all sentences
sentences = [Sentence1, Sentence2, Sentence3, Sentence4, Sentence5]
tokenized_sentences = [tokenize(sentence) for sentence in sentences]
print('tokenized_sentences', tokenized_sentences)

# Create the vocabulary (unique words across all sentences)
vocabulary = sorted(list(set([word for sentence in tokenized_sentences for word in sentence])))
print('vocabulary', vocabulary)

# Create the Bag of Words (BoW) vector for each sentence
def create_bow_vector(sentence_words, vocab):
    vector = [sentence_words.count(word) for word in vocab]
    return vector

# Create the BoW vectors for all sentences
bow_vectors = [create_bow_vector(sentence, vocabulary) for sentence in tokenized_sentences]

# Convert BoW vectors to a DataFrame for a cleaner view
df_bow = pd.DataFrame(bow_vectors, columns=vocabulary, index=[f'Sentence {i+1}' for i in range(len(sentences))])

# Display the Bag of Words vectors
print("Part 1: Bag of Words Vectors")
df_bow


Part 1: Bag of Words Vectors


Unnamed: 0,are,around,at,earth,east,in,moon,night,revolves,rises,sets,stars,sun,the,visible,west
Sentence 1,0,0,0,0,1,1,0,0,0,1,0,0,1,2,0,0
Sentence 2,0,0,0,0,0,1,0,0,0,0,1,0,1,2,0,1
Sentence 3,0,1,0,1,0,0,0,0,1,0,0,0,1,2,0,0
Sentence 4,0,1,0,1,0,0,1,0,1,0,0,0,0,2,0,0
Sentence 5,1,0,1,0,0,0,0,1,0,0,0,1,0,1,1,0


In [46]:
# Function to calculate cosine similarity
def cosine_sim(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Calculate cosine similarities
cos_sim_1_2 = cosine_sim(bow_vectors[0], bow_vectors[1])  # Sentence 1 and Sentence 2
cos_sim_1_5 = cosine_sim(bow_vectors[0], bow_vectors[4])  # Sentence 1 and Sentence 5
cos_sim_3_4 = cosine_sim(bow_vectors[2], bow_vectors[3])  # Sentence 3 and Sentence 4

# Display the results
print("\nPart 2: Cosine Similarity")
print(f"Cosine Similarity between Sentence 1 and Sentence 2: {cos_sim_1_2:.3f}")
print(f"Cosine Similarity between Sentence 1 and Sentence 5: {cos_sim_1_5:.3f}")
print(f"Cosine Similarity between Sentence 3 and Sentence 4: {cos_sim_3_4:.3f}")



Part 2: Cosine Similarity
Cosine Similarity between Sentence 1 and Sentence 2: 0.750
Cosine Similarity between Sentence 1 and Sentence 5: 0.289
Cosine Similarity between Sentence 3 and Sentence 4: 0.875


Logistic Regression with handcrafted features  

In [4]:
text_passage="""What truly sets this book apart is the depth of emotion it evokes. I laughed, I cried, and I
felt my heart race with anticipation during the most gripping moments. The themes
explored in this story are both timely and timeless, touching on the complexities of
human nature, the power of friendship, and the triumph of hope in the face of adversity. I
cannot recommend this book enough. It is a masterpiece of modern literature that
deserves a place on every bookshelf. Whether you're a seasoned reader or just looking
for a captivating story to dive into, this book will not disappoint. Prepare to be
transported on an unforgettable journey that will stay with you for a lifetime."""

# Each training observation would be represented by the 5 crafted features shown in the following table.

# Features

# 1. log of Word count

# 2. number of punctuations (period, comma, apostrophe, quotation, question, exclamation, colon,
# etc.)

# 3. number of positive words

# 4. number of negative words

# 5. ratio of capitalized words (words starting with capital letter) to total words

# Dictionary for positive and negative words is given below:
import re
import math
import string

# Dictionary for positive and negative words
Positivedictionary = ['Good', 'unforgettable', 'masterpiece', 'depth', 'laughed', 'timeless', 'captivating', 'happy', 'triumph', 'friendship', 'modern', 'lifetime', 'gripping', 'enjoy', 'proud']
Negativedictionary = ['Not', 'cannot', 'disappoint', 'sad', 'cried', 'hopeless', 'adversity', 'waste', 'weird', 'complexities', 'anger', 'seasoned', 'anticipation', 'bad', 'rude']

# Split text while keeping punctuation
tokens = re.findall(r'\b\w+\b|[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', text_passage)
print(string.punctuation)
# Calculate features
x1 = math.log(len(tokens))
x2 = sum(text_passage.count(p) for p in string.punctuation)
x3 = sum([text_passage.count(word) for word in Positivedictionary])
x4 = sum([text_passage.count(word) for word in Negativedictionary])
x5 = sum([1 for word in tokens if word[0].isupper()]) / len(tokens)

# Print tokens to verify
print(tokens,'\n',len(tokens))

# Print feature values
print(f"x1 (log of Word count): {x1}")
print(f"x2 (number of punctuations): {x2}")
print(f"x3 (number of positive words): {x3}")
print(f"x4 (number of negative words): {x4}")
print(f"x5 (ratio of capitalized words to total words): {x5}")
# Your task is to update the weights once, using a stochastic gradient descent algorithm. Assume all initial
# weights set to 0.2 and learning rate Î±=0.5. Also compute the Loss function: binary cross entropy loss.

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['What', 'truly', 'sets', 'this', 'book', 'apart', 'is', 'the', 'depth', 'of', 'emotion', 'it', 'evokes', '.', 'I', 'laughed', ',', 'I', 'cried', ',', 'and', 'I', 'felt', 'my', 'heart', 'race', 'with', 'anticipation', 'during', 'the', 'most', 'gripping', 'moments', '.', 'The', 'themes', 'explored', 'in', 'this', 'story', 'are', 'both', 'timely', 'and', 'timeless', ',', 'touching', 'on', 'the', 'complexities', 'of', 'human', 'nature', ',', 'the', 'power', 'of', 'friendship', ',', 'and', 'the', 'triumph', 'of', 'hope', 'in', 'the', 'face', 'of', 'adversity', '.', 'I', 'cannot', 'recommend', 'this', 'book', 'enough', '.', 'It', 'is', 'a', 'masterpiece', 'of', 'modern', 'literature', 'that', 'deserves', 'a', 'place', 'on', 'every', 'bookshelf', '.', 'Whether', 'you', "'", 're', 'a', 'seasoned', 'reader', 'or', 'just', 'looking', 'for', 'a', 'captivating', 'story', 'to', 'dive', 'into', ',', 'this', 'book', 'will', 'not', 'disappoint', '.', 'Prepare', 'to', 

In [8]:
# Extract capitalized words
capitalized_words = [word for word in tokens if word[0].isupper()]

# Count the capitalized words
capitalized_count = len(capitalized_words)

# Display the list and count
print("Capitalized Words:", capitalized_words)
print("Count of Capitalized Words:", capitalized_count)

Capitalized Words: ['What', 'I', 'I', 'I', 'The', 'I', 'It', 'Whether', 'Prepare']
Count of Capitalized Words: 9


In [5]:
import numpy as np
w= np.array([0.2 for i in range(5)])
alpha=0.5
w

array([0.2, 0.2, 0.2, 0.2, 0.2])

In [6]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def binary_cross_entropy_loss(y_pred, y_true):
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# Compute the predicted value
y_pred_true = sigmoid(np.dot(w, np.array([x1, x2, x3, x4, x5])))

# weight update
w = w - alpha * (y_pred_true - 1) * np.array([x1, x2, x3, x4, x5])

loss_slide = 1 - sigmoid(np.dot(w, np.array([x1, x2, x3, x4, x5])))
loss = binary_cross_entropy_loss(y_pred_true, 1)

y_pred_true, loss_slide, loss, w

(0.9993839730694641,
 0.0005463499798894489,
 0.0006162167530866405,
 array([0.20150629, 0.20431219, 0.20338815, 0.20215609, 0.20002084]))

In [7]:
# 