# CountVectorizer, TfidfVectorizer

In [3]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever pip is first on PATH.
%pip install scikit-learn



In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd


# Using CountVectorizer

In [65]:
# Toy corpus: every string in the list is treated as one document.
corpus = [
    "This is the first document",
    "ArithmeticError is a built-in exception in Python",
    "This document is the second document",
    "And this is the third one",
    "Is this the first document?",
]

In [None]:
# Instantiate the vectorizer. Without the call parentheses `vectorizer` would
# be the CountVectorizer class itself, and `vectorizer.fit_transform(corpus)`
# would fail because no instance exists to fit.
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the corpus and build the document-term count matrix.
x = vectorizer.fit_transform(corpus)

# Show the learned terms, then the counts as a dense array
# (one row per document, one column per term).
count_terms = vectorizer.get_feature_names_out()
print(count_terms)
print(x.toarray())

['and' 'arithmeticerror' 'built' 'document' 'exception' 'first' 'in' 'is'
 'one' 'python' 'second' 'the' 'third' 'this']
[[0 0 0 1 0 1 0 1 0 0 0 1 0 1]
 [0 1 1 0 1 0 2 1 0 1 0 0 0 0]
 [0 0 0 2 0 0 0 1 0 0 1 1 0 1]
 [1 0 0 0 0 0 0 1 1 0 0 1 1 1]
 [0 0 0 1 0 1 0 1 0 0 0 1 0 1]]


In [17]:
# (documents, terms) shape of the count matrix; the sparse matrix reports the
# same shape as its dense form, so no conversion is needed here.
print(x.shape)
# Vocabulary size: should equal the number of columns printed above.
print(len(vectorizer.get_feature_names_out()))

(5, 14)
14


In [15]:
# Label each count column with its vocabulary term for easier reading.
count_columns = vectorizer.get_feature_names_out()
df = pd.DataFrame(data=x.toarray(), columns=count_columns)
print(df)

   and  arithmeticerror  built  document  exception  first  in  is  one  \
0    0                0      0         1          0      1   0   1    0   
1    0                1      1         0          1      0   2   1    0   
2    0                0      0         2          0      0   0   1    0   
3    1                0      0         0          0      0   0   1    1   
4    0                0      0         1          0      1   0   1    0   

   python  second  the  third  this  
0       0       0    1      0     1  
1       1       0    0      0     0  
2       0       1    1      0     1  
3       0       0    1      1     1  
4       0       0    1      0     1  


# Using TfidfVectorizer

In [66]:
# TF-IDF vectorizer created with its default settings. Note the name is
# `vectorize` (no trailing "r"), distinct from the CountVectorizer instance
# `vectorizer` used in the previous section.
vectorize = TfidfVectorizer()

In [67]:
# Learn the vocabulary from the corpus and compute the TF-IDF weights.
y = vectorize.fit_transform(corpus)

# Show the learned terms, then the sparse matrix; its printed form lists
# (row, column) coordinates followed by the stored weight.
tfidf_terms = vectorize.get_feature_names_out()
print(tfidf_terms)
print(y)

['and' 'arithmeticerror' 'built' 'document' 'exception' 'first' 'in' 'is'
 'one' 'python' 'second' 'the' 'third' 'this']
  (0, 13)	0.4022840626476948
  (0, 7)	0.3402492835693104
  (0, 11)	0.4022840626476948
  (0, 5)	0.5760921151629192
  (0, 3)	0.4782084961154663
  (1, 7)	0.1661290286861683
  (1, 1)	0.34864042110528976
  (1, 2)	0.34864042110528976
  (1, 6)	0.6972808422105795
  (1, 4)	0.34864042110528976
  (1, 9)	0.34864042110528976
  (2, 13)	0.2946493799843241
  (2, 7)	0.24921255836974746
  (2, 11)	0.2946493799843241
  (2, 3)	0.7005191105820987
  (2, 10)	0.5230005374851706
  (3, 13)	0.2866852129249319
  (3, 7)	0.24247651688117958
  (3, 11)	0.2866852129249319
  (3, 0)	0.5088641980402838
  (3, 12)	0.5088641980402838
  (3, 8)	0.5088641980402838
  (4, 13)	0.4022840626476948
  (4, 7)	0.3402492835693104
  (4, 11)	0.4022840626476948
  (4, 5)	0.5760921151629192
  (4, 3)	0.4782084961154663


In [68]:
# (documents, terms) shape of the TF-IDF matrix; the sparse matrix reports the
# same shape as its dense form, so no conversion is needed here.
print(y.shape)
# Vocabulary size: should equal the number of columns printed above.
print(len(vectorize.get_feature_names_out()))

(5, 14)
14


In [69]:
# Build the labelled dense view under a new name instead of rebinding `y`.
# Overwriting the sparse matrix with a DataFrame made this cell fail on a
# second run (a DataFrame has no `.toarray()`), and broke any earlier cell
# that still expects `y` to be the sparse TF-IDF matrix.
tfidf_df = pd.DataFrame(y.toarray(), columns=vectorize.get_feature_names_out())
print(tfidf_df)

        and  arithmeticerror    built  document  exception     first  \
0  0.000000          0.00000  0.00000  0.478208    0.00000  0.576092   
1  0.000000          0.34864  0.34864  0.000000    0.34864  0.000000   
2  0.000000          0.00000  0.00000  0.700519    0.00000  0.000000   
3  0.508864          0.00000  0.00000  0.000000    0.00000  0.000000   
4  0.000000          0.00000  0.00000  0.478208    0.00000  0.576092   

         in        is       one   python    second       the     third  \
0  0.000000  0.340249  0.000000  0.00000  0.000000  0.402284  0.000000   
1  0.697281  0.166129  0.000000  0.34864  0.000000  0.000000  0.000000   
2  0.000000  0.249213  0.000000  0.00000  0.523001  0.294649  0.000000   
3  0.000000  0.242477  0.508864  0.00000  0.000000  0.286685  0.508864   
4  0.000000  0.340249  0.000000  0.00000  0.000000  0.402284  0.000000   

       this  
0  0.402284  
1  0.000000  
2  0.294649  
3  0.286685  
4  0.402284  


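$$\mathrm{TF}(t, d) = \frac{\text{number of times term } t \text{ appears in document } d}{\text{total number of terms in document } d}$$

$$\mathrm{IDF}(t) = \log\left(\frac{N}{\text{number of documents containing term } t}\right), \qquad N = \text{total number of documents}$$

Note: these are the textbook definitions. With its default settings, scikit-learn's `TfidfVectorizer` uses the raw term count as TF, the smoothed $\mathrm{IDF}(t) = \ln\frac{1 + N}{1 + \mathrm{df}(t)} + 1$, and then L2-normalises each row, so the values printed above differ from a hand calculation with the textbook formulas.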
 

In [None]:
# Read the file into a list holding one string per line (the trailing "\n" of
# each line is kept), so every line can be treated as a separate document.
with open(r"G:\GEN AI\NLP\requirements.txt", mode="r", encoding="utf-8") as file:
    text = list(file)
print(text)

['mkdir -p ~/projects : -\n', '\n', 'Copy your Windows project to Linux:\n', 'cp -rv /mnt/h/POC_env_linx ~/projects/\n', '\n', 'ls -l ~/projects/POC_env_linx\n', '\n', 'ls -la ~/projects/POC_env_linx (hidden files)\n', '\n', '\n', 'du -sh ~/projects/POC_env_linx  # Show total size\n', '\n', 'cp -r ~/projects/POC_env_linx /mnt/h/POC_env_linx_backup\n', '\n', '\n', '1. mAP50 (mAP@0.50)\n', '\n', '    Definition:\n', '\n', '        mAP50 calculates the mean Average Precision at an IoU (Intersection over Union) threshold of 0.50.\n', '\n', '        This means a predicted bounding box is considered a correct detection if it overlaps with the ground truth box by at least 50%.\n', '\n', '    When to Use:\n', '\n', '        A less strict metric, useful for general object detection where precise localization isn’t critical.\n', '\n', '        Often higher than mAP50-95 because it tolerates imperfect bounding boxes.\n', '\n', '    Example:\n', '\n', '        If your model has mAP50 = 0.85, it me

In [71]:
# Bare expression: the notebook displays the list of lines read above,
# one element per row, instead of the single-line output of print().
text

['mkdir -p ~/projects : -\n',
 '\n',
 'Copy your Windows project to Linux:\n',
 'cp -rv /mnt/h/POC_env_linx ~/projects/\n',
 '\n',
 'ls -l ~/projects/POC_env_linx\n',
 '\n',
 'ls -la ~/projects/POC_env_linx (hidden files)\n',
 '\n',
 '\n',
 'du -sh ~/projects/POC_env_linx  # Show total size\n',
 '\n',
 'cp -r ~/projects/POC_env_linx /mnt/h/POC_env_linx_backup\n',
 '\n',
 '\n',
 '1. mAP50 (mAP@0.50)\n',
 '\n',
 '    Definition:\n',
 '\n',
 '        mAP50 calculates the mean Average Precision at an IoU (Intersection over Union) threshold of 0.50.\n',
 '\n',
 '        This means a predicted bounding box is considered a correct detection if it overlaps with the ground truth box by at least 50%.\n',
 '\n',
 '    When to Use:\n',
 '\n',
 '        A less strict metric, useful for general object detection where precise localization isn’t critical.\n',
 '\n',
 '        Often higher than mAP50-95 because it tolerates imperfect bounding boxes.\n',
 '\n',
 '    Example:\n',
 '\n',
 '        If you

# Calculating Cosine Similarity and Euclidean Distance

In [72]:
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine of the angle between two vectors.

    NOTE: this definition rebinds the name ``cosine_similarity`` that the
    notebook's import cell brings in from ``sklearn.metrics.pairwise``; once
    this cell has run, that name refers to this function instead.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length (intended for 1-D input).

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero length
        the ratio is undefined; ``0.0`` is returned in that case (the same
        convention scikit-learn uses) rather than ``nan`` with a
        RuntimeWarning from a division by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the division: a zero-length vector has no direction.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

# Two sample vectors pointing in almost the same direction.
A, B = np.array([0.4, 0.5, 0.6]), np.array([0.7, 0.8, 0.9])

similarity = cosine_similarity(A, B)
print(similarity)  # ≈ 0.998, i.e. close to 1.0

0.9981908926857267


In [73]:
import numpy as np

A = np.array([0.4, 0.5, 0.6])
B = np.array([0.7, 0.8, 0.9])

# Euclidean distance: length of the difference vector A - B.
euclidean = np.linalg.norm(A - B)

# Cosine similarity: dot product divided by the product of the vector lengths.
cosine = np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))

# Display both results; previously they were computed but never shown.
print(f"Euclidean distance: {euclidean:.4f}")  # ≈ 0.5196
print(f"Cosine similarity:  {cosine:.4f}")  # ≈ 0.9982