# NLP Assignment No. 2

## Title: Perform Bag-of-Words (BoW) and TF-IDF on Data; Create Word Embeddings using Word2Vec

In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import gensim
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from google.colab import files

In [None]:
# Fetch the NLTK tokenizer data packages used by this notebook
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:

# Upload dataset from local machine via the google.colab `files` helper.
# The recorded output of this cell shows the file being saved as data.csv,
# which is the name the loading cell reads.
uploaded = files.upload()

Saving data.csv to data.csv


In [None]:
# Load the dataset: read data.csv from the working directory into a DataFrame.
df = pd.read_csv("data.csv")

In [None]:
# Preview the first rows to check the available columns (Make, Model, ...).
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


****

## Preprocessing

In [None]:
# Build one text string per row: "<Make> <Model>".
# Both columns must exist in the dataset; each is cast to str before joining.
make_text = df['Make'].astype(str)
model_text = df['Model'].astype(str)
text_data = make_text.str.cat(model_text, sep=' ')

In [None]:
# Translation table that deletes every character in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Lowercase `text`, strip punctuation, and split it into tokens.

    Args:
        text: The raw string to clean.

    Returns:
        The result of `word_tokenize` applied to the lowercased,
        punctuation-free string.
    """
    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return word_tokenize(stripped)

In [None]:
# Tokenize every document, then rebuild a space-separated string per document
# (the vectorizers consume strings, Word2Vec consumes the token lists).
tokenized_text = text_data.map(preprocess)
cleaned_sentences = tokenized_text.map(' '.join)

print("\n--- Bag of Words ---\n")


--- Bag of Words ---



## Bag of Words

In [None]:
# Bag of Words (Count): learn the vocabulary and build the document-term matrix
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(cleaned_sentences)

# Report the matrix dimensions and a small sample of the vocabulary
first_ten_features = vectorizer.get_feature_names_out()[:10]
print("BoW Shape:", bow_matrix.shape)
print("Feature Names:", first_ten_features)

BoW Shape: (11914, 837)
Feature Names: ['100' '124' '150' '1500' '1500hd' '164' '190class' '200' '200h' '200sx']


In [None]:
# Normalized BoW: divide every row by its total token count so each document
# becomes a relative-frequency vector.
row_totals = np.asarray(bow_matrix.sum(axis=1), dtype=float).ravel()
# A document with no recognised tokens has a zero total; dividing by it would
# produce NaN, so leave such rows as all zeros instead.
row_totals[row_totals == 0] = 1.0
# Multiply by the reciprocal column vector rather than using `sparse / dense`:
# the result stays sparse, and .tocsr() makes it row-indexable for later cells.
normalized_bow = bow_matrix.astype(float).multiply(
    1.0 / row_totals[:, np.newaxis]).tocsr()

In [None]:
import numpy as np
# Significant tokens: the highest normalized BoW weights of the first document.
# Fetch the vocabulary once instead of once per loop iteration.
bow_feature_names = vectorizer.get_feature_names_out()
# Densify only row 0 rather than the whole document-term matrix.
row_array = normalized_bow.tocsr()[0].toarray().ravel()
top_n = 10  # change to 5, 20, etc.
# argsort is ascending, so take the last top_n indices and reverse them.
top_indices = np.argsort(row_array)[-top_n:][::-1]
print("Top", top_n, "normalized BoW values with words:")

for idx in top_indices:
    # Skip padding entries: a short document has fewer than top_n non-zero terms.
    if row_array[idx] > 0:
        print(f"{bow_feature_names[idx]}: {row_array[idx]:.4f}")

Top 10 normalized BoW values with words:
series: 0.5000
bmw: 0.5000


In [None]:
# Tabulate the non-zero normalized counts of the first document.
# The vocabulary is fetched once; calling get_feature_names_out() inside the
# comprehension would rebuild it for every vocabulary entry.
bow_feature_names = vectorizer.get_feature_names_out()
top_words = [(bow_feature_names[i], row_array[i])
             for i in np.flatnonzero(row_array > 0)]
top_df = pd.DataFrame(top_words, columns=["Word", "Normalized Count"]).sort_values(
    by="Normalized Count", ascending=False)
print(top_df.head(10))  # Show top 10 words

     Word  Normalized Count
0     bmw               0.5
1  series               0.5


## Count Occurrence

In [None]:
from sklearn.preprocessing import normalize

print("\n--- Bag of Words (Count and Normalized Count) ---\n")

# Raw counts: fit a CountVectorizer and keep its vocabulary for column labels
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(cleaned_sentences)
feature_names = vectorizer.get_feature_names_out()

# Relative frequencies: L1-normalize each row (document) of the count matrix
normalized_bow = normalize(bow_matrix, norm='l1', axis=1)

# Dense DataFrames, one column per vocabulary term, for readable display
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=feature_names)
normalized_bow_df = pd.DataFrame(normalized_bow.toarray(), columns=feature_names)

# Print a heading followed by the first rows of each table
for heading, frame in (
    ("▶ Count Occurrence (Raw Frequencies):", bow_df),
    ("\n▶ Normalized Count Occurrence (Per Document):", normalized_bow_df),
):
    print(heading)
    print(frame.head())


--- Bag of Words (Count and Normalized Count) ---

▶ Count Occurrence (Raw Frequencies):
   100  124  150  1500  1500hd  164  190class  200  200h  200sx  ...  xuv  xv  \
0    0    0    0     0       0    0         0    0     0      0  ...    0   0   
1    0    0    0     0       0    0         0    0     0      0  ...    0   0   
2    0    0    0     0       0    0         0    0     0      0  ...    0   0   
3    0    0    0     0       0    0         0    0     0      0  ...    0   0   
4    0    0    0     0       0    0         0    0     0      0  ...    0   0   

   yaris  yorker  yukon  z3  z4  z8  zdx  zephyr  
0      0       0      0   0   0   0    0       0  
1      0       0      0   0   0   0    0       0  
2      0       0      0   0   0   0    0       0  
3      0       0      0   0   0   0    0       0  
4      0       0      0   0   0   0    0       0  

[5 rows x 837 columns]

▶ Normalized Count Occurrence (Per Document):
   100  124  150  1500  1500hd  164  190class 

## TF-IDF

In [None]:
# Top TF-IDF terms of the first document.
# Fit the vectorizer in this cell so it runs top-to-bottom on a fresh kernel,
# and read the weights from the TF-IDF matrix itself. `row_array` holds the
# normalized BoW row, so using it here would report BoW values as TF-IDF.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_sentences)

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_row = tfidf_matrix[0].toarray().ravel()  # densify document 0 only
tfidf_words = [(tfidf_feature_names[i], tfidf_row[i])
               for i in np.flatnonzero(tfidf_row > 0)]

tfidf_df = pd.DataFrame(tfidf_words, columns=["Word", "TF-IDF"]).sort_values(
    by="TF-IDF", ascending=False)

print(tfidf_df.head(10))  # top 10 TF-IDF terms

     Word  TF-IDF
0     bmw     0.5
1  series     0.5


In [None]:
# TF-IDF: fit the vectorizer and build the weighted document-term matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_sentences)
tfidf_features = tfidf_vectorizer.get_feature_names_out()
print("TF-IDF Shape:", tfidf_matrix.shape)
print("TF-IDF Features:", tfidf_features[:10])

# Show only the non-zero weights of the first document; printing the full
# dense row would emit one value per vocabulary term, nearly all of them zero.
first_doc = tfidf_matrix[0]
example_row = {word: round(float(weight), 4)
               for word, weight in zip(tfidf_features[first_doc.indices], first_doc.data)}
print("TF-IDF Example Row:", example_row)

print("\n--- Word2Vec Embeddings ---\n")

TF-IDF Shape: (11914, 837)
TF-IDF Features: ['100' '124' '150' '1500' '1500hd' '164' '190class' '200' '200h' '200sx']
TF-IDF Example Row: [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.       

## Word2Vec

In [None]:
# Word2Vec expects list of token lists
w2v_sentences = tokenized_text.tolist()

# Create the model without a corpus, then build the vocabulary and train once.
# Passing `sentences=` to the constructor would already train the model, so a
# following .train() call would train it a second time on the same data.
w2v_model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
w2v_model.build_vocab(w2v_sentences)
# The last expression shows the training statistics returned by train().
w2v_model.train(w2v_sentences, total_examples=w2v_model.corpus_count, epochs=10)



(187451, 279070)

In [None]:
# Look up one word in the trained model and print its embedding vector,
# or report that the word is missing from the vocabulary.
sample_word = 'toyota'
word_vectors = w2v_model.wv
if sample_word not in word_vectors:
    print(f"'{sample_word}' not in vocabulary.")
else:
    print(f"Embedding for '{sample_word}':")
    print(word_vectors[sample_word])

Embedding for 'toyota':
[-0.83651805  0.765233    0.09543639 -0.25942874  0.07348691 -0.40635967
  0.20833191  0.5125161   0.2468348  -0.24788725  0.7082647  -0.13616969
 -0.30543154 -0.15861093  0.6894718  -0.65243894  0.47545528 -0.08364573
 -0.60501236  0.34120792  0.07658801 -0.29290038  0.9703474   0.38040155
 -0.78872526  0.06446248  0.49249184  0.3714193   0.3305891   0.20022264
  0.6316075   0.0398566  -0.0648789  -0.11991361  0.05937047  0.7754014
  0.8571649  -0.17671338 -0.71182644  0.20368646  0.11112657  0.2942762
 -0.04690541 -0.3624351   0.01674363  0.51309323 -0.0414665  -0.23829517
  0.12017268 -0.24056181  0.2862875  -0.31057256 -0.43792793 -0.6237035
 -0.52597225 -0.01992847  0.34733415  0.29020038  0.41874874  0.03796851
  0.33228955  0.37934557  0.7086114  -0.42591372 -0.17796779 -0.24249673
  0.0727411   0.6183386  -0.40913263 -0.6100428   0.20908162 -0.27308542
  0.2826123   0.94911045  0.33703357 -0.07457998 -0.49687034  0.20139861
 -0.9197643   0.06387214 -0.45

### Completed