# CountVectorizer and TF-IDF Vectorizer

In [17]:
# imports

import sys
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

from modules.preprocess_text import tokenize_text

In [2]:
# Common construction-technology FAQs used as the sample corpus

corpus = [
    "What are the key responsibilities of a quantity surveyor?",
    "How can I learn architectural design?",
    "What are the current trends in sustainable building technology?",
    "What skills are essential for a construction project manager?",
    "How do I get a construction manager job with no experience?",
    "What is Building Information Modeling (BIM)?",
    "What courses should I take for a career in civil engineering?",
    "How is drone technology used in construction?",
    "What are the latest innovations in construction safety?",
    "How can I get a job as an architect?",
    "What software is used for structural analysis?",
    "What is the role of a project manager in a construction project?",
    "How does 3D printing impact the construction industry?",
    "What certifications are beneficial for a career in construction?",
    "What are the benefits of using prefabrication in building?",
    "How do smart contracts and blockchain technology apply to construction?",
]

In [12]:
# Take the first three FAQs from the corpus as a small sample and display them
faq = corpus[:3]
faq

['What are the key responsibilities of a quantity surveyor?',
 'How can I learn architectural design?',
 'What are the current trends in sustainable building technology?']

In [21]:
# Count Vectorizer: raw token counts per document
# token_pattern=None because the custom tokenizer replaces the built-in regex;
# leaving the default in place makes scikit-learn warn that it is unused.
vectorizer = CountVectorizer(tokenizer=tokenize_text, token_pattern=None)
X = vectorizer.fit_transform(faq)

# Convert the fitted matrix to a dense array and label the columns with the
# learned vocabulary so the counts can be read as a table.
values = X.toarray()
feature_names = vectorizer.get_feature_names_out()
pd.DataFrame(values, columns=feature_names)



Unnamed: 0,architectural,building,current,design,key,learn,quantity,responsibility,surveyor,sustainable,technology,trend
0,0,0,0,0,1,0,1,1,1,0,0,0
1,1,0,0,1,0,1,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,1,1,1


In [24]:
# TF-IDF Vectorizer: same pipeline as the count example, but with TF-IDF weights
# token_pattern=None because the custom tokenizer replaces the built-in regex;
# leaving the default in place makes scikit-learn warn that it is unused.
vectorizer = TfidfVectorizer(tokenizer=tokenize_text, token_pattern=None)
X = vectorizer.fit_transform(faq)

# Convert the fitted matrix to a dense array and label the columns with the
# learned vocabulary so the weights can be read as a table.
values = X.toarray()
feature_names = vectorizer.get_feature_names_out()
pd.DataFrame(values, columns=feature_names)



Unnamed: 0,architectural,building,current,design,key,learn,quantity,responsibility,surveyor,sustainable,technology,trend
0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.5,0.5,0.0,0.0,0.0
1,0.57735,0.0,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214,0.447214


In [27]:
# Building the TF-IDF Vectorizer Function

def build_vectorizer(corpus):
    """Fit a TF-IDF vectorizer on the corpus and return the result in three forms.

    Tokenization is delegated to ``tokenize_text`` rather than scikit-learn's
    built-in token regex.

    Args:
        corpus (list): A list of text documents.

    Returns:
        tuple: ``(values, feature_names, dataframe)`` where ``values`` is the
        dense TF-IDF array produced by ``toarray()``, ``feature_names`` is the
        vocabulary returned by ``get_feature_names_out()``, and ``dataframe``
        is a ``pandas.DataFrame`` of ``values`` with ``feature_names`` as its
        column labels.
    """
    # token_pattern=None: the built-in regex is never used once a custom
    # tokenizer is supplied, and leaving it at its default makes scikit-learn
    # emit a UserWarning on every call.
    vectorizer = TfidfVectorizer(tokenizer=tokenize_text, token_pattern=None)
    X = vectorizer.fit_transform(corpus)

    values = X.toarray()
    feature_names = vectorizer.get_feature_names_out()
    dataframe = pd.DataFrame(values, columns=feature_names)
    return values, feature_names, dataframe



In [33]:
# Vectorize the full FAQ corpus; build_vectorizer returns
# (values, feature_names, dataframe), so index 2 displays the DataFrame.
vectorized = build_vectorizer(corpus)
vectorized[2]



Unnamed: 0,analysis,apply,architect,architectural,beneficial,benefit,bim,blockchain,building,career,...,smart,software,structural,surveyor,sustainable,take,technology,trend,used,using
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379585,0.0,...,0.0,0.0,0.0,0.0,0.487111,0.0,0.379585,0.487111,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.526517,0.0,0.410292,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.399231,...,0.0,0.0,0.0,0.0,0.0,0.458425,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.479862,0.0,0.536279,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.630336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Import Build Vectorizer and Test


In [1]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

from modules.preprocess_text import tokenize_text
from modules.preprocess_text import  build_vectorizer

In [2]:
# Test the imported build_vectorizer function on three sample FAQ strings
faq = [
    "What are the key responsibilities of a quantity surveyor?",
    "How can I learn architectural design?",
    "What are the current trends in sustainable building technology?"
]

# Index 2 of the returned tuple is displayed as the cell output.
vectorized = build_vectorizer(faq)
vectorized[2]  # Display the DataFrame representation of the vectorized corpus



Unnamed: 0,architectural,building,current,design,key,learn,quantity,responsibility,surveyor,sustainable,technology,trend
0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.5,0.5,0.0,0.0,0.0
1,0.57735,0.0,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214,0.447214
