# Vectorization and Feature Extraction

#### Import the Necessary libraries

In [1]:
#pandas and numpy for df manipulation
import pandas as pd
import numpy as np
import re

#visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#counter
from collections import Counter

#standard nlp libraries for text processing
import nltk

#gensim will not be used in this notebook, but it is a very popular and useful library
import gensim

#useful functions for building frequency vectors
from sklearn.feature_extraction.text import CountVectorizer #sklearn
from nltk.probability import FreqDist #nltk

#useful functions for tf-idf vectorization
from sklearn.feature_extraction.text import TfidfVectorizer #sklearn

#ngrams
from nltk.util import ngrams #nltk

#### Import the preprocessed data

In [2]:
# Folder that holds the preprocessed data files.
# NOTE(review): this is an absolute, machine-specific Windows path - it has to be
# edited before the notebook can run on another machine.
path = 'C:\\Users\\ritaf\\Documents\\ESTUDOS\\NOVA IMS\\3º Ano\\Text Mining\\Projeto\\new_data\\'

# Read the preprocessed training CSV into a DataFrame
data = pd.read_csv(path + 'text_train_btp.csv')


#### Shortening the dataset

In [36]:
# Fix the global NumPy seed so the same subset is drawn on every run
np.random.seed(42)

# Number of rows in the full DataFrame
num_rows_original = len(data)

# Size of the working subset: a fixed 200 rows (not half of the data).
# Capped at the dataset size, because drawing without replacement raises a
# ValueError when more rows are requested than exist.
num_rows_new = min(200, num_rows_original)

# Draw distinct row positions uniformly at random
random_indices = np.random.choice(num_rows_original, size=num_rows_new, replace=False)

# Keep only the drawn rows and renumber them 0..num_rows_new-1
new_dataframe = data.iloc[random_indices].reset_index(drop=True)

# 'new_dataframe' now holds the random subset of 'data'
new_dataframe.head()

Unnamed: 0,title,artist,features,lyrics,tag,title_token,lyrics_token,title_token_filtered,lyrics_token_filtered,title_filtered_string,lyrics_filtered_string,title_string_fdist,lyrics_string_fdist
0,rollercoaster live acoustic 2008,direct nld,,rollercoasteryou can tell me what is true and ...,pop,"['rollercoaster', 'live', 'acoustic', '2008']","['rollercoasteryou', 'can', 'tell', 'me', 'wha...","['rollercoaster', 'live', 'acoustic', '2008']","['rollercoasteryou', 'tell', 'true', 'things',...",rollercoaster live acoustic 2008,rollercoasteryou tell true things gotta doto s...,rollercoaster live acoustic 2008,tell true things doto sure safe danger wayi te...
1,haunted lust,the happy fits,,verse 1feel my haunted lust creep in our desir...,rock,"['haunted', 'lust']","['verse', '1feel', 'my', 'haunted', 'lust', 'c...","['haunted', 'lust']","['verse', '1feel', 'haunted', 'lust', 'creep',...",haunted lust,verse 1feel haunted lust creep desirebelieving...,haunted lust,verse 1feel haunted lust creep seems make mind...
2,further in summer than the birds 1068,emily dickinson,,further in summer than the birdspathetic from ...,misc,"['further', 'in', 'summer', 'than', 'the', 'bi...","['further', 'in', 'summer', 'than', 'the', 'bi...","['summer', 'birds', '1068']","['summer', 'birdspathetic', 'grassa', 'minor',...",summer birds 1068,summer birdspathetic grassa minor nation celeb...,summer birds,summer grassa minor nation unobtrusive ordinan...
3,mr commissioner,arbee stidham,,introhold it hold it everybody hold it the mee...,rb,"['mr', 'commissioner']","['introhold', 'it', 'hold', 'it', 'everybody',...","['mr', 'commissioner']","['introhold', 'hold', 'everybody', 'hold', 'me...",commissioner,introhold hold everybody hold meeting come ord...,,introhold hold everybody hold meeting come tel...
4,lord knows,elementim,,yeahelementimyeah yeah yeah yeahon my toes on ...,rap,"['lord', 'knows']","['yeahelementimyeah', 'yeah', 'yeah', 'yeahon'...","['lord', 'knows']","['yeahelementimyeah', 'yeah', 'yeah', 'yeahon'...",lord knows,yeahelementimyeah yeah yeah yeahon toes grind ...,lord knows,yeah yeah yeahon toes grind going hard like nu...


#### Extracting the lyrics

In [37]:
# Raw lyrics of the sampled songs as a plain Python list.
# Other text columns that could be used instead (see the table above):
# 'lyrics_filtered_string' and 'lyrics_string_fdist'.
lyr = new_dataframe.loc[:, 'lyrics'].tolist()

lyr

['rollercoasteryou can tell me what is true and all the things i gotta doto be sure to be safe for the danger on the wayi can tell you bout my life andall the things that i have trieddoing good being straight being careful every daygo up and down round and round on a rollercoasteri close my eyes and i fly on my rollercoaster riderolling on my rollercoastergot another point of view in all the things i wanna doim not stupid im not blind just dont wanna waste my timeyou can tell me what is wise cause life is just one big surpriseso i go with the flow what will whatll happen i dont knowgo up and down round and round on a rollercoasteri close my eyes and i fly on my rollercoaster riderolling on my rollercoastergetting sick or tired going slower for a whilesoon ill start again rollin all along the bendshappy on my rollercoastergo up and down round and round on a rollercoasteri close my eyes and i fly on my rollercoaster ridego up and down round and round on a rollercoasteri close my eyes and

#### Bag of Words + Remove stopwords

In [38]:
def tokenized_frequency_vector(list_of_strings):
    '''Build a bag-of-words frequency dictionary for every text.

    Each element of ``list_of_strings`` is converted with ``str()``, split into
    runs of word characters, lowercased, and stripped of NLTK English stop
    words before counting.

    Parameters
    ----------
    list_of_strings : iterable
        The texts to process; non-string items are stringified first.

    Returns
    -------
    list of dict
        One ``{token: count}`` dictionary per input text, in input order.
    '''

    # \w+ keeps runs of word characters (letters, digits and underscore)
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    # The stop-word set is the same for every text, so build it once up front
    # instead of re-reading the corpus on each loop iteration
    stop_words = set(nltk.corpus.stopwords.words('english'))

    frequency_vectors = []

    for text in list_of_strings:

        # str() guards against numbers or other non-string values
        tokens = [token.lower() for token in tokenizer.tokenize(str(text))]

        # Drop stop words (checked after lowercasing)
        tokens = [token for token in tokens if token not in stop_words]

        # Count the remaining tokens and store the counts as a plain dict
        frequency_vectors.append(dict(FreqDist(tokens)))

    return frequency_vectors

In [39]:
# One {word: count} dictionary per lyric, with stop words already removed
messages_freq_vectors = tokenized_frequency_vector(lyr)
# NOTE(review): this displays the whole list - consider slicing it to keep the output short
messages_freq_vectors

[{'rollercoasteryou': 1,
  'tell': 3,
  'true': 1,
  'things': 3,
  'gotta': 1,
  'doto': 1,
  'sure': 1,
  'safe': 1,
  'danger': 1,
  'wayi': 1,
  'bout': 1,
  'life': 2,
  'andall': 1,
  'trieddoing': 1,
  'good': 1,
  'straight': 1,
  'careful': 1,
  'every': 1,
  'daygo': 1,
  'round': 8,
  'rollercoasteri': 4,
  'close': 4,
  'eyes': 4,
  'fly': 4,
  'rollercoaster': 5,
  'riderolling': 2,
  'rollercoastergot': 1,
  'another': 1,
  'point': 1,
  'view': 1,
  'wanna': 2,
  'doim': 1,
  'stupid': 1,
  'im': 1,
  'blind': 1,
  'dont': 2,
  'waste': 1,
  'timeyou': 1,
  'wise': 1,
  'cause': 1,
  'one': 1,
  'big': 1,
  'surpriseso': 1,
  'go': 1,
  'flow': 1,
  'whatll': 1,
  'happen': 1,
  'knowgo': 1,
  'rollercoastergetting': 1,
  'sick': 1,
  'tired': 1,
  'going': 1,
  'slower': 1,
  'whilesoon': 1,
  'ill': 1,
  'start': 1,
  'rollin': 1,
  'along': 1,
  'bendshappy': 1,
  'rollercoastergo': 1,
  'ridego': 1,
  'ridehappy': 1},
 {'verse': 1,
  '1feel': 1,
  'haunted': 2,
  'lu

In [40]:
# One row per lyric and one column per word; a word that does not occur in a
# lyric shows up as NaN (the frame is dense, just mostly empty)
messages_freq_df_1 = pd.DataFrame(messages_freq_vectors)

# Column labels ordered from the highest to the lowest total count
sorted_cols = messages_freq_df_1.sum().sort_values(ascending=False).index

# Put the most frequent words first
messages_freq_df_1 = messages_freq_df_1.loc[:, sorted_cols]

messages_freq_df_1

Unnamed: 0,im,like,know,dont,got,love,get,cant,never,go,...,emthey,daddy,painbut,amthey,1but,racinput,facinim,lossesrefrainif,movelosses,gettogethers
0,1.0,,,2.0,,,,,,1.0,...,,,,,,,,,,
1,1.0,1.0,1.0,,,2.0,,2.0,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,1.0,20.0,1.0,,1.0,,,,...,,,,,,,,,,
4,5.0,5.0,1.0,,1.0,,6.0,,,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2.0,1.0,,1.0,,,,,,,...,,,,,,,,,,
196,,,,,,,,,,,...,,,,,,,,,,
197,7.0,,1.0,1.0,,7.0,,10.0,3.0,,...,,,,,,,,,,
198,,,,,,,,,,,...,,,,,,,,,,


#### Bag of Words

In [41]:
def count_vectorizer_to_df(list_of_strings):
    '''Turn a list of texts into a document-term count DataFrame.

    Parameters
    ----------
    list_of_strings : iterable of str
        The texts to vectorize.

    Returns
    -------
    pandas.DataFrame
        One row per text and one column per vocabulary term, holding the
        number of times the term occurs in the text.
    '''

    # Default CountVectorizer settings - stop words are NOT removed
    vectorizer = CountVectorizer()

    # Learn the vocabulary and build the count matrix in one step
    frequency_matrix = vectorizer.fit_transform(list_of_strings)

    # get_feature_names() was removed in scikit-learn 1.2, while
    # get_feature_names_out() only exists from 1.0 onwards (the original author
    # noted it did not work on their install) - use whichever is available
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Densify the matrix and label the columns with the vocabulary terms
    df = pd.DataFrame(frequency_matrix.toarray(), columns=feature_names)

    return df

In [43]:
# Document-term count matrix for the sampled lyrics
messages_dtm = count_vectorizer_to_df(lyr)

# Column labels ordered from the highest to the lowest total count
sorted_cols = messages_dtm.sum().sort_values(ascending=False).index

# Put the most frequent terms first
messages_dtm = messages_dtm.loc[:, sorted_cols]

# Unlike the dictionary-based frame, absent terms are 0 here rather than NaN
messages_dtm



Unnamed: 0,the,you,to,my,and,me,it,in,of,that,...,heartstopping,heartring,heartplease,heartlessyoull,heartits,heartin,hearthat,heartdad,heartby,zoneround
0,7,1,1,14,13,2,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,3,22,1,6,8,1,23,7,1,8,...,0,0,0,0,0,0,0,0,0,0
2,4,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,17,27,4,0,5,0,5,1,14,0,...,0,0,0,0,0,0,0,0,0,0
4,10,0,1,7,0,1,5,2,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,6,11,5,3,5,6,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
196,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
197,3,12,5,12,4,7,3,8,2,3,...,0,0,0,0,0,0,0,0,0,0
198,4,2,10,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### One Hot Encoding

In [44]:
def oh_count_vectorizer_to_df(list_of_strings):
    '''Turn a list of texts into a binary (presence/absence) term DataFrame.

    Parameters
    ----------
    list_of_strings : iterable of str
        The texts to vectorize.

    Returns
    -------
    pandas.DataFrame
        One row per text and one column per vocabulary term; a cell is 1 when
        the term occurs in the text and 0 otherwise.
    '''

    # binary=True caps every non-zero count at 1; stop words are NOT removed
    vectorizer = CountVectorizer(binary = True)

    # Learn the vocabulary and build the binary matrix in one step
    ohe_matrix = vectorizer.fit_transform(list_of_strings)

    # get_feature_names() was removed in scikit-learn 1.2, while
    # get_feature_names_out() only exists from 1.0 onwards - use whichever
    # the installed version provides
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Densify the matrix and label the columns with the vocabulary terms
    df = pd.DataFrame(ohe_matrix.toarray(), columns=feature_names)

    return df

In [45]:
# Binary term-presence matrix for the sampled lyrics
messages_dtm_one_hot = oh_count_vectorizer_to_df(lyr)

# Sanity check: count how many rows have each row-wise maximum - any value
# other than 1 would mean a cell exceeded 1 (or a row had no terms at all)
messages_dtm_one_hot.max(axis=1).value_counts()



1    200
dtype: int64

#### TF-IDF

In [46]:
def compute_tf(list_of_strings):
    '''Return the term frequencies of every text.

    For each text the tokens are lowercased and English stop words are dropped;
    the frequency of a token is its count divided by the number of tokens that
    remain in that text.

    Returns a list with one ``{token: frequency}`` dictionary per input text.
    '''

    # \w+ keeps runs of word characters (letters, digits and underscore)
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    stop_words = set(nltk.corpus.stopwords.words('english'))

    tf = []

    for text in list_of_strings:

        # str() guards against numbers or other non-string values
        lowered = (raw.lower() for raw in word_tokenizer.tokenize(str(text)))
        kept = [word for word in lowered if word not in stop_words]

        # Relative frequency of each surviving token within this text
        n_kept = len(kept)
        tf.append({word: hits / n_kept for word, hits in Counter(kept).items()})

    return tf

In [47]:
def compute_idf(list_of_strings):
    '''Return the inverse document frequency of every term in the corpus.

    A term's IDF is the natural log of (number of texts / number of texts that
    contain the term). Tokens are lowercased and English stop words are
    dropped before counting.

    Returns a ``{token: idf}`` dictionary covering all remaining tokens.
    '''

    # \w+ keeps runs of word characters (letters, digits and underscore)
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Total number of documents in the corpus
    n_docs = len(list_of_strings)

    # term -> number of documents in which it appears at least once
    doc_freq = {}

    for text in list_of_strings:

        # str() guards against numbers or other non-string values
        lowered = (raw.lower() for raw in word_tokenizer.tokenize(str(text)))

        # A set, so every term is counted once per document
        unique_terms = {word for word in lowered if word not in stop_words}

        for term in unique_terms:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    # Natural log of documents-in-corpus over documents-containing-term
    return {term: np.log(n_docs / seen_in) for term, seen_in in doc_freq.items()}

In [48]:
def compute_tfidf(list_of_strings):
    '''Return the TF-IDF weights of every text.

    Combines ``compute_tf`` and ``compute_idf``: each token's weight is its
    term frequency in the text multiplied by its corpus-wide IDF.

    Returns a list with one ``{token: tfidf}`` dictionary per input text.
    '''

    # Per-text term frequencies (one dictionary per text)
    per_doc_tf = compute_tf(list_of_strings)

    # Corpus-wide inverse document frequencies
    corpus_idf = compute_idf(list_of_strings)

    # Weight every token of every text by the corpus-wide IDF of that token
    return [
        {term: freq * corpus_idf[term] for term, freq in doc_tf.items()}
        for doc_tf in per_doc_tf
    ]

In [49]:
# One row per lyric, one column per token; tokens absent from a lyric are NaN
messages_tfidf_df = pd.DataFrame(compute_tfidf(lyr))
# Show only the last 10 rows
messages_tfidf_df.tail(10)

Unnamed: 0,rollercoasteryou,tell,true,things,gotta,doto,sure,safe,danger,wayi,...,greatuhhuh,dipped,nowyeah,stuff,hiphopdo,anythingalright,youdrinks,icefor,daytime,gettogethers
190,,,,,,,,,,,...,,,,,,,,,,
191,,,,,,,,,,,...,,,,,,,,,,
192,,,,,,,,,,,...,,,,,,,,,,
193,,,,,,,,,,,...,,,,,,,,,,
194,,,,,,,,,,,...,,,,,,,,,,
195,,0.049799,,,,,0.041435,,,,...,,,,,,,,,,
196,,,,,,,,,,,...,,,,,,,,,,
197,,,,,,,,,,,...,,,,,,,,,,
198,,,,,,,,,,,...,,,,,,,,,,
199,,,,,0.014614,,,,,,...,0.035799,0.035799,0.035799,0.035799,0.035799,0.035799,0.035799,0.035799,0.035799,0.035799


#### TF-IDF sklearn

In [50]:
def tf_idf_to_df(list_of_strings):
    '''Turn a list of texts into a TF-IDF DataFrame using scikit-learn.

    Parameters
    ----------
    list_of_strings : iterable of str
        The texts to vectorize.

    Returns
    -------
    pandas.DataFrame
        One row per text and one column per vocabulary term, holding the
        TF-IDF weight computed by ``TfidfVectorizer`` with default settings.
    '''

    # Default TfidfVectorizer settings - stop words are NOT removed
    vectorizer = TfidfVectorizer()

    # Learn the vocabulary and IDF weights, then build the TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(list_of_strings)

    # get_feature_names() was removed in scikit-learn 1.2, while
    # get_feature_names_out() only exists from 1.0 onwards - use whichever
    # the installed version provides
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Densify the matrix and label the columns with the vocabulary terms
    df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    return df

In [51]:
# TF-IDF matrix for the sampled lyrics, computed with scikit-learn
messages_tfidf_sklearn = tf_idf_to_df(lyr)

# Per-term descriptive statistics - quartiles that are all 0 show how sparse the matrix is
messages_tfidf_sklearn.describe()



Unnamed: 0,011maybe,033in,055i,10,100,106all,10quoted,10th,10thdo,10thi,...,yuhwhat,yxng,zen,zerosand,zipi,zipperchorus,zippy,zone,zonebeen,zoneround
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,...,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,0.000349,0.000349,0.000349,0.001001,0.000433,0.000349,0.000606,0.000709,0.000354,0.000709,...,0.000285,0.000727,0.000589,0.000225,0.000253,0.000159,0.000254,0.000262,0.0003,0.000221
std,0.004939,0.004939,0.004939,0.008446,0.004319,0.004939,0.008574,0.010025,0.005012,0.010025,...,0.004037,0.010283,0.008325,0.003189,0.00358,0.002254,0.003595,0.003704,0.004239,0.003132
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.069851,0.069851,0.069851,0.101456,0.044711,0.069851,0.121261,0.141774,0.070887,0.141774,...,0.057093,0.145421,0.117737,0.045098,0.050623,0.031879,0.050845,0.052376,0.059955,0.044297
