In [14]:
import pandas as pd
import numpy as np
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy pipeline named 'en_core_web_lg' once at module level;
# get_doc_vector() below reuses this shared instance for every document.
nlp = spacy.load('en_core_web_lg')


In [15]:
def get_doc_vector(text):
  """Return the spaCy document vector for ``text``.

  The text is run through the module-level ``nlp`` pipeline and the
  resulting Doc's ``.vector`` attribute is returned unchanged.
  """
  return nlp(text).vector

In [16]:
# Load the preprocessed dataset (per the file name, the output of the
# tokenization/lemmatization step); the text to vectorize is in 'cleaned_text'.
df_model = pd.read_csv('../data/data_2_after_tokenization_lemmatization.csv')

# One spaCy document vector per row, stored as an array-valued column.
df_model['doc_vector'] = df_model['cleaned_text'].apply(get_doc_vector)

# Tf-idf over unigrams only, with the vocabulary capped at MAX_TFIDF_FEATURES terms.
MAX_TFIDF_FEATURES = 10000
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=MAX_TFIDF_FEATURES)
tfidf_matrix = tfidf.fit_transform(df_model['cleaned_text'])  # sparse (n_docs, n_terms)

# Densify the tf-idf matrix into a frame with one column per vocabulary term.
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(),
                        columns=tfidf.get_feature_names_out())

# Expand the per-row vectors into one column per vector dimension.
# NOTE(review): these columns get pandas' default integer labels (0, 1, ...),
# while the tf-idf columns are strings; a purely numeric tf-idf token would
# share a header with one of them once written to CSV -- confirm downstream
# code does not select these columns by name.
df_doc_vectors = pd.DataFrame(df_model['doc_vector'].tolist())

# Both frames were just built without an explicit index, so they already share
# the default 0..n-1 index and line up row-for-row; no reset_index is needed.
df_combined_vectors = pd.concat([df_tfidf, df_doc_vectors], axis=1)  # tf-idf columns first, then spaCy columns

In [17]:
# Sanity check: (rows, columns) of the loaded frame, which now includes the added 'doc_vector' column.
df_model.shape

(2165, 5)

In [18]:
# Sanity check: row count should match df_model; the column count is the
# number of tf-idf terms plus the number of spaCy vector dimensions.
df_combined_vectors.shape

(2165, 10300)

In [19]:
# Persist the combined feature matrix; index=False keeps the row index out of the file.
features_csv_path = '../data/X_features_unigrams.csv'
df_combined_vectors.to_csv(features_csv_path, index=False)