In [None]:
# Input: an array of 100-word text sections
# Output: bag-of-words (BOW) vectorized sections

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd

In [8]:
def bag_of_words(texts):
    """Encode each text as a row of raw token counts.

    A fresh ``CountVectorizer`` with default settings is fitted on ``texts``
    itself, so the columns correspond to the vocabulary found in this input.

    Args:
        texts: Iterable of strings, one entry per document.

    Returns:
        Dense 2-D array with one row per text and one column per
        vocabulary term.
    """
    token_counts = CountVectorizer().fit_transform(texts)
    # fit_transform yields a sparse matrix; callers get a dense array.
    return token_counts.toarray()



In [13]:
def n_grams(texts, n=2):
    """Encode each text as counts of its n-grams of length exactly ``n``.

    Args:
        texts: Iterable of strings, one entry per document.
        n: N-gram length used for both the lower and upper bound of the
            vectorizer's ``ngram_range`` (defaults to 2).

    Returns:
        Dense 2-D array with one row per text and one column per distinct
        n-gram found in ``texts``.
    """
    # Same lower and upper bound -> only n-grams of length n are extracted.
    length_bounds = (n, n)
    ngram_counts = CountVectorizer(ngram_range=length_bounds).fit_transform(texts)
    return ngram_counts.toarray()



In [14]:
def tfidf(texts):
    """Encode each text as a row of TF-IDF weights.

    A fresh ``TfidfVectorizer`` with default settings is fitted on ``texts``
    itself, so the weights depend only on this input.

    Args:
        texts: Iterable of strings, one entry per document.

    Returns:
        Dense 2-D array with one row per text and one column per
        vocabulary term.
    """
    weights = TfidfVectorizer().fit_transform(texts)
    # fit_transform yields a sparse matrix; callers get a dense array.
    return weights.toarray()



In [15]:
def lda(texts, n_topics=5, random_state=None):
    """Encode each text as a vector of LDA topic features.

    The texts are first turned into token counts with a default
    ``CountVectorizer``; a ``LatentDirichletAllocation`` model is then fitted
    on those counts and used to transform them.

    Args:
        texts: Iterable of strings, one entry per document.
        n_topics: Number of topics (``n_components``) to fit. Defaults to 5.
        random_state: Seed forwarded to ``LatentDirichletAllocation``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            int to make the returned features reproducible across runs.

    Returns:
        2-D array with one row per text and ``n_topics`` columns, as returned
        by ``LatentDirichletAllocation.fit_transform``.
    """
    term_counts = CountVectorizer().fit_transform(texts)
    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=random_state,
    )
    return lda_model.fit_transform(term_counts)

In [17]:
def load_df_from_json(json_file_path):
    """Read a JSON file into a DataFrame, best-effort.

    Args:
        json_file_path: Path (or other input accepted by ``pd.read_json``)
            of the JSON document to load.

    Returns:
        The DataFrame produced by ``pd.read_json``. If loading fails for any
        reason, the error is printed and an empty DataFrame is returned, so
        callers always receive the same type.
    """
    try:
        return pd.read_json(json_file_path)
    except Exception as e:
        # Deliberately best-effort: report the problem instead of raising,
        # and include the path so the message is actionable.
        print(f"Could not load JSON from {json_file_path!r}: {e}")
        # Empty frame (not a list) keeps the return type consistent.
        return pd.DataFrame()


def get_string_list_from_df(df, column_name):
    """Return one DataFrame column as a list of strings, best-effort.

    Args:
        df: DataFrame (or any object supporting ``df[column_name]``) to read.
        column_name: Label of the column to extract.

    Returns:
        A list with one string per row. Missing values (NaN/None) become the
        empty string rather than the literal words "nan"/"None", so they do
        not show up as tokens in downstream text vectorizers. If the column
        cannot be read, the error is printed and an empty list is returned.
    """
    try:
        column = df[column_name]
        # Stringify every cell, then blank out the cells that were missing
        # in the original column (astype(str) alone would yield "nan"/"None").
        as_text = column.astype(str).where(column.notna(), "")
        return as_text.tolist()
    except Exception as e:
        # Deliberately best-effort: report the problem instead of raising.
        print(f"Could not read column {column_name!r}: {e}")
        return []


In [24]:
# Smoke test on three tiny documents: show the unigram count matrix first,
# then the bigram count matrix.
tx = ["ces dae", "cis sda cis", "cis ar"]

unigram_matrix = bag_of_words(tx)
print(unigram_matrix)

bigram_matrix = n_grams(tx, 2)
print(bigram_matrix)

[[0 1 0 1 0]
 [0 0 2 0 1]
 [1 0 1 0 0]]
[[1 0 0 0]
 [0 0 1 1]
 [0 1 0 0]]
