# Import Libraries

In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string

# Processing Steps:
### Lowercasing the text
### Removing Stop Words from the text
### Removing Punctuation Marks from the text 
### Tokenizing the text
### Integer (vocabulary-index) encoding for each token
### Embedding Layer

# Read Text Data Functions

In [8]:
def read_txt_file(file_path, encoding='utf-8'):
    """Read text data from a .txt file, one entry per line.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        list[str]: The lines of the file, in order. Each line is returned as
        produced by ``readlines()``, i.e. including its line terminator.
    """
    # An explicit encoding keeps non-ASCII text from being mis-decoded (or
    # raising UnicodeDecodeError) on systems whose default is not UTF-8.
    with open(file_path, 'r', encoding=encoding) as file:
        return file.readlines()

def read_csv_file(file_path, text_column='text'):
    """Read text data from one column of a .csv file.

    Args:
        file_path: Path of the CSV file to read.
        text_column: Name of the column holding the text. Defaults to
            ``'text'``.

    Returns:
        list[str]: The non-missing values of ``text_column``, in file order,
        each converted to ``str``.

    Raises:
        KeyError: If ``text_column`` is not a column of the CSV file.
    """
    df = pd.read_csv(file_path)
    if text_column not in df.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {file_path!r}; "
            f"available columns: {list(df.columns)}"
        )
    # Empty cells are parsed as NaN (a float); dropping them and coercing to
    # str guarantees that string methods such as .lower() work on every item.
    return df[text_column].dropna().astype(str).tolist()

def combine_datasets(*datasets):
    """Concatenate any number of datasets into a single flat list.

    Items keep their relative order: every item of the first dataset comes
    first, followed by the items of the second dataset, and so on.
    """
    return [item for dataset in datasets for item in dataset]

# NLP Processing Functions

In [9]:
def preprocess_text(text, stop_words=None):
    """Normalise a piece of text for tokenization.

    The text is lowercased, stripped of ASCII punctuation, and then filtered
    for stop words. Punctuation is removed *before* the stop-word filter so
    that tokens such as "the," or "it." are recognised as stop words instead
    of slipping through with their punctuation attached.

    Args:
        text: The string to clean.
        stop_words: Optional collection of lowercase words to drop. Because
            punctuation is stripped first, entries should not contain
            punctuation. Defaults to ``ENGLISH_STOP_WORDS``.

    Returns:
        str: The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    # Lowercasing the text
    text = text.lower()
    # Removing Punctuation Marks from the text
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Removing Stop Words from the text; split()/join() also collapses the
    # gaps left behind by tokens that consisted only of punctuation.
    return ' '.join(word for word in text.split() if word not in stop_words)

def tokenize_text(texts):
    """Fit a Keras ``Tokenizer`` on ``texts`` and encode them as sequences.

    Returns:
        A ``(sequences, tokenizer)`` pair: the result of
        ``texts_to_sequences(texts)`` and the fitted tokenizer itself, so the
        caller can inspect its vocabulary (``word_index``).
    """
    fitted = tf.keras.preprocessing.text.Tokenizer()
    fitted.fit_on_texts(texts)
    return fitted.texts_to_sequences(texts), fitted

def pad_sequences(tokenized_texts, max_length):
    """Bring every token sequence to ``max_length`` using post-padding.

    Thin wrapper around ``tf.keras.preprocessing.sequence.pad_sequences``
    called with ``maxlen=max_length`` and ``padding='post'``.
    """
    keras_pad = tf.keras.preprocessing.sequence.pad_sequences
    return keras_pad(tokenized_texts, maxlen=max_length, padding='post')

def embed_text(padded_sequences, input_dim, embedding_dim, seed=None):
    """Embed the tokenized texts with a freshly initialised Embedding layer.

    The layer is never trained: the model is built and immediately used for
    ``predict``, so the returned vectors are the layer's random initial
    weights looked up per token id. Pass ``seed`` to make them reproducible
    across runs.

    Args:
        padded_sequences: Integer token ids, as produced by ``pad_sequences``.
        input_dim: Size of the vocabulary (largest token id + 1).
        embedding_dim: Length of each embedding vector.
        seed: Optional integer seed for the embedding initialiser. When
            ``None`` (default) the layer's default ``'uniform'`` initialiser
            is used, exactly as before.

    Returns:
        The output of ``model.predict(padded_sequences)``: one
        ``embedding_dim``-long vector per token position of each sequence.
    """
    if seed is None:
        initializer = 'uniform'
    else:
        # Same distribution as the 'uniform' default (RandomUniform over
        # [-0.05, 0.05]) but seeded so repeated runs give identical vectors.
        initializer = tf.keras.initializers.RandomUniform(
            minval=-0.05, maxval=0.05, seed=seed
        )
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(
            input_dim=input_dim,
            output_dim=embedding_dim,
            embeddings_initializer=initializer,
        )
    ])
    embedded_texts = model.predict(padded_sequences)
    return embedded_texts

def save_embedding(embeddings, file_path):
    """Write ``embeddings`` to ``file_path`` using ``numpy.save``."""
    np.save(file=file_path, arr=embeddings)




# Example usage:

In [15]:
# Load the raw text from both sources: a plain-text file and a CSV file
data_from_txt = read_txt_file(file_path='data.txt')
data_from_csv = read_csv_file(file_path='data.csv')

In [16]:
# Merge both sources into one list, then clean every entry
preprocessed_data = list(map(preprocess_text, combine_datasets(data_from_txt, data_from_csv)))

In [17]:
# Fit the tokenizer on the cleaned text and encode each entry as a token sequence
tokenized_texts, tokenizer = tokenize_text(texts=preprocessed_data)
# Vocabulary size = number of known words plus one slot for the padding token
vocab_size = 1 + len(tokenizer.word_index)


In [18]:
# Pad every sequence up to the length of the longest one
max_length = max(map(len, tokenized_texts))
padded_sequences = pad_sequences(tokenized_texts, max_length)

In [19]:
# Map each token id of the padded sequences to a 10-dimensional vector
embedded_texts = embed_text(
    padded_sequences,
    input_dim=vocab_size,
    embedding_dim=10,
)



In [22]:
# Inspect the embedding output for the entry at index 1 (one row per token position)
embedded_texts[1]

array([[-4.13022935e-04,  2.82572843e-02, -2.99137719e-02,
         3.06217708e-02,  4.08312194e-02,  4.28198613e-02,
         7.62728602e-03,  1.42006949e-03,  4.89885323e-02,
        -2.43740808e-02],
       [ 2.81171240e-02,  2.10064910e-02,  4.34445627e-02,
         4.89004701e-03, -1.24944672e-02, -4.48822975e-05,
         1.16658807e-02, -3.01652085e-02,  4.38703187e-02,
         4.86597531e-02],
       [ 4.01971228e-02,  6.20117038e-03,  7.24303722e-03,
        -4.94319201e-02,  2.52356268e-02,  3.17774750e-02,
         1.73562057e-02, -1.08530298e-02, -2.85205133e-02,
        -4.95767705e-02],
       [ 1.23345740e-02,  1.25705861e-02,  2.74707712e-02,
         3.15669067e-02,  1.50453709e-02, -1.60153955e-03,
         1.55238248e-02, -1.23540163e-02, -2.34242529e-03,
        -2.55083088e-02],
       [-1.43895745e-02,  2.45445706e-02, -1.90626383e-02,
        -5.71340322e-03,  2.29596533e-02, -1.45327449e-02,
        -4.14418466e-02,  3.75697128e-02,  3.12371179e-03,
        -3.

In [20]:
# Persist the computed embeddings to disk in NumPy's .npy format
save_embedding(embeddings=embedded_texts, file_path='embedding_result.npy')