# Preprocess Combined Data
Using the combined QBJ as input, perform data preprocessing including:

1. Expand Contractions, Tokenize, and Convert to Lowercase
1. Remove Punctuation
1. Remove Stop Words
1. Parts of Speech (POS) Tagging
1. Lemmatize
1. Stemming
1. Create Bag of Words (BOW)
1. Calculate Term Frequency-Inverse Document Frequency (TF-IDF)
1. Sentencize
     1. Lemmatize Sentences
     1. Stem Sentences


In [1]:
! pip install --quiet --upgrade contractions
! pip install --quiet --upgrade nltk
! python -m nltk.downloader --quiet 'all'



In [2]:
import pandas as pd
import os
import csv

# Identify the working directory and data files
working_directory = "./16-Preprocess-Combined-Data"

# Create the working directory if needed
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")

In [3]:
# Readthe combined data into a dataframe
data_file = "./15-Combine-2020-2021-Stratified-Data/qbj_data_combined.csv"

# Read the data into a pandas dataframe
df = pd.read_csv(
    data_file,  # The data file being read, from the variable assignment above
    on_bad_lines="warn",  # This tells Pandas to only warn on bad lines vs causing an error
    dtype="str",
)  # This tells Pandas to treat all numbers as words

df.fillna("", inplace=True)

In [4]:
df.shape

(5736, 13)

In [5]:
df.head(2)

Unnamed: 0,ROW_ID,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,GENERIC_NAME,DEVICE_REPORT_PRODUCT_CODE,UDI-DI,UDI-PUBLIC,DATE_OF_EVENT,REPORTER_OCCUPATION_CODE,REPORT_DATE,EVENT_LOCATION,SOURCE_TYPE
0,1969025,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,,,07/30/2020,0,,I,CONSUMER
1,1426265,IT WAS REPORTED THAT SIGNAL LOSS OVER ONE HOUR...,3283,Wireless Communication Problem,CONTINUOUS GLUCOSE MONITOR,QBJ,386270000385.0,386270000385.0,06/05/2020,0,,I,CONSUMER


## Assign a Row ID for Verification
Assign a value to a variable that identifies a row from the dataset.  

This will allow the same row to be used for verification of each preprocessing step.

In [6]:
verification_row = 9

## Load the Natural Language Toolkit (NLTK) and Preprocessing Libraries

In [7]:
# Import the NLTK library
import nltk  # If this step fails, rerun 07-Install-NLTK.ipynb
import string
import contractions

## 1. Expand Contractions, Tokenize, and Convert to Lowercase

In [8]:
# This approach takes the FOI_TEXT as a string and creates a new column with tokens
# It removes contractions _and_ tokenizes at the same time
# No additional function is needed, x.split tokenizes the string (FOI text) at every space
# A call to lower() converts the word to lowercase

df["TOKENIZED_TEXT"] = df["FOI_TEXT"].apply(
    lambda x: [contractions.fix(word).lower() for word in x.split()]
)
df["TOKENIZED_TEXT"].head()

0    [it, was, reported, that, the, transmitter, lo...
1    [it, was, reported, that, signal, loss, over, ...
2    [it, was, reported, that, transmitter, failed,...
3    [it, was, reported, that, signal, loss, over, ...
4    [it, was, reported, that, signal, loss, over, ...
Name: TOKENIZED_TEXT, dtype: object

In [9]:
df["TOKENIZED_TEXT"][verification_row]

['it',
 'was',
 'reported',
 'that',
 'signal',
 'loss',
 'over',
 'one',
 'hour',
 'occurred.',
 'no',
 'product',
 'or',
 'data',
 'was',
 'provided',
 'for',
 'evaluation.',
 'confirmation',
 'of',
 'the',
 'allegation',
 'and',
 'a',
 'probable',
 'because',
 'could',
 'not',
 'be',
 'determined.',
 'no',
 'injury',
 'or',
 'medical',
 'intervention',
 'was',
 'reported.']

## 2. Remove Punctuation

In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# Define a function to remove punctuation in the data
def remove_punctuation(text):
    text = "".join(
        [character for character in text if character not in string.punctuation]
    )
    return text


df["NOPUNCT_TEXT"] = df["TOKENIZED_TEXT"].apply(
    lambda x: [remove_punctuation(word) for word in x]
)
df["NOPUNCT_TEXT"].head()

0    [it, was, reported, that, the, transmitter, lo...
1    [it, was, reported, that, signal, loss, over, ...
2    [it, was, reported, that, transmitter, failed,...
3    [it, was, reported, that, signal, loss, over, ...
4    [it, was, reported, that, signal, loss, over, ...
Name: NOPUNCT_TEXT, dtype: object

In [12]:
df["NOPUNCT_TEXT"][verification_row]

['it',
 'was',
 'reported',
 'that',
 'signal',
 'loss',
 'over',
 'one',
 'hour',
 'occurred',
 'no',
 'product',
 'or',
 'data',
 'was',
 'provided',
 'for',
 'evaluation',
 'confirmation',
 'of',
 'the',
 'allegation',
 'and',
 'a',
 'probable',
 'because',
 'could',
 'not',
 'be',
 'determined',
 'no',
 'injury',
 'or',
 'medical',
 'intervention',
 'was',
 'reported']

## 3. Remove Stop Words

In [13]:
stopwords = nltk.corpus.stopwords.words("english")


# Define a function to convert to lowercase and remove stopwords
def remove_stopwords(tokenized_text):
    text = [word for word in tokenized_text if word.lower() not in stopwords]
    return text


df["NOSTOPWORDS_TEXT"] = df["NOPUNCT_TEXT"].apply(lambda x: remove_stopwords(x))
df["NOSTOPWORDS_TEXT"].head()

0    [reported, transmitter, lost, connection, pump...
1    [reported, signal, loss, one, hour, occurred, ...
2    [reported, transmitter, failed, error, occurre...
3    [reported, signal, loss, one, hour, occurred, ...
4    [reported, signal, loss, one, hour, occurred, ...
Name: NOSTOPWORDS_TEXT, dtype: object

In [14]:
df["NOSTOPWORDS_TEXT"][verification_row]

['reported',
 'signal',
 'loss',
 'one',
 'hour',
 'occurred',
 'product',
 'data',
 'provided',
 'evaluation',
 'confirmation',
 'allegation',
 'probable',
 'could',
 'determined',
 'injury',
 'medical',
 'intervention',
 'reported']

## 4. Parts of Speech (POS) Tagging

In [15]:
# Apply the nltk.pos_tag() function to each row of the TOKENIZED_TEXT column
# pos_tag returns a Tuple for each word consisting of the word and its classification
# TODO: List classifications and their abbreviations
df["POS_TEXT"] = df["NOSTOPWORDS_TEXT"].apply(nltk.pos_tag)
df["POS_TEXT"].head()

0    [(reported, VBN), (transmitter, NN), (lost, VB...
1    [(reported, VBN), (signal, JJ), (loss, NN), (o...
2    [(reported, VBN), (transmitter, NN), (failed, ...
3    [(reported, VBN), (signal, JJ), (loss, NN), (o...
4    [(reported, VBN), (signal, JJ), (loss, NN), (o...
Name: POS_TEXT, dtype: object

In [16]:
df["POS_TEXT"][verification_row]

[('reported', 'VBN'),
 ('signal', 'JJ'),
 ('loss', 'NN'),
 ('one', 'CD'),
 ('hour', 'NN'),
 ('occurred', 'VBD'),
 ('product', 'NN'),
 ('data', 'NNS'),
 ('provided', 'VBD'),
 ('evaluation', 'NN'),
 ('confirmation', 'NN'),
 ('allegation', 'NN'),
 ('probable', 'NN'),
 ('could', 'MD'),
 ('determined', 'VB'),
 ('injury', 'VB'),
 ('medical', 'JJ'),
 ('intervention', 'NN'),
 ('reported', 'VBD')]

## 5. Lemmatization

In [17]:
from nltk.stem import WordNetLemmatizer


# define a function to lemmatize each word in a text list based on its POS tag
def lemmatize_text(pos_tagged_text):
    # initialize WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    # map NLTK's POS tags to WordNet's POS tags
    # TODO: list the abbreviations for WordNet's parts of speech
    pos_map = {"N": "n", "V": "v", "R": "r", "J": "a"}

    # lemmatize each word in the text list based on its POS tag
    lemmatized_text = []

    for word, pos in pos_tagged_text:
        # get the first character of the POS tag to use as the WordNet POS tag
        #
        # Set the WordNetLemmatizer default to Nouns ('n') or Verbs ('v')
        #
        wn_pos = pos_map.get(pos[0], "n")

        # lemmatize the word and append it to the lemmatized text list
        lemmatized_word = lemmatizer.lemmatize(word, pos=wn_pos)
        lemmatized_text.append(lemmatized_word)

    # return the lemmatized text list
    return lemmatized_text


# apply the lemmatize_text function to each row of the dataframe
df["LEMMATIZED_TEXT"] = df["POS_TEXT"].apply(lemmatize_text)
df["LEMMATIZED_TEXT"].head()

0    [report, transmitter, lose, connection, pump, ...
1    [report, signal, loss, one, hour, occur, produ...
2    [report, transmitter, fail, error, occur, data...
3    [report, signal, loss, one, hour, occur, revie...
4    [report, signal, loss, one, hour, occur, produ...
Name: LEMMATIZED_TEXT, dtype: object

## 6. Stemming

In [18]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


# define a function to stem each word in a text list
def stem_words(pos_tagged_text):
    stemmer = PorterStemmer()

    stemmed_text = []

    for word, pos in pos_tagged_text:
        # stem the word and append it to the stemmed text list
        stemmed_word = stemmer.stem(word)
        stemmed_text.append(stemmed_word)

    # return the stemmed text list
    return stemmed_text


df["STEMMED_TEXT"] = df["POS_TEXT"].apply(stem_words)
df["STEMMED_TEXT"].head()

0    [report, transmitt, lost, connect, pump, great...
1    [report, signal, loss, one, hour, occur, produ...
2    [report, transmitt, fail, error, occur, data, ...
3    [report, signal, loss, one, hour, occur, revie...
4    [report, signal, loss, one, hour, occur, produ...
Name: STEMMED_TEXT, dtype: object

## Compare the results of lemmatization and stemming

In [19]:
compare_lemma_stem_df = pd.DataFrame(
    {
        "WORD, PART OF SPEECH": df["POS_TEXT"][verification_row],
        "LEMMA": df["LEMMATIZED_TEXT"][verification_row],
        "STEM": df["STEMMED_TEXT"][verification_row],
    }
)

compare_lemma_stem_df = compare_lemma_stem_df.style.set_properties(
    **{"text-align": "left"}
)
compare_lemma_stem_df = compare_lemma_stem_df.set_table_styles(
    [dict(selector="th", props=[("text-align", "left")])]
)
compare_lemma_stem_df

Unnamed: 0,"WORD, PART OF SPEECH",LEMMA,STEM
0,"('reported', 'VBN')",report,report
1,"('signal', 'JJ')",signal,signal
2,"('loss', 'NN')",loss,loss
3,"('one', 'CD')",one,one
4,"('hour', 'NN')",hour,hour
5,"('occurred', 'VBD')",occur,occur
6,"('product', 'NN')",product,product
7,"('data', 'NNS')",data,data
8,"('provided', 'VBD')",provide,provid
9,"('evaluation', 'NN')",evaluation,evalu


## 7. Create Bag of Words (BOW)

In [20]:
## 7. Create Bag of Words (BOW)
from sklearn.feature_extraction.text import CountVectorizer

# create a CountVectorizer object
count_vectorizer = CountVectorizer()

# fit the vectorizer to the text data
count_vectorizer.fit(df["LEMMATIZED_TEXT"].apply(lambda x: " ".join(x)))

# create a bag of words matrix
bow_matrix = count_vectorizer.transform(
    df["LEMMATIZED_TEXT"].apply(lambda x: " ".join(x))
)

# convert the bag of words matrix to a DataFrame
bow_df = pd.DataFrame(
    bow_matrix.toarray(), columns=count_vectorizer.get_feature_names_out()
)

In [21]:
bow_df.shape

(5736, 1197)

In [22]:
bow_df.head()
# TODO: Plot the BOW results (?)

Unnamed: 0,021,021vdc,03142020,045,06142020,07032020,07312020,0v,0vdc,10,...,work,would,x2,xray,year,yellow,yes,yet,zero,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 8. Calculate Term Frequency-Inverse Document Frequency (TF-IDF)

In [23]:
## 8. Calculate Term Frequency-Inverse Document Frequency (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# create a CountVectorizer object and fit it to the text data
tfidf_vectorizer = TfidfVectorizer()

X = tfidf_vectorizer.fit_transform(df["LEMMATIZED_TEXT"].apply(lambda x: " ".join(x)))

# convert the sparse matrix to a DataFrame
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [24]:
tfidf_df.shape

(5736, 1197)

In [25]:
tfidf_df.head()

Unnamed: 0,021,021vdc,03142020,045,06142020,07032020,07312020,0v,0vdc,10,...,work,would,x2,xray,year,yellow,yes,yet,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 9. Sentencize
The `FOI_TEXT` can be processed as sentences.

For further analysis, each sentence needs to be associated with the `FOI_TEXT` row that it came from.

[This discussion from Stack Overflow](https://stackoverflow.com/a/43922444/2308522) provides a suggestion for breaking the code into a dataframe of sentences with each sentence retaining the ID of the row where it was originally located.

[This page from the Pandas documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.itertuples.html) provides details on using the `itertuples()` function to process the rows of the dataframe.

In [26]:
sentences = []

# Using itertuples(), the columns must be addressed using thier position.
# Here's a map of position to name:
# row[1]: ROW_ID
# row[2]: FOI_TEXT
# row[3]: DEVICE_PROBLEM_CODE
# row[4]: DEVICE_PROBLEM_TEXT
for row in df.itertuples():
    for sentence in row[2].split("."):
        if sentence != "":
            sentences.append([row[1], row[3], row[4], sentence])

sentences_df = pd.DataFrame(
    sentences,
    columns=[
        "ROW_ID",
        "DEVICE_PROBLEM_CODE",
        "DEVICE_PROBLEM_TEXT",
        "SENTENCIZED_FOI_TEXT",
    ],
)

compare_lemma_stem_df

Unnamed: 0,"WORD, PART OF SPEECH",LEMMA,STEM
0,"('reported', 'VBN')",report,report
1,"('signal', 'JJ')",signal,signal
2,"('loss', 'NN')",loss,loss
3,"('one', 'CD')",one,one
4,"('hour', 'NN')",hour,hour
5,"('occurred', 'VBD')",occur,occur
6,"('product', 'NN')",product,product
7,"('data', 'NNS')",data,data
8,"('provided', 'VBD')",provide,provid
9,"('evaluation', 'NN')",evaluation,evalu


In [27]:
sentences_df.shape

(25765, 4)

In [28]:
sentences_df.head(3)

Unnamed: 0,ROW_ID,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,SENTENCIZED_FOI_TEXT
0,1969025,3283,Wireless Communication Problem,IT WAS REPORTED THAT THE TRANSMITTER LOST CONN...
1,1969025,3283,Wireless Communication Problem,THE TRANSMITTER ULTIMATELY REGAINED CONNECTIO...
2,1969025,3283,Wireless Communication Problem,NO ADDITIONAL PATIENT OR EVENT INFORMATION WA...


In [29]:
sentences_df["SENTENCIZED_FOI_TEXT"][0]

'IT WAS REPORTED THAT THE TRANSMITTER LOST CONNECTION WITH THE PUMP FOR GREATER THAN 1 HOUR'

In [30]:
# Expand Contractions, Tokenize, and Convert to Lowercase
sentences_df["TOKENIZED_SENTENCES"] = sentences_df["SENTENCIZED_FOI_TEXT"].apply(
    lambda x: [contractions.fix(word).lower() for word in x.split()]
)

In [31]:
sentences_df["TOKENIZED_SENTENCES"][0]

['it',
 'was',
 'reported',
 'that',
 'the',
 'transmitter',
 'lost',
 'connection',
 'with',
 'the',
 'pump',
 'for',
 'greater',
 'than',
 '1',
 'hour']

In [32]:
# Remove punctuation
sentences_df["NOPUNCT_SENTENCES"] = sentences_df["TOKENIZED_SENTENCES"].apply(
    lambda x: [remove_punctuation(word) for word in x]
)
sentences_df["NOPUNCT_SENTENCES"][0]

['it',
 'was',
 'reported',
 'that',
 'the',
 'transmitter',
 'lost',
 'connection',
 'with',
 'the',
 'pump',
 'for',
 'greater',
 'than',
 '1',
 'hour']

In [33]:
# Remove stop words
sentences_df["NOSTOPWORDS_SENTENCES"] = sentences_df["NOPUNCT_SENTENCES"].apply(
    lambda x: remove_stopwords(x)
)
sentences_df["NOSTOPWORDS_SENTENCES"][0]

['reported',
 'transmitter',
 'lost',
 'connection',
 'pump',
 'greater',
 '1',
 'hour']

In [34]:
# Apply POS Tagging
sentences_df["POS_SENTENCES"] = sentences_df["NOSTOPWORDS_SENTENCES"].apply(
    nltk.pos_tag
)
sentences_df["POS_SENTENCES"][0]

[('reported', 'VBN'),
 ('transmitter', 'NN'),
 ('lost', 'VBN'),
 ('connection', 'NN'),
 ('pump', 'NN'),
 ('greater', 'JJR'),
 ('1', 'CD'),
 ('hour', 'NN')]

In [35]:
# Define a function to join tokens that have been lemmatized and stemmed
def join_tokenized_sentence(tokens):
    joined_words = []

    for word in tokens:
        joined_words.append(word)

    # Join the stemmed words back into a sentence
    return " ".join(joined_words)

## 9.A Lemmatize Sentences

In [36]:
sentences_df["TOKEN_LEMMATIZED_SENTENCES"] = sentences_df["POS_SENTENCES"].apply(
    lemmatize_text
)
sentences_df["TOKEN_LEMMATIZED_SENTENCES"][0]

['report', 'transmitter', 'lose', 'connection', 'pump', 'great', '1', 'hour']

In [37]:
sentences_df["LEMMATIZED_SENTENCES"] = sentences_df["TOKEN_LEMMATIZED_SENTENCES"].apply(
    join_tokenized_sentence
)
sentences_df["LEMMATIZED_SENTENCES"][0]

'report transmitter lose connection pump great 1 hour'

## 9.B Stem Sentences

In [38]:
# create a new column called 'STEMMED_SENTENCES'
sentences_df["TOKEN_STEMMED_SENTENCES"] = sentences_df["POS_SENTENCES"].apply(
    stem_words
)
sentences_df["TOKEN_STEMMED_SENTENCES"][0]

['report', 'transmitt', 'lost', 'connect', 'pump', 'greater', '1', 'hour']

In [39]:
sentences_df["STEMMED_SENTENCES"] = sentences_df["TOKEN_STEMMED_SENTENCES"].apply(
    join_tokenized_sentence
)
sentences_df["STEMMED_SENTENCES"][0]

'report transmitt lost connect pump greater 1 hour'

## Review the preprocessed data

In [40]:
# Create a new dataframe with just one row containing the column names
column_names_df = pd.DataFrame(
    {
        "DF COLUMN NAMES": df.columns,
    }
)

example = []

for col in df.columns:
    example.append(df[col][0])

column_names_df["EXAMPLE"] = example

# Format output so that the column headers and data are aligned to the left
column_names_df = column_names_df.style.set_properties(**{"text-align": "left"})
column_names_df = column_names_df.set_table_styles(
    [dict(selector="th", props=[("text-align", "left")])]
)
column_names_df

Unnamed: 0,DF COLUMN NAMES,EXAMPLE
0,ROW_ID,1969025
1,FOI_TEXT,IT WAS REPORTED THAT THE TRANSMITTER LOST CONNECTION WITH THE PUMP FOR GREATER THAN 1 HOUR. THE TRANSMITTER ULTIMATELY REGAINED CONNECTION WITH THE PUMP. NO ADDITIONAL PATIENT OR EVENT INFORMATION WAS AVAILABLE.
2,DEVICE_PROBLEM_CODE,3283
3,DEVICE_PROBLEM_TEXT,Wireless Communication Problem
4,GENERIC_NAME,CONTINUOUS GLUCOSE MONITOR
5,DEVICE_REPORT_PRODUCT_CODE,QBJ
6,UDI-DI,
7,UDI-PUBLIC,
8,DATE_OF_EVENT,07/30/2020
9,REPORTER_OCCUPATION_CODE,000


In [41]:
# Create a new dataframe with just one row containing the column names
column_names_df = pd.DataFrame(
    {
        "SENTENCES DF COLUMN NAMES": sentences_df.columns,
    }
)

example = []

for col in sentences_df.columns:
    example.append(sentences_df[col][0])

column_names_df["EXAMPLE"] = example

# Format output so that the column headers and data are aligned to the left
column_names_df = column_names_df.style.set_properties(**{"text-align": "left"})
column_names_df = column_names_df.set_table_styles(
    [dict(selector="th", props=[("text-align", "left")])]
)
column_names_df

Unnamed: 0,SENTENCES DF COLUMN NAMES,EXAMPLE
0,ROW_ID,1969025
1,DEVICE_PROBLEM_CODE,3283
2,DEVICE_PROBLEM_TEXT,Wireless Communication Problem
3,SENTENCIZED_FOI_TEXT,IT WAS REPORTED THAT THE TRANSMITTER LOST CONNECTION WITH THE PUMP FOR GREATER THAN 1 HOUR
4,TOKENIZED_SENTENCES,"['it', 'was', 'reported', 'that', 'the', 'transmitter', 'lost', 'connection', 'with', 'the', 'pump', 'for', 'greater', 'than', '1', 'hour']"
5,NOPUNCT_SENTENCES,"['it', 'was', 'reported', 'that', 'the', 'transmitter', 'lost', 'connection', 'with', 'the', 'pump', 'for', 'greater', 'than', '1', 'hour']"
6,NOSTOPWORDS_SENTENCES,"['reported', 'transmitter', 'lost', 'connection', 'pump', 'greater', '1', 'hour']"
7,POS_SENTENCES,"[('reported', 'VBN'), ('transmitter', 'NN'), ('lost', 'VBN'), ('connection', 'NN'), ('pump', 'NN'), ('greater', 'JJR'), ('1', 'CD'), ('hour', 'NN')]"
8,TOKEN_LEMMATIZED_SENTENCES,"['report', 'transmitter', 'lose', 'connection', 'pump', 'great', '1', 'hour']"
9,LEMMATIZED_SENTENCES,report transmitter lose connection pump great 1 hour


## Save the preproecssed data

In [42]:
df.to_csv(f"{working_directory}/preprocessed_data.csv", index=False)

bow_df.to_csv(f"{working_directory}/bag_of_words_data.csv", index=False)

tfidf_df.to_csv(f"{working_directory}/tfidf_data.csv", index=False)

sentences_df.to_csv(f"{working_directory}/sentences_data.csv", index=False)

## Upload All Output to an S3 Bucket

In [44]:
import os
import subprocess

# Create the upload command using the AWS command line interface
command = [
    "aws",
    "s3",
    "sync",
    working_directory,
    f"s3://praxis-2023-html-output",
    "--exclude",
    f"*/.ipynb_checkpoints/*",
    "--no-progress",
]

# Run the command and wait for it to complete
output = subprocess.run(command, capture_output=True, text=True)

# Print the command's output
print(output.stdout)
print("fin")


fin
