# Individualize Reviews from Bulk CSV to JSON
This script converts the bulk CSV files into individual JSON files, one per review, so that they can be sent to blob or other bulk storage. Each output file is a valid JSON document containing exactly one record.

In [47]:
import os
import pandas as pd
import glob
import spacy

In [48]:
# Make the sibling `csv` folder (relative to the current working directory)
# the working directory, so relative paths resolve against it from here on.
csv_directory = '../csv'
os.chdir(csv_directory)

In [49]:
# Load every CSV file found in the current working directory into its own
# DataFrame. glob.glob() returns matches in arbitrary, platform-dependent
# order, so the file names are sorted to keep the order of `csv_dataframes`
# reproducible from run to run and machine to machine.
list_of_csvs = sorted(glob.glob('*.csv'))
# csv_dataframes[i] holds the parsed contents of list_of_csvs[i].
csv_dataframes = [pd.read_csv(csv_file) for csv_file in list_of_csvs]

In [50]:
# Column labels applied, in this order, to every loaded DataFrame.
column_names = [
    'r_id',
    'of_title',
    'r_date',
    'score',
    'review',
    'is_english',
]
# Replace each frame's header in place so all frames share the same labels.
for frame in csv_dataframes:
    frame.columns = column_names

In [51]:
# Run the `r_date` column of every frame through pd.to_datetime and store the
# converted values back into the same column.
for frame in csv_dataframes:
    parsed_dates = pd.to_datetime(frame['r_date'])
    frame['r_date'] = parsed_dates

In [52]:
# Module-level accumulators, all starting out empty; they collect the
# per-review NLP results so the records can be transformed later.
spacy_tokens_list_of_lists = []            # unfiltered spaCy tokens
spacy_filtered_tokens_list_of_lists = []   # tokens with stop words removed
list_of_lemma_lists = []                   # token/lemma description strings
sentiment_polarity_list = []               # NOTE(review): not filled by the code visible here
lists_of_named_entities = []               # NOTE(review): not filled by the code visible here

In [None]:
# Load the "en_core_web_sm" spaCy pipeline once and reuse it for every review.
nlp = spacy.load("en_core_web_sm")
for review_set in csv_dataframes:
    for index, review in review_set.iterrows():
        try:
            text_to_analyze_1 = review['review']
            analyze_1 = nlp(text_to_analyze_1)
            # all tokens of this individual review, in document order
            token_list = list(analyze_1)
            # the same tokens with stop words removed
            filtered_tokens = [token for token in analyze_1 if not token.is_stop]
            # one human-readable "token -> lemma" string per filtered token
            lemmas = [
                f'Token: {token}, lemma: {token.lemma_}'
                for token in filtered_tokens
            ]
        except Exception as e:
            # Best effort: report the problem and keep going. Presumably this
            # is hit when `review` is not a string (e.g. a missing value).
            # Empty placeholders are recorded so that every master list still
            # receives exactly one entry per review row and stays aligned
            # with the DataFrames.
            print(e)
            token_list, filtered_tokens, lemmas = [], [], []
        # append (rather than +=) so each review contributes ONE inner list;
        # += would flatten all reviews into a single list of tokens and lose
        # the per-review grouping the "list of lists" names promise.
        spacy_tokens_list_of_lists.append(token_list)
        spacy_filtered_tokens_list_of_lists.append(filtered_tokens)
        list_of_lemma_lists.append(lemmas)