In this notebook, I'll create the data file to use in a sentiment analysis pipeline.

The chosen dataset, the Stanford Sentiment Treebank, stores review snippets in a file called `dictionary.txt` and their respective sentiment values in `sentiment_labels.txt`. Here, I shall combine these pieces of information into a dictionary that maps reviews to values, and then convert the dictionary into a .csv file.

In [1]:
import re
import csv

In [2]:
# Parses dictionary.txt (one "<snippet>|<id>" entry per line) into records of the form
# {'id': int, 'value': snippet}, orders them by id, and derives one {id: snippet}
# dictionary per record in sorted_dict_list.
review_list = []
search = r'[\n]'  # newline pattern; defined here and reused when the sentiment file is parsed

# opens the review file
# NOTE(review): no encoding is passed to open(), so decoding follows the platform default
with open('stanfordSentimentTreebank/dictionary.txt') as review_file:
    for row in review_file:
        if not row.strip():
            continue  # tolerate blank lines (e.g. an empty last line) instead of failing on them

        # split on the LAST '|' only, so a '|' inside the snippet text cannot corrupt the id
        snippet_text, _, raw_id = row.rpartition('|')

        review_list.append({
            'id': int(re.sub(search, '', raw_id)),  # id is followed by \n, which we want to strip
            'value': snippet_text,
        })

# sorts list of dicts by id value
review_list = sorted(review_list, key=lambda record: record['id'])

# builds NEW one-key dictionaries {id: snippet}; the records in review_list are left intact,
# so this step can be re-run without a KeyError and the two lists no longer alias each other.
# Loop names avoid `dict` and `id` so the builtins are not rebound in the kernel namespace.
sorted_dict_list = [{record['id']: record['value']} for record in review_list]

print(len(review_list))
print(len(sorted_dict_list))

239232
239232


In [4]:
# Parses sentiment_labels.txt (one "<id>|<score>" entry per line; the first line is skipped,
# presumably a header row) into records of the form {'id': int, 'score': str}, orders them
# by id, and derives one {id: score} dictionary per record in sorted_sentiment_list.
sentiment_list = []

# opens the sentiment value file
# NOTE(review): no encoding is passed to open(), so decoding follows the platform default
with open('stanfordSentimentTreebank/sentiment_labels.txt') as sentiment_file:
    next(sentiment_file, None)  # skip the first line without building a throwaway record for it

    for sent_row in sentiment_file:
        if not sent_row.strip():
            continue  # tolerate blank lines (e.g. an empty last line)

        fields = sent_row.split('|')
        sentiment_list.append({
            'id': int(fields[0]),             # equivalent to the review ids
            'score': fields[1].rstrip('\n'),  # score is followed by \n, which we want to strip
        })

# sorts by id so that position i here lines up with position i of the id-sorted review list;
# the scores are later paired with the snippets purely by position, so this must not depend
# on the order of rows in the file
sentiment_list = sorted(sentiment_list, key=lambda record: record['id'])

# builds NEW one-key dictionaries {id: score}; the records in sentiment_list are left intact.
# Loop names avoid `dict` and `id` so the builtins are not rebound in the kernel namespace.
sorted_sentiment_list = [{record['id']: record['score']} for record in sentiment_list]

print(len(sentiment_list))
print(len(sorted_sentiment_list))

239232
239232


In [5]:
# Collects every score stored in the dictionaries of sorted_sentiment_list into a flat list,
# preserving list order. A comprehension is used so that no loop variable named `dict` is
# left behind in the kernel namespace shadowing the builtin.
scores = [
    score
    for entry in sorted_sentiment_list
    for score in entry.values()
]

print(len(scores))
print(type(scores))

239232
<class 'list'>


In [6]:
# Collects every snippet stored in the dictionaries of sorted_dict_list into a flat list,
# preserving list order.
#
# The text is stored unchanged. No manual quoting is needed for the .csv: csv.writer quotes a
# field by itself whenever it contains the delimiter, the quote character or a line break.
# (The former f'${...}' is not a quoting construct in Python; it put a literal '$' in front
# of every snippet.)
# A comprehension is used so that no loop variable named `dict` shadows the builtin.
snippets = [
    text
    for entry in sorted_dict_list
    for text in entry.values()
]

print(len(snippets))
print(type(snippets))

239232
<class 'list'>


In [7]:
# pairs each snippet with the score found at the same position and stores the pairs in a
# dictionary (snippet -> score); a snippet that occurs more than once keeps its last score,
# and pairing stops at the end of the shorter list
final_dictionary = {}
final_dictionary.update(zip(snippets, scores))

I will now convert our dictionary into a .csv file, which I will call `dataset.csv`. The file will display the snippet in the first column and the associated sentiment score in the second column.

In [13]:
# name (and extension) of the output file
csv_file = "dataset.csv"

# newline='' leaves line-ending handling to the csv module
# NOTE(review): no encoding is passed to open(), so the file is written in the platform default
with open(csv_file, 'w', newline='') as out_handle:
    writer = csv.writer(out_handle)
    writer.writerow(['snippet', 'score'])       # header row
    writer.writerows(final_dictionary.items())  # one [snippet, score] row per dictionary entry