### Import Google Spreadsheet

In [4]:
import gspread
from google.oauth2.service_account import Credentials
# OAuth scopes requested for the service account: the Sheets feeds endpoint and Drive.
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
# Build service-account credentials from a JSON key file in the working directory.
# NOTE(review): the key file path is hard-coded; make sure the file itself is not shared
# together with the notebook.
credentials = Credentials.from_service_account_file('./sheet-274815-b5805997d72c.json', scopes=scope)
# Authorise a gspread client with those credentials and open the spreadsheet by its URL.
gc = gspread.authorize(credentials)
sh = gc.open_by_url('https://docs.google.com/spreadsheets/d/1uUxI7ZwIO25GccHz_k6UNMtfvO6SwTuskNBdptV5vyw/edit?usp=sharing')
# Collect the worksheets of the spreadsheet; printed below to show the available names.
worksheet_list = sh.worksheets()

print(worksheet_list)

[<Worksheet 'Bachelor + Master' id:2056597316>, <Worksheet 'Master' id:0>, <Worksheet 'Bachelor' id:1109562875>]


### Select Degree

In [59]:
# Name of the worksheet to process; it is also used further down to build the
# name of the JSON output file.
degree = 'Bachelor + Master'
sheet = sh.worksheet(degree)
print(sheet)
# Fetch all values of the selected worksheet. The dictionary-building loop later
# indexes this as rows[i][column] and starts at index 1.
rows = sheet.get_all_values()

<Worksheet 'Bachelor + Master' id:2056597316>


### Create Dictionary

In [164]:
import time
import string
import json
import glob
import spacy
from spacy import displacy

# Load the spaCy pipeline used to split the thesis texts into sentences and tokens.
nlp = spacy.load("en_core_web_sm")

# Word index built below. Shape:
#   { "<lower-cased token>": { "occurency": <total count>,
#                              "<author>": {"title": ..., "course": ..., "occurency": <count>} } }
dictionary = {}

# Index 0 is skipped (presumably the header row -- TODO confirm against the sheet).
for i, row in enumerate(rows[1:], start=1):

    try:
        # Columns 0 and 1 are joined into the author name; columns 2, 3 and 4
        # are used as course, thesis title and the text to tokenise.
        author = row[0] + ' ' + row[1]
        course = row[2]
        thesisTitle = row[3]
        text = row[4]
        doc = nlp(text)

        for sentence in doc.sents:
            for token in sentence:
                word = str(token).lower()

                # The first sighting of a word creates its entry; the overall
                # counter is then incremented exactly once per token.
                entry = dictionary.setdefault(word, {"occurency": 0})
                entry["occurency"] += 1

                # `author` always contains the joining space, so it can never
                # collide with the "occurency" counter key of the same entry.
                previous = entry.get(author)
                authorCount = 1 if previous is None else previous["occurency"] + 1
                entry[author] = {"title": thesisTitle, "course": course, "occurency": authorCount}

    except Exception as e:
        # Processing stops at the first row that fails. Report which row it was
        # so that a truncated dictionary is not mistaken for a complete run.
        print("Row " + str(i) + " could not be processed: " + str(e))
        break


# The context manager closes the file even if serialisation raises.
with open("./dictionary-" + degree.lower() + ".json", 'w') as output:
    json.dump(dictionary, output)

print("Done!")

Done!


### Sort by most occurring keys

In [203]:
# Flatten the nested word index into a plain {word: total count} mapping,
# which is what the sorting step needs.
clean = {}

for key, entry in dictionary.items():
    # "occurency" on the word entry is the count across all authors.
    occurency = entry["occurency"]
    clean[key] = occurency
    

{'the': 2353, 'movie': 3, 'is': 604, 'a': 1147, 'tale': 1, 'of': 1378, 'life': 49, 'and': 1277, 'spirit': 4, 'objects': 91, '.': 1727, 'in': 695, 'spite': 1, 'their': 156, 'very': 16, 'material': 58, 'physical': 32, 'quality': 11, ',': 1624, 'help': 12, 'us': 76, 'access': 10, 'an': 256, 'invisible': 4, 'world': 94, 'one': 82, 'way': 80, 'to': 1092, 'understand': 23, 'appreciate': 6, 'phenomena': 6, 'through': 102, 'reading': 4, 'study': 8, 'philosophy': 6, 'phenomenology': 1, 'more': 114, 'recently': 2, 'speculative': 4, 'realism': 1, 'granting': 1, 'agency': 2, 'just': 24, 'as': 330, 'importantly': 2, 'recognition': 4, 'object': 29, 'consciousness': 6, 'deeply': 4, 'rooted': 5, 'history': 27, 'making': 37, 'anthropology': 1, 'archeology': 1, 'particular': 4, 'demonstrate': 5, 'that': 364, 'has': 96, 'opened': 3, 'pandora': 1, "'s": 102, 'box': 2, 'cognition': 3, 'mould': 3, 'center': 4, 'point': 16, 'story': 14, 'allegory': 2, 'for': 288, 'design': 61, 'replacing': 2, 'simple': 8, 'm

In [247]:
import csv

# Write one TSV row per word: the word, its total count, and the per-student
# metadata as the string form of a list.
# newline='' leaves line endings to the csv writer (always "\n" here), and the
# context manager closes the file even if a row fails to serialise.
with open("daeWords2021.tsv", 'w', newline='') as output:
    # csv.writer quotes any field containing a tab, a newline or a double quote.
    # Tokens such as '"' or '\n' are ordinary keys of `dictionary`; written raw
    # they would break the row layout for any reader of the file.
    writer = csv.writer(output, delimiter='\t', lineterminator='\n')

    writer.writerow(["word", "occurence", "metadata"])

    # Most frequent words first.
    for (key, value) in sorted(clean.items(), key=lambda x: x[1], reverse=True):

        infos = []

        for student, details in dictionary[key].items():
            # "occurency" holds the word's total count, not a per-student record.
            if student == "occurency":
                continue

            title = details["title"]
            course = details["course"]
            occurency = details["occurency"]

            # Entries whose title contains "..." are left out of the metadata.
            if "..." in title:
                continue

            infos.append(["student: " + student, "title: " + title, "course: " + course, "occurency: " + str(occurency)])

        writer.writerow([key, str(value), str(infos)])

In [248]:
import csv
from xlsxwriter.workbook import Workbook

# Convert the tab-separated word list into an Excel workbook.

tsv_file = 'daeWords2021.tsv'
xlsx_file = 'daeWords2021.xlsx'

# Create an XlsxWriter workbook object and add a worksheet.
workbook = Workbook(xlsx_file)
worksheet = workbook.add_worksheet()

# Open the TSV with newline='' so the csv module does its own line-ending
# handling (needed for quoted fields that span lines); the context manager
# guarantees the input file is closed once the rows have been copied.
with open(tsv_file, 'rt', newline='') as tsv_handle:
    # Create a TSV file reader.
    tsv_reader = csv.reader(tsv_handle, delimiter='\t')

    # Read the row data from the TSV file and write it to the XLSX file,
    # one worksheet row per TSV record, starting at column 0.
    for row, data in enumerate(tsv_reader):
        worksheet.write_row(row, 0, data)

# Close the XLSX file.
workbook.close()