# Word Properties

This notebook will demonstrate how to use dictionaries to call in lexical values for words and calculate average scores across texts for

    - Frequency
    - Concreteness

In [1]:
# Call in packages

import spacy

nlp = spacy.load("en_core_web_sm")

import pandas as pd

import csv


In [2]:
# Load the Drive helper
from google.colab import drive

# Below will prompt for authorization but it will make your google drive available (i.e., mount your drive).
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# After executing the cell above, Drive files will be present in "/content/drive/My Drive".
!ls "/content/drive/My Drive" #(ls is list directory)

'April 15th Update.gslides'
 archive
'Colab Notebooks'
'CS5891 Course Project Initial Proposal.gdoc'
'CS5891 final presentation _Yan Zhang, Kun Peng, Jules Zou.gslides'
'CS5891-final report.gdoc'
'CS5891 mid status presentation _Yan Zhang, Kun Peng, Jules Zou.gslides'
'CS5891 paper presentation.gslides'
'CS5891 project proposal.gslides'
 CS8395-08.gdoc
 dataset
'from __future__ import absolute_import'$'\n''from __future__ import division'$'\n''from __future__ import print_function'$'\n\n''import _init_p~01'\'')'$'\n''        numers = [1, 1, 3, 5, 35]'$'\n''        denoms = [1, 6, 40, 112, 1152]'$'\n''        ans = fractions.Fraction('\''0'\'')'$'\n''        .gdoc'
'Kun Peng'
'Kun Peng-CS8395-hw1 discussion session.gdoc'
 main.py
'Outline for Final paper.gdoc'
'Paper Outline.gdoc'
'PengKun182024TaxDocs (1).gdoc'
 PengKun182024TaxDocs.gdoc
 PengKun182024TaxDocs.pdf
 PengKun932023TaxDocs.pdf
'Project proposal.gdoc'
 renate_config.py
 results
'Screen Recording 2023-11-15 at 11.29.13 AM.mov

In [4]:
#find out where you are and move to correct location
import os #package for figuring out operating system

os.getcwd() #what is the current working directory

#os.listdir() #what is in currrent working directory

os.chdir("/content/drive/MyDrive/Colab Notebooks/DS_5780_spring_25/word_properties") #change directory

os.listdir() #dataframes are there

['MRC_database.csv', 'hyper_poly_dataframe.csv', 'elp_database.csv']

## Create practice dataframe for texts

In [5]:
# initialize list with practice sentence
prac_sent = [
    "The diminutive canine frolicked with unbridled exuberance on the lush, verdant lawn.",
    "The friendly dog played joyfully on the green, grassy field.",
    "The small pet ran quickly around on the soft, green sujim grass."
    ]

# Create the pandas DataFrame with column name
prac_df = pd.DataFrame(prac_sent, columns=['text'])

#lower case everything
#prac_df['text'] = prac_df['text'].str.lower()

prac_df['frequency'] = ["infreq", "normal", "freq"]

# print dataframe
prac_df

Unnamed: 0,text,frequency
0,The diminutive canine frolicked with unbridled...,infreq
1,"The friendly dog played joyfully on the green,...",normal
2,"The small pet ran quickly around on the soft, ...",freq


In [6]:
#spaCy texts

prac_df_docs = list(nlp.pipe(prac_df.text))

## Read in DF as Dictionary

We will start with the ELP dataframe

    - elp_database.csv

In [7]:

# Load the dataframe and use the first column ("#Word") as the index
elp_df = pd.read_csv('elp_database.csv', index_col=0)

# To get a list of values for each word
# First transpose with .T to make each word a column
# Then send the list of values under each column to a dictionary
elp_dict = elp_df.T.to_dict('list')

# Notice the eror message -- one word is repeated in the dataframe
# This could be a helpful warning message, but we will ignore it
print("'ah' is repeated")
display(elp_df[elp_df.index.duplicated(keep=False)])

  elp_dict = elp_df.T.to_dict('list')


'ah' is repeated


Unnamed: 0_level_0,Freq_HAL,Log_Freq_HAL,Ortho_N,Phono_N,Phono_N_H,OG_N,OG_N_H,Freq_N,Freq_N_P,Freq_N_PH,...,BG_Mean,I_Mean_RT,I_Zscore,I_SD,I_Mean_Accuracy,I_NMG_Mean_RT,I_NMG_Zscore,I_NMG_SD,I_NMG_Mean_Accuracy,Conc.M
#Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ah,1,0.0,2,1,1,1,2.08,3.48,2.08,2.08,...,2234.0,,,,,,,,,
ah,22390,10.02,15,17,17,4,9.71,10.79,9.75,9.75,...,226.0,829.48,0.07,397.16,0.79,619.65,-0.58,119.68,1.0,


In [8]:
# Notice that the types are floats here

print(elp_dict["happy"])

[70881.0, 11.17, 6.0, 9.0, 9.0, 4.0, 5.3, 4.67, 5.38, 5.38, 5.3, 5.3, 1.8, 7.2, 1.75, 7.32, 1323.5, 536.27, -0.88, 82.14, 1.0, 555.93, -0.75, 97.35, 1.0, 2.56]


In [9]:
# Read .csv into a dictionary

elp_dict = {} #empty dictionary

with open('elp_database.csv', 'r') as file: #read in file
    reader = csv.reader(file) # create a csv.reader
    next(reader)  # skip header
    for row in reader:
        key = row[0] # assign first element as key
        values = row[1:] # assign rest as values
        elp_dict[key] = values # add key and values to dictionary

In [10]:
print(len(elp_dict)) # number of words in dictionary

79671


In [11]:
# what do the values for "happy" look like?

for key, val in elp_dict.items():
    if key == "happy":
        print(val)

# note that the values are strings because csv.reader doesn't infer data types, and CSV files do not contain type annotations

print("==" *40)

# this approach is more efficient because Python does not go through every key...
value = elp_dict.get('happy')
print(value)

['70881', '11.17', '6', '9', '9', '4', '5.3', '4.67', '5.38', '5.38', '5.3', '5.3', '1.8', '7.2', '1.75', '7.32', '1323.5', '536.27', '-0.88', '82.14', '1', '555.93', '-0.75', '97.35', '1', '2.56']
['70881', '11.17', '6', '9', '9', '4', '5.3', '4.67', '5.38', '5.38', '5.3', '5.3', '1.8', '7.2', '1.75', '7.32', '1323.5', '536.27', '-0.88', '82.14', '1', '555.93', '-0.75', '97.35', '1', '2.56']


In [12]:
# print first five keys and values

i = 0
for key, val in elp_dict.items(): #for key and value
    print(f"Key: {key}, \nValues: {val}") # print them up
    i += 1 # add 1 to counter
    if i == 5: #when counter = 5
        break # stop

print("==" * 80)

# Or with enumerate which Langdon says is more Pythonic
for i, (key, val) in enumerate(elp_dict.items()):
    print(f"Key: {key}, \nValues: {val}")
    if i == 4:
        break

Key: ALF, 
Values: ['928', '6.83', '7', '8', '8', '3', '7.04', '8.66', '7.12', '7.12', '7.04', '7.04', '', '', '', '', '3042.5', '', '', '', '', '', '', '', '', '']
Key: ALFA, 
Values: ['849', '6.74', '6', '4', '8', '4', '5.65', '6.27', '5.65', '6.62', '5.65', '5.65', '', '', '', '', '2317.67', '', '', '', '', '', '', '', '', '']
Key: alfresco, 
Values: ['5', '1.61', '0', '0', '0', '0', '', '', '', '', '', '', '', '', '', '', '4917.71', '', '', '', '', '', '', '', '', '']
Key: algebraical, 
Values: ['2', '0.69', '0', '0', '0', '0', '', '', '', '', '', '', '', '', '', '', '3199.1', '', '', '', '', '', '', '', '', '']
Key: algebras, 
Values: ['156', '5.05', '0', '0', '0', '0', '', '', '', '', '', '', '', '', '', '', '2742', '', '', '', '', '', '', '', '', '']
Key: ALF, 
Values: ['928', '6.83', '7', '8', '8', '3', '7.04', '8.66', '7.12', '7.12', '7.04', '7.04', '', '', '', '', '3042.5', '', '', '', '', '', '', '', '', '']
Key: ALFA, 
Values: ['849', '6.74', '6', '4', '8', '4', '5.65', '6.

Note, we have a **problem** here!

What is it?

Also, the solution, is **always examine your data!!**

In [13]:

#what do the values for "french" look like?

value = elp_dict.get('french')
print(value)


#hmmmmmm.... none. that's weird.

None


In [14]:

#what do the values for "French" look like?

value = elp_dict.get('French')
print(value)

#Ahh.... so French is capitalized...

#why is this a problem?

['36635', '10.51', '3', '3', '3', '2', '5.85', '6.31', '7.66', '7.66', '5.85', '5.85', '1.85', '7.24', '1.85', '7.37', '4412.4', '589.77', '-0.75', '219.63', '0.97', '601.93', '-0.52', '111.02', '1', '']


In [15]:
#let's lowercase all the keys in the dictionary

for key in list(elp_dict.keys()): #loop through each key in dictionary
    elp_dict[key.lower()] = elp_dict.pop(key)
    # elp_dict.pop(key) = remove original key that might have uppercase letters
    # elp_dict[key.lower()] = lowercase the key

#now, go back up and check on "French" and "french"


In [16]:
# safe divide function to stop zero counts for words from causing problems
# some texts may have zero words after pre-processing

def safe_divide(a, b):
    if b != 0:
        return a/b
    else:
        return 0

In [17]:

#this is for all words... we may consider removing stop words when the time comes
#this is also by word, we may want to consider doing it by lemma
#for instance, the word "played" has no concreteness score, but "play", the lemma, does

final_conc = [] #concreteness (will become a list of lists)
final_freq = [] #frequency
final_nw = [] #number of words

for doc in prac_df_docs:
  print(doc)
  tokens = [] #holder list for each text in docs
  hal_freq = []
  conc = []
  for token in doc:
    if token.is_alpha:
      tokens.append(token.lower_) #save it as string and not spacy token. Note this is the lower case token
      try:
        val = elp_dict[token.lower_] #try to look up if token has values and, if so, assign to val
        #val = elp_dict[token.lemma_] #This would work for lemmas
        #print(val)
        hal_freq.append(val[1])if val[1] else None #if statement to see if there is value, else do not append anything
        conc.append(val[25])if val[25] else None #this happens for the word played, which has no concreteness score
      except KeyError: # handle the case when a key is not found in a dictionary during a dictionary lookup operation
        pass
  print(f'these are the lower-cased words in the text {tokens}')
  print(f'this is freq scores initial: {hal_freq}') #check on output
  print(f'this is concrete scores initial: {conc}')

  hal_freq = [float(item) for item in hal_freq] #change list of strings to floats
  conc = [float(item) for item in conc] #change list of strings to floats
  print(f'Is freq score now a float? {hal_freq}') #check on output

  print(f'this is the text length {len(tokens)}') # check to make sure length is right
  print("==" * 40)


  final_conc.append(safe_divide(sum(conc),len(conc))) #divide by number of words that have concreteness scores
  final_freq.append(safe_divide(sum(hal_freq),len(hal_freq)))
  final_nw.append(len(tokens))

print(final_conc)
print(final_freq)
print(final_nw)

The diminutive canine frolicked with unbridled exuberance on the lush, verdant lawn.
these are the lower-cased words in the text ['the', 'diminutive', 'canine', 'frolicked', 'with', 'unbridled', 'exuberance', 'on', 'the', 'lush', 'verdant', 'lawn']
this is freq scores initial: ['16.96', '5.69', '7.53', '3.58', '14.97', '5.8', '5.06', '15.08', '16.96', '6.93', '5.09', '7.89']
this is concrete scores initial: ['1.43', '2.57', '4.71', '2', '1.35', '1.66', '3.25', '1.43', '2.8', '4.93']
Is freq score now a float? [16.96, 5.69, 7.53, 3.58, 14.97, 5.8, 5.06, 15.08, 16.96, 6.93, 5.09, 7.89]
this is the text length 12
The friendly dog played joyfully on the green, grassy field.
these are the lower-cased words in the text ['the', 'friendly', 'dog', 'played', 'joyfully', 'on', 'the', 'green', 'grassy', 'field']
this is freq scores initial: ['16.96', '9.77', '10.97', '11.1', '5.16', '15.08', '16.96', '11.42', '6.34', '11.29']
this is concrete scores initial: ['1.43', '2.32', '4.85', '2.19', '3.25

In [18]:
# Can you do this without conveting elp_df to a dictionary?

# Yes you can... See below for Langdon's approach (but we will not cover this in class)


final_conc = [] #concreteness (will become a list of lists)
final_freq = [] #frequency
final_nw = [] #number of words

for doc in prac_df_docs:
  print(doc)
  tokens = [] #holder list for each text in docs
  hal_freq = []
  conc = []
  for token in doc:
    if token.is_alpha:
      tokens.append(token.lower_) #save it as string and not spacy token. Note this is the lower case token
      try:
        row = elp_df.loc[token.lower_] #try to look up if token has values and, if so, assign to val
      except KeyError: # handle the case when a key is not found in a dictionary during a dictionary lookup operation
        print(f"token.lower_ is out of vocabulary")
        pass
      row = row.fillna(value=False) # This replaces np.NaN (not a number) values with Python's False value, which evalutes as Falsy in the conditions below
      hal_freq.append(row["Freq_HAL"]) if row["Freq_HAL"] else None # if statement to see if there is value, else do not append anything
      conc.append(row["Conc.M"]) if row["Conc.M"] else None # this happens for the word played, which has no concreteness score
  print(f'these are the lopwer-cased words in the text {tokens}')
  print(f'this is freq scores initial: {hal_freq}') #check on output
  print(f'this is concrete scores initial: {conc}')

  print(f'this is the text length {len(tokens)}') # check to make sure length is right
  print("==" * 40)

  final_conc.append(safe_divide(sum(conc),len(conc))) #divide by number of words that have concreteness scores
  final_freq.append(safe_divide(sum(hal_freq),len(hal_freq)))
  final_nw.append(len(tokens))

print(final_conc)
print(final_freq)
print(final_nw)

The diminutive canine frolicked with unbridled exuberance on the lush, verdant lawn.
these are the lopwer-cased words in the text ['the', 'diminutive', 'canine', 'frolicked', 'with', 'unbridled', 'exuberance', 'on', 'the', 'lush', 'verdant', 'lawn']
this is freq scores initial: [23099033.0, 295.0, 1870.0, 36.0, 3184846.0, 330.0, 158.0, 3536061.0, 23099033.0, 1018.0, 162.0, 2663.0]
this is concrete scores initial: [1.43, 2.57, 4.71, 2.0, 1.35, 1.66, 3.25, 1.43, 2.8, 4.93]
this is the text length 12
The friendly dog played joyfully on the green, grassy field.
these are the lopwer-cased words in the text ['the', 'friendly', 'dog', 'played', 'joyfully', 'on', 'the', 'green', 'grassy', 'field']
this is freq scores initial: [23099033.0, 17486.0, 58314.0, 66128.0, 175.0, 3536061.0, 23099033.0, 90773.0, 567.0, 79920.0]
this is concrete scores initial: [1.43, 2.32, 4.85, 2.19, 3.25, 1.43, 4.07, 4.32, 4.26]
this is the text length 10
The small pet ran quickly around on the soft, green sujim gras

In [19]:
print(final_conc)
print(final_freq)
print(final_nw)

prac_df['NW'] = final_nw #create a new column in pandas called NW and assign it the values from final_nw
prac_df['concreteness'] = final_conc #create a new column in pandas called concreteness and assign it the values from final_conc
prac_df['hal_freq'] = final_freq

prac_df




[2.613, 3.1244444444444444, 3.2608333333333337]
[4410458.75, 5004749.0, 4204998.083333333]
[12, 10, 12]


Unnamed: 0,text,frequency,NW,concreteness,hal_freq
0,The diminutive canine frolicked with unbridled...,infreq,12,2.613,4410459.0
1,"The friendly dog played joyfully on the green,...",normal,10,3.124444,5004749.0
2,"The small pet ran quickly around on the soft, ...",freq,12,3.260833,4204998.0


## Your Turn

1. Call in the MRC psycholinguistic database.
2. Call in the readability corpus (reading_600_texts.csv) from Reading_2019 data.
3. Calculate one of the following for each excerpt in the corpus
  - average concreteness
  - imageability
  - meaningfulness
4. Run simple correlations to assess relationships.


    - What is your research question?
    - What are your hypotheses?


Research Questions
1.   Is there a significant relationship between meaningfulness and readability difficulty?

2.   Does imageability improve text comprehension as measured by readability metrics?
3.   Do more concrete words make a text easier to read?

Hypotheses


*   Higher concreteness scores will correlate with higher readability
*   Higher meaningfulness scores will correlate with higher readability scores

Start with loading MRC.csv and 2019-reading data

In [20]:
# Load the dataframe and use the first column ("#Word") as the index
mrc_df = pd.read_csv('MRC_database.csv', index_col=0)

# To get a list of values for each word
# First transpose with .T to make each word a column
# Then send the list of values under each column to a dictionary
mrc_dict = mrc_df.T.to_dict('list')
# display(mrc_df[mrc_df.index.duplicated(keep=False)])
print(mrc_dict)

  mrc_dict = mrc_df.T.to_dict('list')




In [21]:
# Read .csv into a dictionary

mrc_dict = {} #empty dictionary

with open('MRC_database.csv', 'r') as file: #read in file
    reader = csv.reader(file) # create a csv.reader
    next(reader)  # skip header
    for row in reader:
        key = row[0] # assign first element as key
        values = row[1:] # assign rest as values
        mrc_dict[key] = values # add key and values to dictionary

In [22]:
# Load the readability corpus
readability_path = "../reading_600_texts.csv"  # Update if needed
readability_df = pd.read_csv(readability_path, encoding='latin1')
readability_df.head()

Unnamed: 0,id,text,bt_readability,Source,Topic,Text type,Domain,Unnamed: 8
0,1,The Solar System is the Sun and all the object...,0.022084,https://simple.wikipedia.org/wiki/Solar_System,solar system,Simple,Science,
1,2,An atom is the basic unit that makes up all ma...,0.117281,https://simple.wikipedia.org/wiki/Atom,atom,Simple,Science,
2,3,An ion is an electrically charged atom or grou...,0.170116,https://simple.wikipedia.org/wiki/Ion,ion,Simple,Science,
3,4,Plasma is a state of matter. The three other c...,0.393468,https://simple.wikipedia.org/wiki/Plasma_(phys...,plasma,Simple,Science,
4,5,The South Pole is the most southern point on t...,0.007675,https://simple.wikipedia.org/wiki/South_Pole,South Pole,Simple,Science,


In [27]:
# Standardize column names for easier reference
mrc_df.rename(columns={"#word": "Word", "conc": "Concreteness", "imag": "Imageability", "meanc": "Meaningfulness"}, inplace=True)

# Trim whitespace from column names
mrc_df.columns = mrc_df.columns.str.strip()


Switch the MRC content to lowercase:

In [31]:
for key in list(mrc_dict.keys()): #loop through each key in dictionary
    mrc_dict[key.lower()] = mrc_dict.pop(key)

In [36]:

#this is for all words... we may consider removing stop words when the time comes
#this is also by word, we may want to consider doing it by lemma
#for instance, the word "played" has no concreteness score, but "play", the lemma, does

final_conc = [] #concreteness (will become a list of lists)
final_freq = [] #frequency
final_nw = [] #number of words

for doc in prac_df_docs:
  print(doc)
  tokens = [] #holder list for each text in docs
  hal_freq = []
  conc = []
  for token in doc:
    if token.is_alpha:
      tokens.append(token.lower_) #save it as string and not spacy token. Note this is the lower case token
      try:
        val = mrc_dict[token.lower_] #try to look up if token has values and, if so, assign to val
        hal_freq.append(val[1])if val[1] else None #if statement to see if there is value, else do not append anything
        conc.append(val[25])if val[25] else None #this happens for the word played, which has no concreteness score
      except KeyError: # handle thfe case when a key is not found in a dictionary during a dictionary lookup operation
        pass
  print(f'these are the lower-cased words in the text {tokens}')
  print(f'this is freq scores initial: {hal_freq}') #check on output
  print(f'this is concrete scores initial: {conc}')

  hal_freq = [float(item) for item in hal_freq] #change list of strings to floats
  conc = [float(item) for item in conc] #change list of strings to floats
  print(f'Is freq score now a float? {hal_freq}') #check on output

  print(f'this is the text length {len(tokens)}') # check to make sure length is right
  print("==" * 40)


  final_conc.append(safe_divide(sum(conc),len(conc))) #divide by number of words that have concreteness scores
  final_freq.append(safe_divide(sum(hal_freq),len(hal_freq)))
  final_nw.append(len(tokens))

print(final_conc)
print(final_freq)
print(final_nw)

The diminutive canine frolicked with unbridled exuberance on the lush, verdant lawn.


IndexError: list index out of range