<a href="https://colab.research.google.com/github/Aidan-MG/RoomReader_Analysis/blob/main/RoomReader_Word_Frequencies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pydub
!pip install pympi-ling

import pandas as pd
pd.options.mode.chained_assignment = None
import pympi
import numpy as np
import re
from collections import Counter
from pydub import AudioSegment
from google.colab import sheets

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting pympi-ling
  Downloading pympi_ling-1.70.2-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading pympi_ling-1.70.2-py2.py3-none-any.whl (24 kB)
Installing collected packages: pympi-ling
Successfully installed pympi-ling-1.70.2


In [36]:
#create a df with all tokens and their number of occurences across conversations

#read all text files and convert to a df with two columns: word, frequency

df = pd.DataFrame()

for file in ['S01', 'S02', 'S03', 'S04', 'S05', 'S06', 'S07', 'S08', 'S09', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20', 'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30']:
  with open(file + '.txt', 'r') as text:
    #read the file as a csv
    text = pd.read_csv(file + '.txt', sep='\t')
    #for the text in the column labeled 'Content' create a word list from all text
    word_list = pd.DataFrame()

    for index, row in text.iterrows():
      #add the text['Content'] to the word list for each row
      if isinstance(row['content'], str):
        word_list = pd.concat([word_list, pd.DataFrame(row['content'].split())])

    #save the word_list to the df
    df = pd.concat([df, word_list])

In [37]:
#count the occurrence of each word, aggregating the rows and storing the number of occurences in the column 'Total_Frequency'
df_overall_wordlist = df.value_counts().reset_index()
df_overall_wordlist.columns = ['Word', 'Total_Frequency']

print(df_overall_wordlist.head())
print(df_overall_wordlist.info())

  Word  Total_Frequency
0    I             4583
1  the             3781
2  you             3027
3    a             2735
4   of             2711
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8046 entries, 0 to 8045
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Word             8046 non-null   object
 1   Total_Frequency  8046 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 125.8+ KB
None


In [16]:
#determine overlaps for all tiers of ELAN subtitles

def process_elan_file(elan_file_path):
  #next step: create a method for creating the overlap list for all the speakers in the conversation
  #takes in ELAN file, returns a DataFrame with the overlaps in format (start_time, end_time)
  def get_overlaps(eaf_file):
    eaf = pympi.Elan.Eaf(eaf_file)
    overlaps = pd.DataFrame()
    #iterate over tiers
    for tier_a in eaf.get_tier_names():
      if 'utt' in tier_a:
        for tier_b in eaf.get_tier_names():
          if 'utt' in tier_b and tier_a != tier_b:
            overlaps_temp = pd.DataFrame(list(eaf.get_gaps_and_overlaps2(tier_a, tier_b, maxlen=-1)))
            overlaps_temp = overlaps_temp[overlaps_temp[2] == 'O12']
            overlaps = pd.concat([overlaps, overlaps_temp])

    overlaps = overlaps.drop(columns=[2])
    overlaps.columns = ['start_time1', 'end_time1']
    return overlaps

  overlap_times = get_overlaps(elan_file_path)

  #get word list from overlapping sections of ELAN file

  def get_words(txt_file, overlap_times, window_size):
    text = pd.read_csv(txt_file, sep='\t')
    words = pd.DataFrame()
    for index, row in overlap_times.iterrows():
      start_time = row['start_time1']
      end_time = row['end_time1']
      words_temp = text[(text['start_time1'] >= start_time - window_size) & (text['end_time1'] <= end_time + window_size)]
      words = pd.concat([words, words_temp])
      words.drop_duplicates(inplace=True)
    words_final = words['content']
    return words_final

  txt_file_path = elan_file_path.replace('.eaf', '.txt')
  word_list_initial = get_words(txt_file_path, overlap_times, 100)

  #tokenize word list
  word_list = word_list_initial.to_string().split()

  #count frequency of individual words, dropping single occurrence items
  #occurrence = {item: word_list.count(item) for item in word_list}
  occurrence = Counter(word_list)


  for word in word_list:
    if occurrence.get(word) == 1:
      word_list.remove(word)

  word_series = pd.DataFrame({
      'Word': word_list,
      'Frequency': [occurrence[word] for word in word_list]
    })

  #filter non word elements
  def is_word(text):
        pattern = re.compile(r'^[a-zA-Z]+$')
        return bool(pattern.match(text))

  df_filtered = word_series[word_series['Word'].apply(is_word)]

  #compare with average rate of occurrence of word in the entire conversation, to see which words appear with higher frequency around moments of overlap
  #to compare them, creating a column with frequency_overlaps/frequency_overall

  with open(txt_file_path, 'r') as text:
    complete_wordlist = text.read().split()
    total_occurrence = {item: complete_wordlist.count(item) for item in complete_wordlist}
      # Calculate relative frequency
    df_filtered['Relative Frequency'] = df_filtered['Word'].apply(lambda word: df_filtered[df_filtered['Word'] == word]['Frequency'].values[0] / total_occurrence.get(word, 1))


  df_filtered.sort_values(by='Relative Frequency', ascending=False, inplace=True)
  df_filtered.drop_duplicates(subset='Word', keep='first', inplace=True)
  return df_filtered.head(200)

# Example usage:
# Replace with your actual file paths
elan_files = ['/content/S01.eaf', '/content/S02.eaf', '/content/S03.eaf', '/content/S04.eaf', '/content/S05.eaf', '/content/S06.eaf', '/content/S07.eaf', '/content/S08.eaf', '/content/S09.eaf', '/content/S10.eaf', '/content/S11.eaf', '/content/S12.eaf', '/content/S13.eaf', '/content/S14.eaf', '/content/S15.eaf', '/content/S16.eaf', '/content/S17.eaf', '/content/S18.eaf', '/content/S19.eaf', '/content/S20.eaf', '/content/S21.eaf', '/content/S22.eaf', '/content/S23.eaf', '/content/S24.eaf', '/content/S25.eaf', '/content/S26.eaf', '/content/S27.eaf', '/content/S28.eaf', '/content/S29.eaf', '/content/S30.eaf']
results = {}
for elan_file in elan_files:
  results[elan_file] = process_elan_file(elan_file)



In [40]:
# Concatenate the results into a single DataFrame
all_data = pd.concat([result for result in results.values()], ignore_index=True)

# Group by 'Word' and sum the frequencies
merged_df = all_data.groupby('Word').agg({'Frequency': 'sum'}).reset_index()

#rename merged_df 'Frequency' to 'Overlap_Frequency'
merged_df.rename(columns={'Frequency': 'Overlap_Frequency'}, inplace=True)

#rename df_overallwordlist 'Frequency' to 'Total_Frequency'
df_overall_wordlist.rename(columns={'Frequency': 'Total_Frequency'}, inplace=True)

#add columns for df_overallwordlist to the merged_df file
merged_df = pd.merge(merged_df, df_overall_wordlist, on='Word', how='left')

#create column with the value 'Overlap_Frequency' / 'Total_Frequency'
merged_df['Relative_Frequency'] = merged_df['Overlap_Frequency'] / merged_df['Total_Frequency']

#sort by 'Total_Frequency'
merged_df.sort_values(by='Total_Frequency', ascending=False, inplace=True)

In [49]:
merged_df.head(50)

Unnamed: 0,Word,Overlap_Frequency,Total_Frequency,Relative_Frequency
253,I,2475,4583.0,0.540039
1537,the,440,3781.0,0.116371
1705,you,216,3027.0,0.071358
614,a,536,2735.0,0.195978
1250,of,331,2711.0,0.122095
1569,to,76,2553.0,0.029769
1133,like,284,2506.0,0.113328
655,and,102,2311.0,0.044137
1536,that,139,1972.0,0.070487
1116,laughter,1866,1866.0,1.0


In [50]:
#create a new df focusing on relative_frequency column
df_relative_frequency = merged_df
#drop the values with 'Total_Frequency' less than ten
df_relative_frequency = df_relative_frequency[df_relative_frequency['Total_Frequency'] >= 10]
#sort by 'Relative_Frequency'
df_relative_frequency.sort_values(by='Relative_Frequency', ascending=False, inplace=True)



In [51]:
df_relative_frequency.head(50)

Unnamed: 0,Word,Overlap_Frequency,Total_Frequency,Relative_Frequency
1073,inhaling,199,199.0,1.0
826,cough,23,23.0,1.0
1116,laughter,1866,1866.0,1.0
1446,sigh,36,36.0,1.0
1645,vocalized,91,91.0,1.0
790,clicking,97,98.0,0.989796
1576,tongue,95,96.0,0.989583
1237,noise,170,172.0,0.988372
560,Uhm,597,619.0,0.964459
559,Uh,330,348.0,0.948276
