This code is to classify the selected examples automatically for those criteria that allow automatic classification.

In [49]:
import pandas as pd
import xlrd
import re
from re import search

In [50]:
# opens the excel sheet and reads it
# excel sheet needs to be in the same working directory as this file

file = 'qui_selected_rows_sorted.xls'
data = pd.ExcelFile(file)

# returns the all the sheet names within the excel file
print(data.sheet_names)

['Sheet1']


In [51]:
# parses the sheet into a data frame to show the column structure within the file

data_frame = data.parse('Sheet1')
data_frame.info

# only shows the first 10 rows
data_frame.head(10)

Unnamed: 0,recherche,occurrence,wh,int,other,segment,enregistrement,transcription,locuteur,debut (ms),fin (ms),debut (hh:mm:ss),fin (hh:mm:ss)
0,Mot exact,qui,,,,trés bien qui habitent la rue hein ?,ESLO1_ENT_017,ESLO1_ENT_017_C,QB100,97160.0,98641.0,0.001123,0.001134
1,Mot exact,qui,,,,qui habitent le ah bon ?,ESLO1_ENT_017,ESLO1_ENT_017_C,CS,370289.0,371481.0,0.004282,0.004294
2,Mot exact,qui,,,,qu'est-ce qui vous plaît et qu'est-ce qui vous...,ESLO1_ENT_017,ESLO1_ENT_017_C,CS,730114.0,738206.0,0.008449,0.008542
3,Mot exact,qui,,,,quelles sont les choses qui vous plaisent le p...,ESLO1_ENT_017,ESLO1_ENT_017_C,CS,734543.0,738206.0,0.008495,0.008542
4,Mot exact,qui,,,,qu'est-ce que vous pensez des des femmes marié...,ESLO1_ENT_017,ESLO1_ENT_017_C,CS,768372.0,772569.0,0.008889,0.008935
5,Mot exact,qui,,,,qu'est-ce qui leur plaît dans leurs loisirs ?,ESLO1_ENT_017,ESLO1_ENT_017_C,QB100,1031781.0,1038519.0,0.011933,0.012014
6,Mot exact,qui,,,,alors à Orléans qui est-ce qui va à la maison ...,ESLO1_ENT_017,ESLO1_ENT_017_C,CS,1062853.0,1065278.0,0.012292,0.012326
7,Mot exact,qui,,,,à votre avis qu'est-ce qui fait que les enfant...,ESLO1_ENT_017,ESLO1_ENT_017_C,CS,1366299.0,1373565.0,0.01581,0.015891
8,Mot exact,qui,,,,quelles sont les p- les personnes qui comptent...,ESLO1_ENT_017,ESLO1_ENT_017_C,CS,1859247.0,1864205.0,0.021516,0.021574
9,Mot exact,qui,,,,vous pouvez expliquer un peu ce qui s'est passé ?,ESLO1_ENT_017,ESLO1_ENT_017_C,CS,2026649.0,2038217.0,0.023449,0.023588


In [52]:
# reads in the spreadsheet data

workbook = xlrd.open_workbook('qui_selected_rows_sorted.xls')
sheet = workbook.sheet_by_name('Sheet1')

row_count = sheet.nrows
col_count = sheet.ncols
print(f'Total rows: {row_count}\nTotal columns: {col_count}')

Total rows: 3162
Total columns: 13


We're now going to create a new list of lists out of the file, and populate the empty columns with as much data as possible.

The data that should be easy to classify automatically are: est-ce que / wh- at the beginning or end of sentence / presence of c'est.

In [53]:
# initialises list of lists
all_rows = []
unwanted_segment = 'qu\'est-ce\squi'

for rx in range(sheet.nrows): # rx is an int, row is a list 
    row = sheet.row(rx)
    
    # initialises list that will contain our data
    data_list = []

    # populates list that stores all rows
    data_list.append(row[0].value)
    data_list.append(row[1].value)
    data_list.append(row[2].value) # wh
    data_list.append(row[3].value) # int
    data_list.append(row[4].value) # other
    # excludes all utterances containing 'qu'est-ce qui'
    utterance = row[5].value
    if search(unwanted_segment, utterance):
        # excludes unwanted segments
        continue
    else:
        data_list.append(utterance) # segment
    data_list.append(row[6].value)
    data_list.append(row[7].value)
    data_list.append(row[8].value)
    data_list.append(row[9].value)
    data_list.append(row[10].value)
    data_list.append(row[11].value)
    data_list.append(row[12].value)

    all_rows.append(data_list)
        
# prints extracted rows metrics
print(f'We created a list that embeds {len(all_rows)} lists.')

We created a list that embeds 2656 lists.


In [54]:
# iterates through list of lists and populates empty cells

subject = 'qui\sest-ce\squi'
direct_object = 'qui\sest-ce\sque'
subject2 = 'qui\sc\'est\squi'
direct_object2 = 'qui\sc\'est\squé'
subject3 = 'c\'est\squi\squi'
direct_object3 = 'c\'est\squi\sque'

subject_count = 0
direct_object_count = 0

string = 'est-ce'
string_count = 0
string2 = 'c\'est\squi'
string2b = 'qui\sc\'est'
string2_count = 0

ex_situ_count = 0
in_situ_count = 0
fragment_count = 0
sv_count = 0

for list in all_rows:
    
    text = list[5] # index 5 is where text of interest is stored, returns a cell

    # checks whether 'qui' is S or DO
    if search(subject, text) or search(subject2, text) or search(subject3, text):
        list[1] = 'QuiS'
        subject_count += 1
    elif search(direct_object, text) or search(direct_object2, text) or search(direct_object3, text):
        list[1] = 'QuiDO'
        direct_object_count += 1

    if search(string, text):
        list[2] = 'ex situ'
        list[3] = 'est-ce que'
        list[4] = 'formed' # then check manually and change to 'fragment' if needed
        string_count += 1
    elif search(string2, text) or search(string2b, text):
        list[3] = 'cleft'

        if search(string2, text):
            list[2] = 'in situ'
        elif search(string2b, text):
            list[2] = 'ex situ'
        string2_count += 1

    if re.search('^qui\s', text, flags=0) or re.search('^et\squi\s', text, flags=0) or re.search('^alors\squi\s', text, flags=0) or re.search('^et\salors\squi\s', text, flags=0):
        list[2] = 'ex situ'
        list[4] = 'formed' # then check manually and change to 'fragment' if needed
        ex_situ_count += 1

        if search(string2, text) or search(string2b, text):
            list[3] = 'cleft'
        elif text.endswith('qui ?') or text.endswith('qui ça ?') or text.endswith('qui alors ?'):
            list[2] = 'wh'
            list[3] = 'none'
            list[4] = 'fragment'
            fragment_count += 1
        elif re.search('^qui\sje', text, flags=0) or re.search('^qui\stu', text, flags=0) or re.search('^qui\selle', text, flags=0) or re.search('^qui\sil', text, flags=0) or re.search('^qui\snous', text, flags=0) or re.search('^qui\svous', text, flags=0) or re.search('^qui\selles', text, flags=0) or re.search('^qui\sils', text, flags=0) or re.search('^qui\sça', text, flags=0):
            list[3] = 'SV'
            list[4] = 'formed'
            sv_count += 1
        elif re.search('^qui\sje', text, flags=0):
            list[3] = 'que'

    elif text.endswith('qui ?'):
        list[2] = 'in situ'
        list[3] = 'SV'
        list[4] = 'final'
        in_situ_count += 1
        
    elif text.endswith('qui alors ?') or text.endswith('qui ça ?') or text.endswith('qui madame ?') or text.endswith('qui monsieur ?') or text.endswith('qui vous ?'):
        list[2] = 'in situ'
        list[3] = 'SV'
        list[4] = 'non final'
        in_situ_count += 1

print(f'{subject_count} occurrences of subject \'qui\' found.')
print(f'{direct_object_count} occurrences of direct object \'qui\' found.')
print(f'{string_count} occurrences of \'est-ce que\' found.')
print(f'{string2_count} occurrences of \'c\'est\' found.')
print(f'{ex_situ_count} occurrences of \'qui\' ex situ found.')
print(f'{in_situ_count} occurrences of \'qui\' in situ found.')
print(f'{fragment_count} occurrences of fragments found.')
print(f'{sv_count} occurrences of SV ordering found.')

229 occurrences of subject 'qui' found.
15 occurrences of direct object 'qui' found.
697 occurrences of 'est-ce que' found.
114 occurrences of 'c'est' found.
703 occurrences of 'qui' ex situ found.
9 occurrences of 'qui' in situ found.
25 occurrences of fragments found.
43 occurrences of SV ordering found.


We will now create a new .xlsx file with all the classified data.

In [55]:
# creates spreadsheet using each nested list as a row

df = pd.DataFrame(data=all_rows)

# converts into excel
df.to_excel("qui_all_rows_classified.xlsx", index=False)

print("Dictionary converted into excel...")

Dictionary converted into excel...


In [56]:
# opens the new excel file and reads it

file2 = 'qui_all_rows_classified.xls'
data2 = pd.ExcelFile(file2)

FileNotFoundError: [Errno 2] No such file or directory: 'qui_all_rows_classified.xls'

In [None]:
# parses the sheet into a data frame to show the column structure within the file

data_frame2 = data2.parse('Sheet1')
data_frame2.info

# only shows the first 60 rows
data_frame2.head(60)