This code is to classify the selected examples automatically for those criteria that allow automatic classification.

In [1]:
import pandas as pd
import xlrd
import re
from re import search

In [2]:
# opens the excel sheet and reads it
# excel sheet needs to be in the same working directory as this file

file = 'quoi_selected_rows_sorted.xls'
data = pd.ExcelFile(file)

# returns the all the sheet names within the excel file
print(data.sheet_names)

['Sheet1']


In [3]:
# parses the sheet into a data frame to show the column structure within the file

data_frame = data.parse('Sheet1')
data_frame.info

# only shows the first 10 rows
data_frame.head(10)

Unnamed: 0,0,1,wh,int,other,9,2,3,4,5,6,7,8
0,Mot exact,quoi,,,,est-ce que vous pouvez me dire euh en quoi ça ...,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,217697,222152.0,0.002512,0.002569
1,Mot exact,quoi,,,,en quoi est-ce que ça consiste exactement ?,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,299545,301282.0,0.003461,0.003484
2,Mot exact,quoi,,,,de mieux en mieux oui et à quoi cela tient d'a...,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,2786889,2789314.0,0.032245,0.03228
3,Mot exact,quoi,,,,et le le fait que l'orthographe soit mieux ens...,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,3026630,3030133.0,0.035023,0.035069
4,Mot exact,quoi,,,,et un une autre matière deuxième matière aurai...,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,3059914,3062962.0,0.035405,0.03544
5,Mot exact,quoi,,,,euh quoi par exemple ?,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,3606863,3607971.0,0.041736,0.041748
6,Mot exact,quoi,,,,de quoi s'agit-il ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,274971,276383.0,0.003171,0.003194
7,Mot exact,quoi,,,,et vous écrivez avec quoi ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,510327,512058.0,0.005903,0.005926
8,Mot exact,quoi,,,,la quoi ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,2384465,2385423.0,0.027593,0.027604
9,Mot exact,quoi,,,,c'est quoi exactement ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,2823321,2824542.0,0.032674,0.032685


In [4]:
# reads in the spreadsheet data

workbook = xlrd.open_workbook('quoi_selected_rows_sorted.xls')
sheet = workbook.sheet_by_name('Sheet1')

row_count = sheet.nrows
col_count = sheet.ncols
print(f'Total rows: {row_count}\nTotal columns: {col_count}')

Total rows: 2359
Total columns: 13


We're now going to create a new list of lists out of the file, and populate the empty columns with as much data as possible.

The data that should be easy to classify automatically are: est-ce que / wh- at the beginning or end of sentence / presence of c'est.

In [11]:
# initialises list of lists
all_rows = []
unwanted_segment = 'n\'importe\squoi'
# unwanted_segment2 = 'quoi\sfaire'
unwanted_segment3 = 'ou\squoi'

for rx in range(sheet.nrows): # rx is an int, row is a list 
    row = sheet.row(rx)
    
    # initialises list that will contain our data
    data_list = []

    # populates list that stores all rows
    data_list.append(row[0].value)
    data_list.append(row[1].value)
    data_list.append(row[2].value) # wh
    data_list.append(row[3].value) # int
    data_list.append(row[4].value) # other
    # excludes all utterances containing 'qu'est-ce qui'
    utterance = str(row[5].value)
    if search(unwanted_segment, utterance) or search(unwanted_segment3, utterance):
        # excludes unwanted segments
        # print(utterance)
        continue
    else:
        data_list.append(utterance) # segment
    data_list.append(row[6].value)
    data_list.append(row[7].value)
    data_list.append(row[8].value)
    data_list.append(row[9].value)
    data_list.append(row[10].value)
    data_list.append(row[11].value)
    data_list.append(row[12].value)

    all_rows.append(data_list)
        
# prints extracted rows metrics
print(f'We created a list that embeds {len(all_rows)} lists.')

We created a list that embeds 2309 lists.


In [15]:
# iterates through list of lists and populates empty cells

indirect_object_count = 0

string = 'est-ce\sque'
string_count = 0
io_quoi = ['de quoi', 'en quoi', 'à quoi', 'pour quoi', 'avec quoi', 'par quoi', 'sur quoi']

string2 = 'c\'est\squoi'
string2_count = 0

ex_situ_count = 0
in_situ_count = 0
fragment_count = 0
sv_count = 0

for list in all_rows:
    
    text = list[5] # index 5 is where text of interest is stored, returns a cell

    # classifies sentence as IO
    for wh in io_quoi:
        if wh in text:
            list[1] = 'quoiIO'
            indirect_object_count += 1

    if search(string, text):
        list[2] = 'ex situ'
        list[3] = 'est-ce que'
        list[4] = 'formed' # then check manually and change to 'fragment' if needed
        string_count += 1
    
    if search(string2, text):
        list[2] = 'in situ'
        list[3] = 'cleft'
        if 'que' in text or 'qu\'' in text:
            list[4] = 'bi'
        else:
            list[4] = 'mono'
        string2_count += 1

    if re.search('^de\squoi\s', text, flags=0) or re.search('^et\sde\squoi\s', text, flags=0) or re.search('^alors\sde\squoi\s', text, flags=0) or re.search('^et\salors\sde\squoi\s', text, flags=0) or re.search('^en\squoi\s', text, flags=0) or re.search('^et\sen\squoi\s', text, flags=0) or re.search('^alors\sen\squoi\s', text, flags=0) or re.search('^et\salors\sen\squoi\s', text, flags=0) or re.search('^à\squoi\s', text, flags=0) or re.search('^et\sà\squoi\s', text, flags=0) or re.search('^alors\sà\squoi\s', text, flags=0) or re.search('^et\salors\sà\squoi\s', text, flags=0) or re.search('^pour\squoi\s', text, flags=0) or re.search('^et\spour\squoi\s', text, flags=0) or re.search('^alors\spour\squoi\s', text, flags=0) or re.search('^et\salors\spour\squoi\s', text, flags=0) or re.search('^avec\squoi\s', text, flags=0) or re.search('^et\savec\squoi\s', text, flags=0) or re.search('^alors\savec\squoi\s', text, flags=0) or re.search('^et\salors\savec\squoi\s', text, flags=0) or re.search('^par\squoi\s', text, flags=0) or re.search('^et\spar\squoi\s', text, flags=0) or re.search('^alors\spar\squoi\s', text, flags=0) or re.search('^et\salors\spar\squoi\s', text, flags=0) or re.search('^sur\squoi\s', text, flags=0) or re.search('^et\ssur\squoi\s', text, flags=0) or re.search('^alors\ssur\squoi\s', text, flags=0) or re.search('^et\salors\ssur\squoi\s', text, flags=0):
        list[2] = 'ex situ'
        list[4] = 'formed' # then check manually and change to 'fragment' if needed
        ex_situ_count += 1

        if text.endswith('quoi ?') or text.endswith('quoi ça ?') or text.endswith('quoi alors ?'):
            list[2] = 'wh'
            list[3] = 'none'
            list[4] = 'fragment'
            fragment_count += 1
        elif re.search('quoi\sje', text, flags=0) or re.search('quoi\stu', text, flags=0) or re.search('quoi\selle', text, flags=0) or re.search('quoi\sil', text, flags=0) or re.search('quoi\snous', text, flags=0) or re.search('quoi\svous', text, flags=0) or re.search('quoi\selles', text, flags=0) or re.search('quoi\sils', text, flags=0) or re.search('quoi\sça', text, flags=0) or re.search('quoi\son', text, flags=0):
            list[3] = 'SV'
            list[4] = 'formed'
            sv_count += 1
        elif re.search('^quoi\squ\'', text, flags=0):
            list[3] = 'que'

    elif text.endswith('quoi ?'):
        list[2] = 'in situ'
        list[3] = 'SV'
        list[4] = 'final'
        in_situ_count += 1
        
    elif text.endswith('quoi alors ?') or text.endswith('quoi ça ?') or text.endswith('quoi madame ?') or text.endswith('quoi monsieur ?') or text.endswith('quoi vous ?'):
        list[2] = 'in situ'
        list[3] = 'SV'
        list[4] = 'non final'
        in_situ_count += 1

# print metrics
print(f'{indirect_object_count} occurrences of indirect object \'quoi\' found.')
print(f'{string_count} occurrences of \'est-ce que\' found.')
print(f'{string2_count} occurrences of \'c\'est\' found.')
print(f'{ex_situ_count} occurrences of \'quoi\' ex situ found.')
print(f'{in_situ_count} occurrences of \'quoi\' in situ found.')
print(f'{fragment_count} occurrences of fragments found.')
print(f'{sv_count} occurrences of SV ordering found.')

922 occurrences of indirect object 'quoi' found.
147 occurrences of 'est-ce que' found.
348 occurrences of 'c'est' found.
416 occurrences of 'quoi' ex situ found.
979 occurrences of 'quoi' in situ found.
112 occurrences of fragments found.
71 occurrences of SV ordering found.


We will now create a new .xlsx file with all the classified data.

In [16]:
# creates spreadsheet using each nested list as a row

df = pd.DataFrame(data=all_rows)

# converts into excel
df.to_excel("quoi_all_rows_classified.xlsx", index=False)

print("Dictionary converted into excel...")

Dictionary converted into excel...


In [18]:
# opens the new excel file and reads it

file2 = 'quoi_all_rows_classified.xls'
data2 = pd.ExcelFile(file2)

In [None]:
# parses the sheet into a data frame to show the column structure within the file

data_frame2 = data2.parse('Sheet1')
data_frame2.info

# only shows the first 60 rows
data_frame2.head(60)