This code is to classify the selected examples automatically for those criteria that allow automatic classification.

In [25]:
import pandas as pd
import xlrd
import re
from re import search

In [26]:
# opens the excel sheet and reads it
# excel sheet needs to be in the same working directory as this file
# requires installation of xlrd >= 1.0.0 for Excel support (conversion to .xlsx required by openpyxl)

file = 'selected_rows_sorted.xls'
data = pd.ExcelFile(file)

 # returns the all the sheet names within the excel file
print(data.sheet_names)

['Sheet1']


In [27]:
# parses the sheet into a data frame to show the column structure within the file

data_frame = data.parse('Sheet1')
data_frame.info

# only shows the first 10 rows
data_frame.head(10)

Unnamed: 0,recherche,occurrence,wh,int,other,segment,enregistrement,transcription,locuteur,debut (ms),fin (ms),debut (hh:mm:ss),fin (hh:mm:ss)
0,Mot exact,comment,,,,comment vous voyez ça ?,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,302683,303581,0.003495,0.003507
1,Mot exact,comment,,,,et comment est-ce que ce contrôle devrait se ?,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,2973744,2976606,0.03441,0.034444
2,Mot exact,comment,,,,comment ça ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,148486,149249,0.001713,0.001725
3,Mot exact,comment,,,,comment dirais-je ? malpolis malhonnêtes oui,ESLO1_ENT_002,ESLO1_ENT_002_C,DE744,699597,702487,0.00809,0.008125
4,Mot exact,comment,,,,votre profession ça s'appelle comment ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,785852,787593,0.009086,0.009109
5,Mot exact,comment,,,,c'est triste comment est-ce que vous expl- vou...,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,1486866,1489410,0.017199,0.017234
6,Mot exact,comment,,,,aussi bien au point de vue euh comment dirais-...,ESLO1_ENT_002,ESLO1_ENT_002_C,DE744,4271706,4283592,0.049433,0.049572
7,Mot exact,comment,,,,comment les Maisons de Culture en général ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,4337408,4339130,0.050197,0.05022
8,Mot exact,comment,,,,est-ce que je peux vous demander euh comment v...,ESLO1_ENT_003,ESLO1_ENT_003_C,JR,1087017,1099316,0.012581,0.01272
9,Mot exact,comment,,,,comment ?,ESLO1_ENT_003,ESLO1_ENT_003_C,DJ39,1366017,1366517,0.01581,0.01581


In [28]:
# reads in the spreadsheet data

workbook = xlrd.open_workbook('selected_rows_sorted.xls')
sheet = workbook.sheet_by_name('Sheet1')

# gets the first sheet
sheet_1 = workbook.sheet_by_index(0)

row_count = sheet.nrows
col_count = sheet.ncols
print(f'Total rows: {row_count}\nTotal columns: {col_count}')

Total rows: 2297
Total columns: 13


We're now going to create a new list of lists out of the file, and populate the empty columns with as much data as possible.

The data that should be easy to classify automatically are: est-ce que / wh- at the beginning or end of sentence / presence of c'est.

In [29]:
# initialises list of lists
all_rows = []

for rx in range(sheet.nrows): # rx is an int, row is a list 
    row = sheet.row(rx)

    # initialises list that will contain our data
    data_list = []

    # populates list that stores all rows
    data_list.append(row[0].value)
    data_list.append(row[1].value)
    data_list.append(row[2].value) # wh
    data_list.append(row[3].value) # int
    data_list.append(row[4].value) # other
    data_list.append(row[5].value) # segment
    data_list.append(row[6].value)
    data_list.append(row[7].value)
    data_list.append(row[8].value)
    data_list.append(row[9].value)
    data_list.append(row[10].value)
    data_list.append(row[11].value)
    data_list.append(row[12].value)

    all_rows.append(data_list)
    # print(all_rows)
    
# prints extracted rows
print(f'We created a list that embeds {len(all_rows)} lists.')

We created a list that embeds 2297 lists.


In [30]:
# iterates through list of lists and populates empty cells

wh = 'comment'
string = "est-ce"
string_count = 0
string2 = 'c\'est'
string2_count = 0

ex_situ_count = 0
in_situ_count = 0
fragment_count = 0

for list in all_rows:
    text = list[5] # index 5 is where text of interest is stored, returns a cell

    if search(string, text):
        list[2] = 'ex situ'
        list[3] = 'est-ce que'
        string_count += 1
    elif search(string2, text):
        list[3] = 'cleft'
        string2_count += 1

    if re.search('^comment\s', text, flags=0):
        list[2] = 'ex situ'
        ex_situ_count += 1

        if search(string2, text):
            list[3] = 'cleft'
        elif text.endswith('comment ?') or text.endswith('comment ça ?'):
            list[2] = 'wh'
            list[3] = 'none'
            list[4] = 'fragment'
            fragment_count += 1

    elif text.endswith('comment ?'):
        list[2] = 'in situ'
        list[3] = 'SV'
        list[4] = 'final'
        in_situ_count += 1

    # print(list)

print(f'{string_count} occurrences of \'est-ce que\' found.')
print(f'{string2_count} occurrences of \'c\'est\' found.')
print(f'{ex_situ_count} occurrences of \'comment\' ex situ found.')
print(f'{in_situ_count} occurrences of \'comment\' in situ found.')
print(f'{fragment_count} occurrences of fragments found.')

435 occurrences of 'est-ce que' found.
101 occurrences of 'c'est' found.
1157 occurrences of 'comment' ex situ found.
189 occurrences of 'comment' in situ found.
370 occurrences of fragments found.


We will now create a new .xlsx file with all the classified data.

In [31]:
# creates spreadsheet using each nested list as a row

df = pd.DataFrame(data=all_rows)

# converts into excel
df.to_excel("all_rows_classified.xlsx", index=False)

print("Dictionary converted into excel...")

Dictionary converted into excel...


In [32]:
# opens the new excel sheet and reads it
# I created a .xls copy of the .xlsx file manually otherwise it won't open

file2 = 'all_rows_classified.xls'
data2 = pd.ExcelFile(file2)

# returns the all the sheet names within the excel file
print(data2.sheet_names)

['Sheet1']


In [33]:
# parses the sheet into a data frame to show the column structure within the file

data_frame2 = data2.parse('Sheet1')
data_frame2.info

# only shows the first 10 rows
data_frame2.head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,recherche,occurrence,wh,int,other,segment,enregistrement,transcription,locuteur,debut (ms),fin (ms),debut (hh:mm:ss),fin (hh:mm:ss)
1,Mot exact,comment,ex situ,,,comment vous voyez ça ?,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,302683,303581,0.003495,0.003507
2,Mot exact,comment,ex situ,est-ce que,,et comment est-ce que ce contrôle devrait se ?,ESLO1_ENT_001,ESLO1_ENT_001_C,OU,2973744,2976606,0.03441,0.034444
3,Mot exact,comment,wh,none,fragment,comment ça ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,148486,149249,0.001713,0.001725
4,Mot exact,comment,ex situ,,,comment dirais-je ? malpolis malhonnêtes oui,ESLO1_ENT_002,ESLO1_ENT_002_C,DE744,699597,702487,0.00809,0.008125
5,Mot exact,comment,in situ,SV,final,votre profession ça s'appelle comment ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,785852,787593,0.009086,0.009109
6,Mot exact,comment,ex situ,est-ce que,,c'est triste comment est-ce que vous expl- vou...,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,1486866,1489410,0.017199,0.017234
7,Mot exact,comment,,,,aussi bien au point de vue euh comment dirais-...,ESLO1_ENT_002,ESLO1_ENT_002_C,DE744,4271706,4283592,0.049433,0.049572
8,Mot exact,comment,ex situ,,,comment les Maisons de Culture en général ?,ESLO1_ENT_002,ESLO1_ENT_002_C,JR,4337408,4339130,0.050197,0.05022
9,Mot exact,comment,ex situ,est-ce que,,est-ce que je peux vous demander euh comment v...,ESLO1_ENT_003,ESLO1_ENT_003_C,JR,1087017,1099316,0.012581,0.01272
