/
align_ibm1.m
executable file
·223 lines (185 loc) · 6.83 KB
/
align_ibm1.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
function AM = align_ibm1(trainDir, numSentences, maxIter, fn_AM)
%
% align_ibm1
%
% This function implements the training of the IBM-1 word alignment algorithm.
% We assume that we are implementing P(foreign|english)
%
% INPUTS:
%
% dataDir : (directory name) The top-level directory containing
% data from which to train or decode
% e.g., '/u/cs401/A2_SMT/data/Toy/'
% numSentences : (integer) The maximum number of training sentences to
% consider.
% maxIter : (integer) The maximum number of iterations of the EM
% algorithm.
% fn_AM : (filename) the location to save the alignment model,
% once trained.
%
% OUTPUT:
% AM : (variable) a specialized alignment model structure
%
%
% The file fn_AM must contain the data structure called 'AM', which is a
% structure of structures where AM.(english_word).(foreign_word) is the
% computed expectation that foreign_word is produced by english_word
%
% e.g., LM.house.maison = 0.5 % TODO
%
% Template (c) 2011 Jackie C.K. Cheung and Frank Rudzicz
% AM = align_ibm1('/h/u6/g0/00/g0quachb/Desktop/A2-401/testfiles/', 3, 20,
% '/h/u6/g0/00/g0quachb/Desktop/A2-401/testfiles/fn_AM')
global CSC401_A2_DEFNS
AM = struct();
% Read in the training data
[eng, fre] = read_hansard(trainDir, numSentences);
% Initialize AM uniformly
AM = initialize(eng, fre);
% Iterate between E and M steps
for iter=1:maxIter,
AM = em_step(AM, eng, fre);
end
% Save the alignment model
save( fn_AM, 'AM', '-mat');
end
% --------------------------------------------------------------------------------
%
% Support functions
%
% --------------------------------------------------------------------------------
function [eng, fre] = read_hansard(dir_path, numSentences)
%
% Read 'numSentences' parallel sentences from texts in the 'dir' directory.
%
% Important: Be sure to preprocess those texts!
%
% Remember that the i^th line in fubar.e corresponds to the i^th line in fubar.f
% You can decide what form variables 'eng' and 'fre' take, although it may be easiest
% if both 'eng' and 'fre' are cell-arrays of cell-arrays, where the i^th element of
% 'eng', for example, is a cell-array of words that you can produce with
%
% eng{i} = strsplit(' ', preprocess(english_sentence, 'e'));
%
%
% dir_path = '/u/cs401/A2_SMT/data/Hansard/Training/';
% numSentences = 100
eng = {};
fre = {};
% TODO: your code goes here.
readSent = 0;
data_dir_eng = dir([dir_path, './*e']);
length_of_DD = length(data_dir_eng);
for k=1:length_of_DD
if not(data_dir_eng(k).isdir)
curr_file_eng = data_dir_eng(k).name;
curr_file_fre = [curr_file_eng(1:length(curr_file_eng) - 1), 'f'];
eng_data = textread([dir_path, curr_file_eng], '%s', 'delimiter', '\n');
fre_data = textread([dir_path, curr_file_fre], '%s', 'delimiter', '\n');
for i=1:length(eng_data)
readSent = readSent + 1;
eng{readSent} = strsplit(' ', preprocess(char(eng_data(i)), 'e'));
fre{readSent} = strsplit(' ', preprocess(char(fre_data(i)), 'f'));
if readSent == numSentences
return
end
end
end
end
end
function AM = initialize(eng, fre)
%
% Initialize alignment model uniformly.
% Only set non-zero probabilities where word pairs appear in corresponding sentences.
%
AM = {}; % AM.(english_word).(foreign_word)
% Initialize AM
len = length(eng);
for k=1:len
eng_cell_sent = eng{k};
fre_cell_sent = fre{k};
len_eng = length(eng_cell_sent);
len_fre = length(fre_cell_sent);
for e=1:len_eng
curr_eword = char(eng_cell_sent(e));
if not(isfield(AM, curr_eword));
AM.(curr_eword) = {};
end
for f=1:len_fre
curr_fword = char(fre_cell_sent{f});
if not(isfield(AM.(curr_eword), curr_fword))
AM.(curr_eword).(curr_fword) = 0;
end
end
end
end
% Convert to vectors of possible french words to uniform probabilities.
eng_fields = fieldnames(AM);
eng_field_len = length(eng_fields);
for a=1:eng_field_len
curr_eng_field = eng_fields{a};
fre_fields = fieldnames(AM.(curr_eng_field));
fre_field_len = length(fre_fields);
for b=1:fre_field_len
curr_fre_field = fre_fields{b};
AM.(curr_eng_field).(curr_fre_field) = 1 / fre_field_len;
end
end
return
end
function AM = em_step(AM, eng, fre)
%
% One step in the EM algorithm.
%
tcount = AM;
total = {};
% set tcount(f, e) to 0 for all f, e
eng_fields = fieldnames(tcount);
eng_field_len = length(eng_fields);
for a=1:eng_field_len
curr_eng_field = eng_fields{a};
% set total(e) to 0 for all e
total.(curr_eng_field) = 0;
fre_fields = fieldnames(tcount.(curr_eng_field));
fre_field_len = length(fre_fields);
for b=1:fre_field_len
curr_fre_field = fre_fields{b};
tcount.(curr_eng_field).(curr_fre_field) = 0;
end
end
len_eng = length(eng);
% a is sentence index.
for a=1:len_eng
unique_eng_sent = unique(eng{a});
unique_fre_sent = unique(fre{a});
len_uniq_eng = length(unique_eng_sent);
len_uniq_fre = length(unique_fre_sent);
for uf_ind=1:len_uniq_fre
curr_fre_word = char(unique_fre_sent{uf_ind});
denom_c = 0;
countf = length(find(ismember(fre{a}, cellstr(curr_fre_word))));
for ue_ind=1:len_uniq_eng
curr_eng_word = char(unique_eng_sent{ue_ind});
addon = AM.(curr_eng_word).(curr_fre_word) * countf;
denom_c = denom_c + addon;
end
for ue_ind=1:len_uniq_eng
curr_eng_word = char(unique_eng_sent{ue_ind});
counte = length(find(ismember(eng{a}, cellstr(curr_eng_word))));
addon = AM.(curr_eng_word).(curr_fre_word) * countf * counte / denom_c;
tcount.(curr_eng_word).(curr_fre_word) = tcount.(curr_eng_word).(curr_fre_word) + addon;
total.(curr_eng_word) = total.(curr_eng_word) + addon;
end
end
end
for e=1:eng_field_len
curr_eng_word = char(eng_fields{e});
fre_fields = fieldnames(tcount.(curr_eng_word));
len_of_fre_fields = length(fre_fields);
for f=1:len_of_fre_fields
curr_fre_word = char(fre_fields{f});
AM.(curr_eng_word).(curr_fre_word) = tcount.(curr_eng_word).(curr_fre_word) / total.(curr_eng_word);
end
end
return
end