In [None]:
# This Dataset is supposed to return a speech divided into paragraphs as well as the party.

class ParagraphDataset(Dataset):
    """Speeches as sequences of paragraph embeddings, labelled with a party index.

    The dataset covers ``speeches[start_index:stop_index]``. Every paragraph of
    those speeches is embedded (or the embeddings are loaded from a cache file)
    and the flat embedding list is regrouped so that item ``i`` holds the
    embeddings of the paragraphs of speech ``i``.

    Args:
        speeches: sequence of speeches, each a list of paragraph strings.
        parties: sequence of party abbreviations, aligned with ``speeches``.
        start_index, stop_index: slice bounds selecting the speeches to use.
        embedding_model: object with an ``encode(texts, show_progress_bar=...)``
            method returning one embedding per text. When ``None`` the
            embeddings are loaded from ``file_name`` instead.
        file_name: cache path. Freshly computed embeddings are saved here
            (skipped when empty); cached embeddings are read from here.

    Raises:
        ValueError: if a party is not one of the known abbreviations, or if the
            number of embeddings does not match the number of paragraphs
            (typically a cache built from different data).
    """

    def __init__(self,speeches,parties,start_index,stop_index,embedding_model=None,file_name=""):
        assert len(speeches) == len(parties)
        # The position of a party in this list is its class index.
        self.party_order = ["S","M","SD","V","MP","C","KD","L"]
        self.party_indices = {p: i for i, p in enumerate(self.party_order)}
        self.embeddings_file_name = file_name
        print(self.party_indices)

        # Slice once and derive the length from the slice itself, so that
        # __len__ stays correct when stop_index runs past the end of the data.
        self.speech_texts = list(speeches[start_index:stop_index])
        self.party_letters = parties[start_index:stop_index]
        self.length = len(self.speech_texts)

        unknown = [p for p in self.party_letters if p not in self.party_indices]
        if unknown:
            raise ValueError("Unknown party label(s): " + repr(unknown[:5]))
        self.party_numbers = torch.tensor(
            [self.party_indices[p] for p in self.party_letters], dtype=torch.long
        )

        # Flatten to one list so the model can encode everything in one call.
        paragraph_list = [pa for sp in self.speech_texts for pa in sp]
        no_paragraphs = len(paragraph_list)
        print(self.length)
        print("The amount of paragraphs are", str(no_paragraphs))

        if embedding_model is not None:
            print("Encoding sentences...")
            paragraph_embeddings = embedding_model.encode(paragraph_list,show_progress_bar=True)
            # Save the data to load later (only when a cache path was given).
            if self.embeddings_file_name:
                torch.save(paragraph_embeddings,f=self.embeddings_file_name)
        else:
            print("Loading cached embeddings from", str(self.embeddings_file_name))
            # NOTE: torch.load unpickles the file; only load caches you trust.
            paragraph_embeddings = torch.load(self.embeddings_file_name)

        # The cache is purely positional, so a size mismatch means the
        # embeddings do not belong to these paragraphs.
        if len(paragraph_embeddings) != no_paragraphs:
            raise ValueError(
                "Expected " + str(no_paragraphs) + " paragraph embeddings but got "
                + str(len(paragraph_embeddings))
            )

        # Regroup the flat embedding list into one list per speech.
        self.speeches = []
        index = 0
        for sp in self.speech_texts:
            rows = []
            for _ in sp:
                rows.append(paragraph_embeddings[index])
                index += 1
            self.speeches.append(rows)
        print("Number of paragraphs: ",str(index))
        print("Number of parties: ",str(len(self.party_numbers)))
        print("Number of speeches: ",str(len(self.speeches)))


    def dump(self,filename):
        """Write party, party index and paragraph texts as JSON Lines (one object per line)."""
        with open(filename, "w") as f:
            for i in range(self.length):
                data = {"party":self.party_letters[i],"party number":int(self.party_numbers[i]),"speech":self.speech_texts[i]}
                # One compact object per line keeps the file parseable record by record.
                f.write(json.dumps(data))
                f.write("\n")


    def __len__(self):
        """Number of speeches in the dataset."""
        return self.length

    # Get speech at a certain index
    def __getitem__(self, index):
        """Return ``(embeddings, party)`` for the speech at ``index``.

        ``embeddings`` is a float32 tensor with one row per paragraph (assuming
        each stored embedding is a 1-D vector); ``party`` is a 0-dim long tensor.
        """
        rows = self.speeches[index]
        party = self.party_numbers[index]
        if len(rows) == 0:
            return torch.empty(0), party
        # as_tensor accepts numpy rows, tensor rows and plain lists alike.
        speech = torch.stack([torch.as_tensor(r, dtype=torch.float32) for r in rows])
        return speech, party

In [None]:
speeches = []
parties = []
all_sentences = []  # stays empty: sentence splitting is currently disabled
all_paragraphs = []

possible_parties = ["S","M","SD","V","MP","C","KD","L"]

# The directories where data is located
data_dirs = ["speech_data/19_20/","speech_data/20_21/","speech_data/21_22/","speech_data/22_23/"]
# year_indices[k] is the index in `speeches` of the first speech read from data_dirs[k]
year_indices = [0] * len(data_dirs)
data = []
for year, data_dir in enumerate(data_dirs):
    year_indices[year] = len(speeches)
    # os.listdir returns entries in arbitrary order. Sorting makes the speech
    # order reproducible, which matters because splits and cached embeddings
    # are addressed by position. Embedding caches built from an unsorted
    # listing must be regenerated.
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith(".json"):
            continue
        # utf-8-sig reads UTF-8 with or without a byte-order mark.
        with open(os.path.join(data_dir, file), "r", encoding="utf-8-sig") as f:
            data = json.load(f)
        speech = data["anforande"]["anforandetext"]
        party = data["anforande"]["parti"]
        # Keep only speeches by one of the listed parties that actually have text.
        if party not in possible_parties or not speech:
            continue
        soup = BeautifulSoup(speech, 'html.parser')
        paragraphs = []
        for paragraph in soup.find_all("p"):
            text = paragraph.get_text()
            if text != "":
                paragraphs.append(text)
        if paragraphs:
            all_paragraphs.extend(paragraphs)
            parties.append(party)
            speeches.append(paragraphs)

print(year_indices)
assert len(parties) == len(speeches)

In [None]:
def collate_fn(data):
    """Batch (sequence, label) pairs, left-padding the sequences with zeros.

    Each sequence is reversed along its first axis, the batch is right-padded
    by ``pad_sequence``, and the padded batch is reversed back along the
    sequence axis. The net effect is that shorter sequences get their zeros in
    front while keeping their original order.

    Returns:
        A ``(padded_batch, labels)`` tuple; ``padded_batch`` is batch-first.
    """
    reversed_sequences = []
    labels = []
    for sample in data:
        reversed_sequences.append(torch.flip(sample[0], [0]))
        labels.append(sample[1])
    right_padded = torch.nn.utils.rnn.pad_sequence(reversed_sequences, batch_first=True)
    left_padded = torch.flip(right_padded, [1])
    return left_padded, torch.tensor(labels)

In [None]:
class SpeechClassifier(nn.Module):
    """Sequence classifier built on a 2-layer bidirectional LSTM.

    The final hidden states of every layer and direction (2 layers x 2
    directions = 4 vectors of size ``hidden_size``) are concatenated, passed
    through dropout and projected to ``output_size`` values by a linear layer.

    Args:
        input_size: size of each element of the input sequence.
        hidden_size: hidden size of the LSTM, per layer and direction.
        output_size: number of output values (e.g. classes).
        batch_size: stored on the instance; not used by the model itself.
    """

    def __init__(self, input_size, hidden_size, output_size, batch_size):
        super(SpeechClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True, num_layers=2, dropout=0.3)
        self.dropout = nn.Dropout(0.5)
        # 4 = num_layers (2) * num_directions (2) final hidden states.
        self.fc = nn.Linear(4*hidden_size, output_size)
        self.batch_size = batch_size

    def forward(self,x):
        """Map a batch-first input ``(batch, seq_len, input_size)`` to ``(batch, output_size)``.

        When ``output_size == 1`` the trailing dimension is dropped, giving
        shape ``(batch,)``. The batch dimension is always kept, so a batch
        holding a single sequence still works with batched loss functions.
        """
        _, (h, _) = self.lstm(x)
        # h is (num_layers * num_directions, batch, hidden); move batch first
        # and lay the four final states side by side.
        batch_size = h.shape[1]
        h = torch.permute(h,(1,0,2))
        h = torch.reshape(h,(batch_size,self.hidden_size*4))
        h = self.dropout(h)
        output = self.fc(h)
        if self.output_size == 1:
            return output.squeeze(-1)
        return output


In [None]:
from tqdm import tqdm

optimizer = torch.optim.Adam(speech_classifier.parameters())
epochs = 50
# OneCycleLR is stepped once per batch, so it is told the total number of
# steps up front via steps_per_epoch * epochs.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.00033, steps_per_epoch=len(train_loader), epochs=epochs)

# Histories with one entry per batch (not per epoch).
train_losses = []
learning_rates = []
val_losses = []

for epoch in tqdm(range(epochs)):
    # --- Train ---
    speech_classifier.train()
    epoch_train_start = len(train_losses)
    for X_train, y_train in train_loader:
        optimizer.zero_grad()
        y_pred = speech_classifier(X_train)
        loss = loss_fn(y_pred, y_train)
        loss.backward()
        optimizer.step()
        scheduler.step()
        learning_rates.append(scheduler.get_last_lr()[0])
        train_losses.append(loss.item())

    # --- Evaluate ---
    speech_classifier.eval()
    epoch_val_start = len(val_losses)
    # No gradients are needed for validation; skipping them saves memory and time.
    with torch.no_grad():
        for X_val, y_val in val_loader:
            y_pred = speech_classifier(X_val)
            loss = loss_fn(y_pred, y_val)
            val_losses.append(loss.item())

    # Summarise this epoch only: the histories hold one loss per batch, so
    # average over the batches recorded since the epoch started.
    epoch_train_losses = train_losses[epoch_train_start:]
    epoch_val_losses = val_losses[epoch_val_start:]
    clear_output()
    print("Train loss: "+str(sum(epoch_train_losses)/max(len(epoch_train_losses), 1)))
    print("Validation loss: "+str(sum(epoch_val_losses)/max(len(epoch_val_losses), 1)))
    print("Learning rate: "+str(learning_rates[-1]))
    
    