In [2]:
using Statistics
using Metrics
using Knet

In [3]:
# Reading data for sentiment classification
# it's already divided into train/dev/test

"""
    read_sentiment_data(file_path)

Read a sentiment file line by line and return `(data, labels)`.

Each line is lowercased, the punctuation characters `, . : ; ? ! ( )` are removed,
tabs are turned into spaces and the result is split on single spaces. From the
resulting tokens the second-to-last one is stored as the label and tokens
`2:end-2` are stored as the sentence; the first and the last token are discarded
(presumably an id column and a trailing empty field - confirm against the data
files).

Lines that yield fewer than two tokens (for example blank lines) cannot carry a
label and are skipped instead of aborting the whole read.

# Returns
- `data`: one vector of tokens per accepted line.
- `labels`: the label token of each accepted line, in the same order.

If `file_path` is not an existing file, "file doesn't exist" is printed and two
empty vectors are returned. Any other error raised while reading is logged with
its real cause and the lines collected so far are returned.
"""
function read_sentiment_data(file_path)
    labels = SubString{String}[]
    data = Vector{SubString{String}}[]

    if !isfile(file_path)
        println("file doesn't exist")
        return data, labels
    end

    try
        for line in eachline(file_path)
            cleaned = replace(lowercase(line), r"[,.:;?!()]" => "")
            tokens = split(replace(cleaned, '\t' => ' '), " ")
            # `tokens[end-1]` needs at least two tokens; a blank line gives one.
            length(tokens) < 2 && continue
            push!(labels, tokens[end-1])
            push!(data, tokens[2:end-2])
        end
    catch err
        # Keep the best-effort contract (return what was read) but report the
        # actual failure rather than claiming the file is missing.
        @warn "error while reading sentiment data" file_path exception = (err, catch_backtrace())
    end
    return data, labels
end

# Absolute location of the Malayalam training file on the author's machine.
# `path` is a notebook-global that later cells reassign for their own datasets.
path = raw"C:\Users\brkcn\HIT-ACL2021-Codemixed-Representation\data\malayalam\malayalam_train.tsv"
# Alternative inputs; uncomment one of these lines to load it instead.
#path = raw"C:\Users\brkcn\HIT-ACL2021-Codemixed-Representation\data\tamil\tamil_train.tsv"
#path = raw"C:\Users\brkcn\HIT-ACL2021-Codemixed-Representation\data\hindi_sentiment\IIITH_Codemixed.txt"
# Load token lists and label strings; the trailing `;` suppresses the cell echo.
sentiment_data, sentiment_labels = read_sentiment_data(path);



In [4]:
# Reading data for machine translation
# it's already divided into train/dev/test
"""
    read_mt_data(file_path)

Read a machine-translation text file and return one token vector per line.

Each line is lowercased, the punctuation characters `, . : ; ? ! ( )` are removed,
tabs are turned into spaces and the result is split on single spaces. Every line
produces an entry, so a blank line yields a vector holding one empty token.

If `file_path` is not an existing file, "file doesn't exist" is printed and an
empty vector is returned. Any other error raised while reading is logged with
its real cause and the lines collected so far are returned.
"""
function read_mt_data(file_path)
    data = Vector{SubString{String}}[]

    if !isfile(file_path)
        println("file doesn't exist")
        return data
    end

    try
        for line in eachline(file_path)
            cleaned = replace(lowercase(line), r"[,.:;?!()]" => "")
            push!(data, split(replace(cleaned, '\t' => ' '), " "))
        end
    catch err
        # Keep the best-effort contract (return what was read) but report the
        # actual failure rather than claiming the file is missing.
        @warn "error while reading machine translation data" file_path exception = (err, catch_backtrace())
    end
    return data
end

# Source-side training file for the machine translation task (author's machine).
# This rebinds the notebook-global `path` used by the previous cell.
path = raw"C:\Users\brkcn\HIT-ACL2021-Codemixed-Representation\data\IITPatna-CodeMixedMT\train.src"
# One token vector per line; the trailing `;` suppresses the cell echo.
mt_data= read_mt_data(path);


In [5]:
# Reading data for pos tagging
"""
    read_pos_tagging_data(file_path)

Read a POS tagging file and return `(words, langs, tags)`.

Empty lines and lines whose first character is `@` are ignored. Every other line
has its tabs turned into spaces and is split on single spaces; the first three
fields are appended to `words`, `langs` and `tags` respectively.

Lines with fewer than three fields are skipped and counted instead of aborting
the read, and a single warning reports how many were dropped.

If the file cannot be opened or another error occurs while reading, the error is
logged with its real cause and whatever was collected so far is returned.
"""
function read_pos_tagging_data(file_path)
    words = SubString{String}[]
    langs = SubString{String}[]
    tags = SubString{String}[]
    skipped = 0

    try
        for line in eachline(file_path)
            (isempty(line) || first(line) == '@') && continue
            fields = split(replace(line, '\t' => ' '), " ")
            # Indexing fields[3] on a short line would throw and end the read.
            if length(fields) < 3
                skipped += 1
                continue
            end
            push!(words, fields[1])
            push!(langs, fields[2])
            push!(tags, fields[3])
        end
    catch err
        # Best-effort: keep what was read, but surface the actual exception.
        @warn "error during reading pos tagging data" file_path exception = (err, catch_backtrace())
    end

    skipped > 0 && @warn "skipped malformed pos tagging lines" file_path skipped
    return words, langs, tags
end
# Bengali POS tagging file on the author's machine; rebinds the global `path`.
path = raw"C:\Users\brkcn\HIT-ACL2021-Codemixed-Representation\data\bengali_POS\TWT_BN_EN_FN.txt"
# Three parallel vectors (word, language field, tag field), one entry per kept line.
pos_word, pos_langs, pos_tags = read_pos_tagging_data(path);



In [6]:
# Randomly predicting for now
"""
    pred_sentiment(X)

Placeholder predictor: return a `Vector{Any}` holding one integer drawn with
`rand(1:4)` for every element of `X`. The elements themselves are never inspected.
"""
function pred_sentiment(X)
    return Any[rand(1:4) for _ in X]
end


"""
    vec_labels(y)

Map every label in `y` to an integer class id and return them as a `Vector{Any}`:
`"positive"` becomes 1, `"negative"` 2, `"mixed_feelings"` 3 and anything else 4.
"""
function vec_labels(y)
    # Same `==` comparisons, in the same order, as an if/elseif chain.
    encode(label) =
        label == "positive" ? 1 :
        label == "negative" ? 2 :
        label == "mixed_feelings" ? 3 : 4
    return Any[encode(label) for label in y]
end



vec_labels (generic function with 1 method)

In [35]:
"""
    calculate_metrics(pred, y, num_classes)

Compute macro-averaged F1, precision and recall for integer class ids.

# Arguments
- `pred`: predicted class ids, each in `1:num_classes`.
- `y`: reference class ids, each in `1:num_classes`, same length as `pred`.
- `num_classes`: number of classes; sets the size of the confusion matrix.

# Returns
A tuple `(f1, precision, recall)`, each the unweighted mean over all classes.

A class whose precision or recall is undefined (it was never predicted, or never
occurs in `y`) contributes 0 to the respective mean instead of turning the whole
average into `NaN`. F1 is likewise 0 when precision and recall are both 0.

# Throws
- `DimensionMismatch` if `pred` and `y` differ in length.
"""
function calculate_metrics(pred, y, num_classes)
    length(pred) == length(y) || throw(DimensionMismatch(
        "pred has $(length(pred)) entries but y has $(length(y))"))

    # Rows index the predicted class, columns the reference class. `Int` is used
    # because an Int16 cell cannot hold counts above 32767.
    conf_mat = zeros(Int, num_classes, num_classes)
    for (p, t) in zip(pred, y)
        conf_mat[p, t] += 1
    end

    true_pos = [conf_mat[j, j] for j in 1:num_classes]
    actual_count = vec(sum(conf_mat; dims=1))      # column sums: occurrences in y
    predicted_count = vec(sum(conf_mat; dims=2))   # row sums: occurrences in pred

    # 0/0 would give NaN and poison the mean; score undefined ratios as 0.
    safe_ratio(num, den) = den == 0 ? 0.0 : num / den

    recall = safe_ratio.(true_pos, actual_count)
    precision = safe_ratio.(true_pos, predicted_count)
    f1_score = safe_ratio.(2 .* precision .* recall, precision .+ recall)
    return mean(f1_score), mean(precision), mean(recall)
end


# Score the random baseline on the sentiment data loaded in an earlier cell.
y = vec_labels(sentiment_labels)          # label strings -> class ids 1..4
pred_y = pred_sentiment(sentiment_data)   # one random class id per sentence
sentiment_f1_score, sentiment_precision, sentiment_recall =
    calculate_metrics(pred_y, y, 4)

# Predictions are random, so the printed numbers change from run to run.
println("sentiment f1 score: $sentiment_f1_score")
println("sentiment precision: $sentiment_precision")
println("sentiment recall: $sentiment_recall")

sentiment f1 score: 0.21899710633845815
sentiment precision: 0.24714519576778163
sentiment sentiment recall: 0.2553106750175261


In [42]:
# Testing BLEU and ROUGE metrics, to be used for the machine translation task later on.

# Try out `bleu_score` and `rouge` (presumably provided by the imported Metrics
# package) on two toy sentence pairs.
# NOTE(review): every sentence is passed as one whole string, not as a list of
# tokens - confirm against the Metrics.jl docs that this is the intended input.
ref_corpus = [["Example of bleu score"], ["This is an apple"]]   # one reference list per sentence
translated_corpus = ["Example to bleu score", "This no a apple"]
bleu_result = bleu_score(ref_corpus, translated_corpus)
println("bleu score: $(bleu_result[1])")   # only the first element of the result is printed

hypothesis = ["Example for bleu score", "This cz an apple"]
# `ref_corpus` is rebound here to a flat vector of strings for the `rouge` call.
ref_corpus = ["Example of bleu score", "This is an apple"]
println("rouge score dict:")
# Left as the last expression so the notebook displays the returned value.
rouge_out = rouge(hypothesis, ref_corpus)


bleu score: 0.7253666236200924
rogue score dict:


OrderedCollections.OrderedDict{String, Float64} with 9 entries:
  "rouge_1 / f_score" => 0.75
  "rouge_1 / r_score" => 0.75
  "rouge_1 / p_score" => 0.75
  "rouge_2 / f_score" => 0.333333
  "rouge_2 / r_score" => 0.333333
  "rouge_2 / p_score" => 0.333333
  "rouge_l / f_score" => 0.75
  "rouge_l / r_score" => 0.75
  "rouge_l / p_score" => 0.75