In [1]:
from dataset.persent import PerSenTDataset
from dataset.multiemo import MultiEmoDataset
from os.path import join, isfile
import pandas as pd
from tqdm import tqdm

In [2]:
datadir = "../data/PerSenT"
train_filepath = "train.csv"
val_filepath = "dev.csv"
test_filepath = "fixed_test.csv"

dataset = PerSenTDataset(join(datadir,train_filepath))
df_persent = pd.DataFrame({"text":dataset.X, "label": dataset.Y})
df_persent

Unnamed: 0,text,label
0,Germany's Landesbank Baden Wuertemberg won EU ...,Negative
1,The Philippine National Police (PNP) identifie...,Neutral
2,Sirleaf 70 acknowledged before the commissio...,Negative
3,Sawyer logged off and asked her sister Mari ...,Neutral
4,Candi Holyfield said in the protective order t...,Neutral
...,...,...
3350,The Tampa Bay Times reports that 32-year-old ...,Neutral
3351,In a full-throated denunciation of the work ...,Neutral
3352,(CNN) The elevation of Emmerson Mnangagwa to t...,Negative
3353,What is known for sure about American militar...,Positive


In [3]:
dataset_val = PerSenTDataset(join(datadir,val_filepath))
df_val_persent = pd.DataFrame({"text":dataset_val.X, "label": dataset_val.Y})
df_val_persent

Unnamed: 0,text,label
0,In 2006 Benjamin Koellmann bought a condomini...,Neutral
1,Lugo a former Catholic bishop who assumed off...,Positive
2,Spanish Wimbledon winner Rafael Nadal said Sun...,Positive
3,In a letter posted on the White House web site...,Positive
4,TAMPA At least Raheem Morris finally has the ...,Positive
...,...,...
573,In the space of four days Harvey Weinstein ...,Neutral
574,Weâll get to the merits of the charges and c...,Negative
575,Russia âs president Vladimir Putin wanted t...,Negative
576,All five living former US presidents are teami...,Negative


In [4]:
dataset_test = PerSenTDataset(join(datadir,test_filepath))
df_test_persent = pd.DataFrame({"text":dataset_test.X, "label": dataset_test.Y})
df_test_persent

Unnamed: 0,text,label
0,term extension of government funding that woul...,Neutral
1,At a press conference in June a reporter ask...,Neutral
2,Trump Xi present united front despite differ...,Positive
3,The Latest on President Donald Trump in Florid...,Neutral
4,“ I ’ m thrilled with the progress that Presid...,Positive
...,...,...
822,"WASHINGTON (AP) â Donald Trump 's ""opaque"" ...",Negative
823,President Donald Trump's rally in Missouri thi...,Positive
824,WASHINGTON (Reuters) - Major automakers suppl...,Positive
825,President Donald Trump says the Justice Depar...,Negative


In [5]:
dataset.class_proportion(), dataset_val.class_proportion(), dataset_test.class_proportion()

(tensor([0.5240, 0.1046, 0.3714]),
 tensor([0.5260, 0.1003, 0.3737]),
 tensor([0.4450, 0.1681, 0.3869]))

In [6]:
full = pd.concat([df_persent, df_val_persent, df_test_persent], axis=0)
full

Unnamed: 0,text,label
0,Germany's Landesbank Baden Wuertemberg won EU ...,Negative
1,The Philippine National Police (PNP) identifie...,Neutral
2,Sirleaf 70 acknowledged before the commissio...,Negative
3,Sawyer logged off and asked her sister Mari ...,Neutral
4,Candi Holyfield said in the protective order t...,Neutral
...,...,...
822,"WASHINGTON (AP) â Donald Trump 's ""opaque"" ...",Negative
823,President Donald Trump's rally in Missouri thi...,Positive
824,WASHINGTON (Reuters) - Major automakers suppl...,Positive
825,President Donald Trump says the Justice Depar...,Negative


In [7]:
lengths = full["text"].apply(lambda x: len(x.split(" ")))
lengths.mean(), lengths.std()

(376.84558823529414, 392.6334206972245)

In [8]:
persent_class_quantity = dataset.class_proportion().min()*len(dataset)
persent_class_quantity

tensor(351.)

In [9]:
persent_short = df_persent.groupby('label').apply(lambda x: x.sample(int(persent_class_quantity))).reset_index(drop=True)
persent_short

Unnamed: 0,text,label
0,Bernie Sanders is . \nDemocrats have rallied t...,Negative
1,Many fascinating ― and at times disturbing...,Negative
2,DUBAI (Reuters) - Yemenâs Houthi group has b...,Negative
3,Nearly 10 months Â­after a federal probe into ...,Negative
4,Democrats on Wednesday questioned again whethe...,Negative
...,...,...
1048,IMF chief mission Thomas R. Rumbaugh said at t...,Positive
1049,Story highlights The source said Miller has ...,Positive
1050,Even after Senate Republicans scrapped the vot...,Positive
1051,Mr. Kasanoff started Threshold in 1992. Former...,Positive


In [10]:
persent_short.to_csv(join(datadir, "short_train.csv"), index=False)

In [11]:
datadir2 = "../data/multiemo2"
train_filepath2 = "all.text.train.en.txt"
val_filepath2 = "all.text.dev.en.txt"
test_filepath2 = "all.text.test.en.txt"
dataset2 = MultiEmoDataset(join(datadir2,train_filepath2))
df_multiemo = pd.DataFrame({"text":dataset2.X, "label": dataset2.Y})
df_multiemo

Unnamed: 0,text,label
0,"At the very entrance, the hotel stinks. There ...",minus_m
1,With my children (10 years old and 2 years old...,plus_m
2,I wanted to get 'em a set for basic work somet...,minus_m
3,Dear Mom. I have the same opinion about this b...,minus_m
4,"Actually, I didn't find out anything to em, du...",minus_m
...,...,...
6567,"Hotel with very good food, once you get there ...",plus_m
6568,I usually give few reviews because I don't hav...,plus_m
6569,It is hard to believe that at the fingertips o...,minus_m
6570,I was with this doctor because my attending do...,minus_m


In [12]:
val_dataset2 = MultiEmoDataset(join(datadir2,val_filepath2))
df_val_multiemo = pd.DataFrame({"text":val_dataset2.X, "label": val_dataset2.Y})
df_val_multiemo

Unnamed: 0,text,label
0,I recommend the Scaliano Spa with all responsi...,plus_m
1,"Typical family hotel. Spacious apartments, wit...",plus_m
2,"The bottom ! ! ! A muddy swimming pool, black ...",minus_m
3,"Hotel itself, service, food revelation. Additi...",plus_m
4,Bioethicist prof . Jacek Hołówka from the Comm...,zero
...,...,...
818,"Dr. Szeliga knows her stuff, calmly and accura...",plus_m
819,"When a friend recommended me to Mrs. Karolina,...",plus_m
820,I don't know how you can take that doctor's ha...,minus_m
821,"From the first moment you stay here, you can f...",plus_m


In [13]:
test_dataset2 = MultiEmoDataset(join(datadir2,test_filepath2))
df_test_multiemo = pd.DataFrame({"text":test_dataset2.X, "label": test_dataset2.Y})
df_test_multiemo

Unnamed: 0,text,label
0,I am not able to assess the doctor's expertise...,minus_m
1,"Great location, access to the fair is 10 minut...",plus_m
2,It was a wedding anniversary trip. The staff w...,plus_m
3,It's been a long time since I could see a psyc...,plus_m
4,A hotel just in time for a one-hour stay. The ...,plus_m
...,...,...
815,"""Diabetes mellitus is a disease that was consi...",zero
816,I give this opinion to the Doctor after his 3r...,minus_m
817,Hotel in the city centre close to the centre e...,amb
818,A very good specialist ` ` The rating has been...,plus_m


In [14]:
dataset2.class_proportion(), val_dataset2.class_proportion(), test_dataset2.class_proportion()

tensor([0.2774, 0.3757, 0.1477, 0.1992])
tensor([0.2868, 0.3694, 0.1555, 0.1883])
tensor([0.2768, 0.4134, 0.1439, 0.1659])


(tensor([0.2774, 0.3757, 0.1477, 0.1992]),
 tensor([0.2868, 0.3694, 0.1555, 0.1883]),
 tensor([0.2768, 0.4134, 0.1439, 0.1659]))

In [15]:
multiemo_class_quantity = int(dataset2.class_proportion().min()*len(dataset2))
multiemo_class_quantity

tensor([0.2774, 0.3757, 0.1477, 0.1992])


971

In [16]:
multiemo_short = df_multiemo.groupby('label').apply(lambda x: x.sample(multiemo_class_quantity)).reset_index(drop=True)
multiemo_short

Unnamed: 0,text,label
0,"She had a chance to spend New Year's Eve in "" ...",amb
1,The hotel is located a large piece from the ce...,amb
2,I rate the weekly stay in this hotel as good 4...,amb
3,"If cigarette smoke bothers you, do not choose ...",amb
4,The hotel is located in the heart of the city....,amb
...,...,...
3879,"During the conference entitled ""The future of ...",zero
3880,"Part of this research is part of the project ""...",zero
3881,"In connection with this jubilee, on Wednesday ...",zero
3882,"According to the university authorities, this ...",zero


In [17]:
full = pd.concat([df_multiemo, df_val_multiemo, df_test_multiemo], axis=0)
full

Unnamed: 0,text,label
0,"At the very entrance, the hotel stinks. There ...",minus_m
1,With my children (10 years old and 2 years old...,plus_m
2,I wanted to get 'em a set for basic work somet...,minus_m
3,Dear Mom. I have the same opinion about this b...,minus_m
4,"Actually, I didn't find out anything to em, du...",minus_m
...,...,...
815,"""Diabetes mellitus is a disease that was consi...",zero
816,I give this opinion to the Doctor after his 3r...,minus_m
817,Hotel in the city centre close to the centre e...,amb
818,A very good specialist ` ` The rating has been...,plus_m


In [20]:
lengths = full["text"].apply(lambda x: len(x.split(" ")))
lengths.mean(), lengths.std(), lengths.max()

(140.1794278758369, 79.58714959545522, 951)

In [19]:
# multiemo_short.to_csv(join(datadir2, "all.text.short_train.en.csv"), index=False)