In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from tqdm import tqdm_notebook
import os
import numpy as np
from datetime import datetime

In [4]:
# # For Google Colab
# from google.colab import drive
# drive.mount('/content/gdrive')
# path_to_storage = '/content/gdrive/My Drive/UCU-diploma/openNMT'

In [5]:
# !git clone https://github.com/OpenNMT/OpenNMT-py.git OpenNMT
# !cd ./OpenNMT    
# !pip install -r requirements.txt

In [6]:
# Output directories for this notebook; both are created right after
# create_dir is defined. The dataset-building cells write their text files
# to the same './data/opennmt/all_2' location.
dir_to_all = './data/opennmt/all_2'
dir_to_after = './data/opennmt/after'
def create_dir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Calling it on an already existing directory is a no-op, so the function
    is safe to re-run from any cell.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of
    # `if not os.path.exists(...): os.makedirs(...)`.
    os.makedirs(directory, exist_ok=True)

# Make sure both output directories exist before anything is written to them.
for _out_dir in (dir_to_all, dir_to_after):
    create_dir(_out_dir)

In [7]:
# Columns read from the broadcasts CSV. Only the keys are used (as `usecols`);
# the dtype values are NOT passed to read_csv, so pandas infers the dtypes.
dtypes_br = {
    'match_id': 'int64',
    'team1': 'object',
    'team2': 'object',
    'name': 'object',
    'match_time': 'int64',
    'type': 'category',
    'minute': 'int64',
    'content': 'object',
    'message_time':'int64'
}
broadcasts = pd.read_csv('data/ods_broadcasts_201905301157.csv',
                 header=0,
                 usecols=list(dtypes_br),
                 skipinitialspace=True,
                 skip_blank_lines=True,
                 encoding='utf-8')
# Drop rows without text FIRST, then coerce the remaining values to str.
# The previous `broadcasts.content.apply(str)` discarded its result (a no-op);
# had it been assigned before dropna, NaN would have become the string 'nan'
# and survived the filter. `.assign` returns a new frame, so no
# SettingWithCopyWarning is triggered on the filtered data.
broadcasts = (
    broadcasts
    .dropna(subset=['content'])
    .assign(content=lambda frame: frame['content'].astype(str))
)
# Columns read from the match-news CSV. Only the keys are consumed (as
# `usecols`); the dtype values are not handed to read_csv.
# NOTE(review): 'ctime' is listed as int64, yet time_type_news parses it with
# strptime("%Y-%m-%d %H:%M:%S"), i.e. as a string -- confirm the real dtype.
dtypes_ns = dict(
    id='int64',
    name='object',
    ctime='int64',
    body='object',
    match_id='int64',
)

# Load, drop news without a body, and exclude match 787015.
# NOTE(review): the reason for excluding match_id 787015 is not recorded in
# this notebook -- TODO document it.
news = (
    pd.read_csv(
        'data/ods_match_news.csv',
        header=0,
        usecols=dtypes_ns.keys(),
        skipinitialspace=True,
        skip_blank_lines=True,
        encoding='utf-8',
    )
    .dropna(subset=['body'])
    .loc[lambda frame: frame.match_id != 787015]
)

In [44]:
def time_type_news(one_news, one_broadcasts):
    """Tell whether a news item was created before or after the match time.

    Args:
        one_news: ``pd.Series`` with a ``ctime`` field holding a
            ``"%Y-%m-%d %H:%M:%S"`` formatted string.
        one_broadcasts: ``pd.Series`` with a ``match_time`` field holding a
            POSIX timestamp in seconds.

    Returns:
        ``'before'`` if the news time is strictly earlier than the match
        time, otherwise ``'after'`` (equal times count as ``'after'``).

    Raises:
        AssertionError: If either argument is not a ``pd.Series``.
        ValueError: If ``ctime`` does not match the expected format.
    """
    assert isinstance(one_news, pd.Series), "one_news should be Series"
    # The message previously named the wrong argument ("one_news").
    assert isinstance(one_broadcasts, pd.Series), "one_broadcasts should be Series"
    # fromtimestamp() yields a naive datetime in the machine's local timezone,
    # and strptime() yields a naive datetime as well.
    # NOTE(review): this comparison assumes `ctime` is expressed in that same
    # local timezone -- confirm against the data source.
    time_match = datetime.fromtimestamp(one_broadcasts.match_time)
    time_news = datetime.strptime(one_news.ctime, "%Y-%m-%d %H:%M:%S")
    return 'before' if time_news < time_match else 'after'

In [45]:
# Raw-string patterns: the previous "(&#(?:\d)*;)" literal relied on the
# invalid escape sequence "\d" in a normal string, which newer Python versions
# warn about. Compiling once also avoids re-parsing the pattern on every call.
_HTML_TAG_RE = re.compile(r"<.*?>")
_NUMERIC_ENTITY_RE = re.compile(r"&#\d*;")


def clean_news_text(text):
    """Strip markup and line breaks from a piece of text.

    Removes, in order: anything between ``<`` and the next ``>`` on the same
    line, numeric character references of the form ``&#123;``, and the
    characters ``\\n``, ``\\r`` and ``\\x96``.

    Line breaks are deleted rather than replaced with a space, so words
    separated only by a newline end up joined.

    Args:
        text: Input string.

    Returns:
        The cleaned string on a single line.
    """
    without_tags = _HTML_TAG_RE.sub("", text)
    without_entities = _NUMERIC_ENTITY_RE.sub("", without_tags)
    return without_entities.replace("\n", "").replace("\r", "").replace("\x96", "")

In [46]:
# Split on unique match ids (not on individual news rows): first hold out 10%
# as the test set, then take 11% of the remainder as the validation set.
# Both splits use a fixed random_state so they are reproducible.
_all_match_ids = news.match_id.unique()
X_train, X_test = train_test_split(_all_match_ids, test_size=0.1, random_state=42)
X_train, X_val = train_test_split(X_train, test_size=0.11, random_state=42)
for _label, _ids in (
    ("Len X_train: ", X_train),
    ("Len X_val: ", X_val),
    ("Len X_test: ", X_test),
):
    print(_label, len(_ids))

Len X_train:  6143
Len X_val:  760
Len X_test:  767


In [52]:
def _pick_target_news_idx(news_scope, one_broadcast):
    """Return the position (within ``news_scope``) of the shortest 'after' news.

    Length is measured on the cleaned body. Ties keep the earliest row.
    If no row is classified 'after' by ``time_type_news``, position 0 is
    returned, i.e. the first news row of the match is used as a fallback.
    NOTE(review): that fallback can select a pre-match news item -- confirm
    this is intended.
    """
    best_idx = 0
    best_len = np.inf
    for idx in range(len(news_scope)):
        candidate = news_scope.iloc[idx]
        if time_type_news(candidate, one_broadcast) != 'after':
            continue
        candidate_len = len(clean_news_text(candidate.body))
        if candidate_len < best_len:
            best_len = candidate_len
            best_idx = idx
    return best_idx


def create_dataset(X, type_ds, to_dir):
    """Write parallel source/target text files for the given match ids.

    For every match id one line is written to each file:
      * source: all broadcast ``content`` values of the match joined with
        spaces, lower-cased and cleaned;
      * target: the lower-cased, cleaned body of the news item chosen by
        ``_pick_target_news_idx``.

    Reads the module-level ``broadcasts`` and ``news`` frames.

    Match ids that have no broadcasts or no news are skipped and reported,
    so the two files always stay line-aligned. Previously a missing broadcast
    was only printed by a bare ``except`` and processing continued with an
    undefined or stale broadcast row.

    Args:
        X: Iterable of match ids.
        type_ds: One of ``'test'``, ``'train'``, ``'valid'``; used as the
            file-name prefix.
        to_dir: Output directory (created if missing).

    Raises:
        AssertionError: If ``type_ds`` is not one of the allowed values.
    """
    assert type_ds in ['test','train', 'valid']
    create_dir(to_dir)
    count_lines = 0
    skipped_ids = []
    src_file = "%s/%s_src.broad.txt"%(to_dir,type_ds)
    tgt_file = "%s/%s_tgt.news.txt"%(to_dir,type_ds)
    # Explicit encoding: the inputs are read as UTF-8, so write UTF-8 too
    # instead of depending on the platform default.
    with open(src_file, 'w', encoding='utf-8') as f_broad, \
            open(tgt_file, 'w', encoding='utf-8') as f_news:
        for match_id in tqdm_notebook(X):
            # Filter each frame once per match and reuse the result.
            match_broadcasts = broadcasts[broadcasts['match_id'] == match_id]
            news_scope = news[news.match_id == match_id]
            if match_broadcasts.empty or news_scope.empty:
                skipped_ids.append(match_id)
                continue

            text_br = clean_news_text(" ".join(match_broadcasts['content']).lower()) + "\n"
            # Any broadcast row of the match provides the match_time.
            target_idx = _pick_target_news_idx(news_scope, match_broadcasts.iloc[0])
            text_news = clean_news_text(news_scope.iloc[target_idx].body.lower()) + "\n"

            f_broad.write(text_br)
            f_news.write(text_news)
            count_lines += 1
    if skipped_ids:
        print("Skipped match ids (no broadcasts or no news): ", skipped_ids)
    print("Source file: ", src_file)
    print("Target file: ", tgt_file)
    print("Count lines: ",count_lines)

In [55]:
# Build the three splits into the same output directory, announcing each one.
for _title, _match_ids, _split in (
    ("Build train dataset", X_train, 'train'),
    ("Build test dataset", X_test, 'test'),
    ("Build valid dataset", X_val, 'valid'),
):
    print(_title)
    create_dataset(_match_ids, _split, "./data/opennmt/all_2")

Build train dataset


HBox(children=(IntProgress(value=0, max=6143), HTML(value='')))

Source file:  ./data/opennmt/all_2/train_src.broad.txt
Target file:  ./data/opennmt/all_2/train_tgt.news.txt
Count lines:  6143
Build test dataset


HBox(children=(IntProgress(value=0, max=767), HTML(value='')))

Source file:  ./data/opennmt/all_2/test_src.broad.txt
Target file:  ./data/opennmt/all_2/test_tgt.news.txt
Count lines:  767
Build valid dataset


HBox(children=(IntProgress(value=0, max=760), HTML(value='')))

Source file:  ./data/opennmt/all_2/valid_src.broad.txt
Target file:  ./data/opennmt/all_2/valid_tgt.news.txt
Count lines:  760


In [57]:
# Sanity check: each *_src / *_tgt pair must report the same number of lines,
# since the files are consumed as line-aligned parallel text.
!wc -l ./data/opennmt/all_2/train_src.broad.txt
!wc -l ./data/opennmt/all_2/train_tgt.news.txt
!wc -l ./data/opennmt/all_2/test_src.broad.txt
!wc -l ./data/opennmt/all_2/test_tgt.news.txt
!wc -l ./data/opennmt/all_2/valid_src.broad.txt
!wc -l ./data/opennmt/all_2/valid_tgt.news.txt

    6143 ./data/opennmt/all_2/train_src.broad.txt
    6143 ./data/opennmt/all_2/train_tgt.news.txt
     767 ./data/opennmt/all_2/test_src.broad.txt
     767 ./data/opennmt/all_2/test_tgt.news.txt
     760 ./data/opennmt/all_2/valid_src.broad.txt
     760 ./data/opennmt/all_2/valid_tgt.news.txt


In [59]:
# Directory that the preprocess command's -save_data prefix
# (./data/opennmt/data_2/data) points into; it must exist beforehand.
create_dir("./data/opennmt/data_2")

In [58]:
# Build the OpenNMT training shards and shared vocabulary from the text files
# written by create_dataset. Fixed: -train_src pointed at "./dafta/...", a typo
# for the "./data/..." path the train source file was actually written to.
# NOTE(review): this uses ./OpenNMT-py/ while the train/translate cells use
# ./OpenNMT/ -- confirm which checkout directory is intended.
! python ./OpenNMT-py/preprocess.py \
-train_src ./data/opennmt/all_2/train_src.broad.txt \
-train_tgt ./data/opennmt/all_2/train_tgt.news.txt \
-valid_src ./data/opennmt/all_2/valid_src.broad.txt \
-valid_tgt ./data/opennmt/all_2/valid_tgt.news.txt \
-save_data ./data/opennmt/data_2/data \
-share_vocab \
-dynamic_dict \
-src_vocab_size 50000 \
-report_every 1000


In [None]:
# Train the model with OpenNMT's train.py (copy attention enabled, fixed seed).
# The -data and -save_model paths point at a Google Drive mount under
# /content/gdrive, unlike the local ./data/opennmt/data_2 prefix used by the
# preprocess cell.
# NOTE(review): presumably the preprocessed data is copied to Drive before
# running this on Colab -- confirm. The script path is ./OpenNMT/ here but
# ./OpenNMT-py/ in the preprocess cell.
! python ./OpenNMT/train.py \
-data "/content/gdrive/My Drive/UCU-diploma/openNMT/opennmt/data_2/data" \
-save_model "/content/gdrive/My Drive/UCU-diploma/openNMT/opennmt/data_2/model" \
-share_embeddings \
-batch_type tokens \
-batch_size 64 \
-valid_batch_size 2 \
-copy_attn \
-global_attention mlp \
-word_vec_size 128 \
-rnn_size 512 \
-layers 1 \
-encoder_type brnn \
-max_grad_norm 2 \
-dropout 0. \
-optim adagrad \
-adagrad_accumulator_init 0.1 \
-reuse_copy_attn \
-copy_loss_by_seqlength \
-bridge \
-seed 42 \
-gpu_ranks 0 \
-save_checkpoint_steps 10000 \
-train_steps 100000 \
-learning_rate 0.001 \
-report_every 1000 \
-valid_steps 1000

In [None]:
# Generate predictions for the test source file with beam search and report
# ROUGE against the reference targets.
# NOTE(review): the model is loaded from ".../data_shared/" and the text files
# from ".../all/", whereas the cells in this notebook write to "all_2" and
# "data_2"; the -tgt file name also contains " (1)". Confirm these paths match
# the run being evaluated.
! python ./OpenNMT/translate.py \
-model "/content/gdrive/My Drive/UCU-diploma/openNMT/opennmt/data_shared/model_step_66000.pt" \
-src "/content/gdrive/My Drive/UCU-diploma/openNMT/opennmt/all/test_src.broad.txt" \
-tgt "/content/gdrive/My Drive/UCU-diploma/openNMT/opennmt/all/test_tgt (1).news.txt" \
-output "/content/gdrive/My Drive/UCU-diploma/openNMT/opennmt/all/pred.news.txt" \
-beam_size 10 \
-dynamic_dict \
-share_vocab \
-batch_size 2 \
-batch_type sents \
-gpu 0 \
-seed 42 \
--report_rouge



In [None]:
# Score predictions against the reference targets. The leading "!" is
# required: without it the line is parsed as Python and is a SyntaxError.
# NOTE(review): the translate cell writes "pred.news.txt", while this reads
# "pred_news.broad.txt" -- confirm the prediction file name.
!files2rouge ./data/opennmt/all/pred_news.broad.txt ./data/opennmt/all/test_tgt.news.txt