

# 時系列ベクトル作成パイプラインの構築

# ライブラリ・関数の読み込み

In [1]:
# jupyter 関係
%matplotlib inline
%reload_ext autoreload

import os
import glob
import pickle
import logging
from collections import Counter
from multiprocessing import Pool

import pandas as pd
import numpy  as np
from tqdm import tqdm_notebook as tqdm

# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook page container to 85% of the browser window.
display(HTML("<style>.container { width:85% !important; }</style>"))

# Show every column when rendering a DataFrame (None = no limit).
# The fully-qualified key is used because the bare "max_columns" pattern
# matches more than one option on newer pandas releases and raises OptionError.
pd.set_option("display.max_columns", None)
# Show up to 200 characters per cell before truncating.
pd.set_option("display.max_colwidth", 200)

# Decimal display setting: IPython magic controlling float output precision (5 digits).
%precision 5
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(20190524)

In [2]:
# Pipeline configuration. Every setting can be overridden through an
# environment variable of the same name; the second argument is the default.

# Directory holding the raw daily tweet dumps.
DATA_PATH = os.getenv("DATA_PATH", "/mnt/NAS0CAC8A/collaborations/dentsuPR2019/raw_tweet/")
# MeCab dictionary directory.
MeCab_DICT_PATH=os.getenv("MeCab_DICT_PATH", "/usr/lib/mecab/dic/mecab-ipadic-neologd/")
# Name used for the tokenized-tweet output.
TXT_DATA_NAME=os.getenv("TXT_DATA_NAME", "tokenized_tweets")
# Root directory for all intermediate (preprocessed) artifacts.
PREPROCESSED_DATA_PATH = os.getenv("PREPROCESSED_DATA_PATH",
                                    "/mnt/NAS0CAC8A/k-syo/DW2V/preprocessed_data/")
# Number of worker processes used for parallel steps.
N_JOB = int(os.getenv("N_JOB", "3"))
# Word-frequency threshold. Read from the environment like the other settings
# (previously a hard-coded literal); the default is unchanged.
WORD_FREQ_MIN = int(os.getenv("WORD_FREQ_MIN", "35"))
# Root directory of the DW2V outputs.
DW2V_PATH = os.getenv("DW2V_PATH", "/mnt/NAS0CAC8A/k-syo/DW2V/")
# Directory containing the DW2V parameter JSON files.
PARAM_PATH = os.getenv("PARAM_PATH", "/home/k-syo/DynamicWordEmbedding/params/DW2V/")

In [3]:
# Export the configuration through the process environment (presumably read by
# the `core` modules and worker processes — verify against those modules).
# The values come from the constants defined above rather than repeated
# literals, so an externally supplied override is propagated instead of being
# silently replaced by the default.
os.environ["N_JOB"] = str(N_JOB)
os.environ["WORD_FREQ_MIN"] = str(WORD_FREQ_MIN)
os.environ["DATA_PATH"] = DATA_PATH
os.environ["MeCab_DICT_PATH"] = MeCab_DICT_PATH
os.environ["TXT_DATA_NAME"] = TXT_DATA_NAME
os.environ["PREPROCESSED_DATA_PATH"] = PREPROCESSED_DATA_PATH

# SECURITY: the Slack incoming-webhook URL is a credential and must not be
# stored in the notebook. Provide SLACK_URL through the environment before
# starting the kernel; the URL that used to be committed here should be revoked.
if not os.environ.get("SLACK_URL"):
    import warnings
    warnings.warn("SLACK_URL is not set; anything that posts to Slack may fail.")

In [4]:
# Logger: DEBUG-level job logger that writes to ./job.log.
LOGGER = logging.getLogger('JobLogging')
LOGGER.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s:%(lineno)d:%(levelname)s:%(message)s')
# logging.getLogger returns the same object on every call, so re-running this
# cell must reuse the existing file handler; adding a new one each time would
# write every message to job.log multiple times.
fh = next(
    (handler for handler in LOGGER.handlers
     if isinstance(handler, logging.FileHandler)
     and handler.baseFilename == os.path.abspath('job.log')),
    None,
)
if fh is None:
    fh = logging.FileHandler('job.log')
    LOGGER.addHandler(fh)
fh.setFormatter(formatter)
LOGGER.info("job start")

# ツイートデータの準備

In [None]:
# Preprocessing: run preprocess_one_day_tweet over the raw daily dumps in parallel.
# NOTE(review): `do_job` is not defined or imported in any visible cell —
# confirm where it comes from; on a fresh kernel this line raises NameError.
with do_job("preprocess tweet", LOGGER):
    from core.preprocess_tweet import preprocess_one_day_tweet

    TWEETS_PATHS = sorted(glob.glob(DATA_PATH+"alldata_20*"))
    # Skip the first 210 files — presumably already handled by an earlier run; TODO confirm.
    TWEETS_PATHS = TWEETS_PATHS[210:]

    # exist_ok=True removes the check-then-create race and makedirs also
    # creates a missing parent directory, which os.mkdir would fail on.
    os.makedirs(PREPROCESSED_DATA_PATH+"tokenized_tweets", exist_ok=True)

    with Pool(processes=N_JOB) as p:
        p.map(preprocess_one_day_tweet, TWEETS_PATHS)

In [16]:
# Gather the tokenized tweet files in lexicographic path order.
TWEETS_PATHS = sorted(glob.glob(PREPROCESSED_DATA_PATH+"tokenized_tweets/*"))

In [None]:
# Merge the per-day tweet pickles into one pickle per block of 7 consecutive files.
# The output directory is derived from PREPROCESSED_DATA_PATH so it always
# matches the location later cells glob ("concated_tweets/") when the root is
# overridden; with the default root this is the same path as before.
save_dir = PREPROCESSED_DATA_PATH + "concated_tweets/"
os.makedirs(save_dir, exist_ok=True)

# Only complete blocks are written: a trailing remainder of fewer than
# 7 files is left out by the integer division.
for i in range(len(TWEETS_PATHS) // 7):
    week_paths = TWEETS_PATHS[7*i:7*(i+1)]
    tweets = []
    for tweet_path in week_paths:
        # NOTE: pickle.load executes arbitrary code from the file — only use on trusted data.
        with open(tweet_path, mode="rb") as f:
            tweets.append(pickle.load(f))
    concated_tweet = pd.concat(tweets)
    # Name the output after the block's first file: the 10 characters that
    # precede the 7-character suffix (assumes names end in "YYYY-MM-DD.pickle" — TODO confirm).
    date = week_paths[0][-17:-7]
    with open(save_dir+date+".pickle", mode="wb") as f:
        pickle.dump(concated_tweet, f)

# 単語集合を求める

In [4]:
from core.make_DW2V import make_unique_word2idx

In [5]:
# Weekly (concatenated) tweet files. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to make downstream processing
# reproducible between runs.
TWEETS_PATHS = sorted(glob.glob(PREPROCESSED_DATA_PATH+"concated_tweets/*"))

In [8]:
# Pass the weekly tweet paths to make_unique_word2idx (judging by its name it
# builds the word -> index vocabulary; see core.make_DW2V to confirm).
make_unique_word2idx(TWEETS_PATHS)

# 単語の共起のカウント

In [5]:
from core.make_DW2V import make_whole_day_co_occ_dict
from core.make_DW2V import make_one_day_co_occ_dict

In [6]:
# Resume support: keep only the weekly files that have no co-occurrence
# result yet.
TWEETS_PATHS = glob.glob(PREPROCESSED_DATA_PATH+"concated_tweets/*")
print(len(TWEETS_PATHS))

# Drop the ones that are already finished.
# Files are identified by their name without extension. The previous
# fixed-width slice ([-19:-7]) only yielded the full name because Python clamps
# an out-of-range slice start; for longer names it would truncate and the
# rebuilt path below would point at a non-existent file.
all_date = sorted(os.path.splitext(os.path.basename(path))[0] for path in TWEETS_PATHS)

finished_tweets = glob.glob(PREPROCESSED_DATA_PATH+"co_occ_dict_word_count/*")
finished_date = [os.path.splitext(os.path.basename(path))[0] for path in finished_tweets]

# Set membership instead of scanning the list once per date.
finished_lookup = set(finished_date)
unfinished_date = [date for date in all_date if date not in finished_lookup]

TWEETS_PATHS = [PREPROCESSED_DATA_PATH+f"concated_tweets/{date}.pickle" for date in unfinished_date]
print(len(TWEETS_PATHS))

39
19


In [None]:
# Run the co-occurrence step on the remaining (unfinished) weekly files;
# results are presumably written under co_occ_dict_word_count/ — verify in core.make_DW2V.
make_whole_day_co_occ_dict(TWEETS_PATHS)

# 時系列ごとにPPMIを計算

In [5]:
from core.make_DW2V import make_whole_day_ppmi_list

In [6]:
# Resume support: keep only the weekly files that have no PPMI result yet and
# pair each with its co-occurrence dictionary.
TWEETS_PATHS = glob.glob(PREPROCESSED_DATA_PATH+"concated_tweets/*")
DICTS_PATHS = sorted(glob.glob(PREPROCESSED_DATA_PATH+"co_occ_dict_word_count/*"))
print(len(TWEETS_PATHS))

# Drop the ones that are already finished.
# Files are identified by their name without extension; the previous
# fixed-width slice ([-19:-7]) only worked because Python clamps an
# out-of-range slice start.
all_date = sorted(os.path.splitext(os.path.basename(path))[0] for path in TWEETS_PATHS)

finished_tweets = glob.glob(PREPROCESSED_DATA_PATH+"ppmi_list/*")
finished_date = [os.path.splitext(os.path.basename(path))[0] for path in finished_tweets]

finished_lookup = set(finished_date)
unfinished_date = [date for date in all_date if date not in finished_lookup]

# A week can only be processed if its co-occurrence dictionary exists.
# Previously the dictionary paths were fabricated without checking, so a
# missing file surfaced only as a failure inside the long-running job.
available_dict_date = {os.path.splitext(os.path.basename(path))[0] for path in DICTS_PATHS}
missing_dict_date = [date for date in unfinished_date if date not in available_dict_date]
if missing_dict_date:
    print(f"skipping {len(missing_dict_date)} file(s) without a co-occurrence dict: {missing_dict_date}")
    unfinished_date = [date for date in unfinished_date if date in available_dict_date]

TWEETS_PATHS = [PREPROCESSED_DATA_PATH+f"concated_tweets/{date}.pickle" for date in unfinished_date]
DICTS_PATHS = [PREPROCESSED_DATA_PATH+f"co_occ_dict_word_count/{date}.pickle" for date in unfinished_date]
print(len(TWEETS_PATHS))
print(len(DICTS_PATHS))

39
39
39


In [7]:
# Pair every weekly tweet file with the co-occurrence dictionary at the same position.
PATH_TUPLES = list(zip(TWEETS_PATHS, DICTS_PATHS))

In [None]:
# Run the PPMI step on each (tweet file, co-occurrence dict) pair;
# results are presumably written under ppmi_list/ — verify in core.make_DW2V.
make_whole_day_ppmi_list(PATH_TUPLES)

# PPMIから時系列embeddingを得る

In [None]:
from core.make_DW2V import make_DW2V

In [None]:
# Run make_DW2V with the parameter file params_0803.json located in PARAM_PATH.
make_DW2V(PARAM_PATH+"params_0803.json")