In [None]:
import pandas as pd
from pandas import Series,DataFrame

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import *
from janome.charfilter import *

import numpy as np #np.arangeを使う時
import codecs #辞書ファイルの読み込みの時

from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# データの読み込み

In [None]:
formated_customer_df = pd.read_csv('formated_customer_df.csv',
                                   index_col=0, usecols=[0,2,6,13,14],
                                   dtype='str')

In [None]:
'''
pre_customer_df = pd.read_csv('pre_customer_df.csv',
                              index_col=0, usecols=[0,2,6,13,14],
                              dtype='str')
'''

In [None]:
#pre_customer_df.reset_index(drop=True, inplace=True)

In [None]:
formated_customer_df

In [None]:
#pre_customer_df

# 形態素解析

In [None]:
t = Tokenizer()

%%time
tokens = []

for i in range(len(formated_customer_df)):
    token = t.tokenize(formated_customer_df['analysis_text'][i], stream=False)
    tokens.append(token)

In [None]:
#[token.base_form for token in tokens[197] if token.part_of_speech.split(',')[0] in ['動詞','名詞', '形容詞', '副詞', '形容動詞']]

# フィルターの実装

In [None]:
class NumericReplaceFilter(TokenFilter):
    """
    TokenFilter that rewrites every numeral noun to the placeholder '0'.

    A token is treated as a numeral noun when the first two comma-separated
    fields of its ``part_of_speech`` are '名詞' (noun) and '数' (number);
    per the original author's note this also covers kanji numerals.
    Matching tokens are modified in place; all tokens are passed through.
    """
    def apply(self, tokens):
        """Yield every token from ``tokens``, normalising numeral nouns first."""
        for tok in tokens:
            pos = tok.part_of_speech.split(',')
            is_numeral = pos[0] == '名詞' and pos[1] == '数'
            if is_numeral:
                # Overwrite both the written forms and the readings.
                tok.surface = tok.base_form = '0'
                tok.reading = tok.phonetic = 'ゼロ'
            yield tok

In [None]:
# Character-level clean-up applied to the raw text before tokenisation.
# The regex patterns are raw strings: sequences such as \w, \$ or \? are not
# valid Python string escapes and trigger an invalid-escape warning otherwise.
char_filters = [UnicodeNormalizeCharFilter(),
                RegexReplaceCharFilter(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', ''),  # remove URLs
                RegexReplaceCharFilter(r'\?', ''),  # remove ASCII question marks
                RegexReplaceCharFilter('？', ''),  # remove full-width question marks
                #RegexReplaceCharFilter('。', ''),
               ]
# Token-level filters.
token_filters = [NumericReplaceFilter(),  # replace numeral nouns (incl. kanji numerals) with 0
                 CompoundNounFilter(),  # merge consecutive nouns into one compound noun
                 POSKeepFilter(['名詞', '動詞', '形容詞', '副詞']),  # keep nouns, verbs, adjectives, adverbs ('助動詞' disabled)
                 LowerCaseFilter()]  # lower-case alphabetic characters

# Keyword arguments are accepted by both older and newer janome releases,
# whereas newer releases reject the positional form.
analyzer = Analyzer(char_filters=char_filters, tokenizer=t, token_filters=token_filters)

In [None]:
text5 = '時は内乱の最中である。秘密基地を発った反乱軍の宇宙船団が、邪悪な銀河帝国軍に対して初の勝利を収めた。この戦いで、反乱軍のスパイは帝国軍の究極兵器の設計図を奪取することに成功する。それはデス・スターと呼ばれる、惑星をも完全に破壊できる力を持った巨大宇宙ステーションだった。設計図を受け取ったレイア姫は、人々を救い、銀河系に平和を取り戻すべく、自船で故郷へと向かうが、帝国軍の密使に発見されてしまったのだった・・・'
for token in analyzer.analyze(text5):
    print(token)

%%time
atokens = []
for i in range(len(formated_customer_df)):
    atoken = analyzer.analyze(formated_customer_df['analysis_text'][i], stream=False)
    atokens.append(atoken)

In [None]:
%%time
atokens = []
for i in range(len(formated_customer_df)):
    atoken = analyzer.analyze(formated_customer_df['analysis_text'][i])#, stream=True)
    atokens.append(atoken)

formated_customer_df['analysis_text'][18]

In [None]:
print(formated_customer_df[formated_customer_df['report_definition_name'] == '既存顧客問い合わせ対応（メール）'].index)

In [None]:
formated_customer_df[formated_customer_df['report_definition_name'].str.endswith('既存顧客問い合わせ対応（メール）')]

In [None]:
number = 1024

In [None]:
#pre_customer_df['analysis_text'][number]

In [None]:
print([token.base_form for token in analyzer.analyze(formated_customer_df['analysis_text'][number])])

In [None]:
#print([token.surface for token in tokens[number]])

In [None]:
#[i.base_form for i in atokens[11]]
for temp in analyzer.analyze(formated_customer_df['analysis_text'][number]):
    print(temp)

# 辞書の読み込み定義

In [None]:
# pn_ja.dicファイルから、単語をキー、極性値を値とする辞書を得る
def load_pn_dict(path='./pn_ja.dic', encoding='shift_jis', threshold=0.7):
    """
    Build a word -> polarity label dictionary from a pn_ja.dic style file.

    Each line is expected to look like "良い:よい:形容詞:0.999995". The first
    colon-separated field becomes the key and the fourth field (a float) is
    collapsed into one of three labels:

        value >  threshold  ->  1
        value < -threshold  -> -1
        otherwise           ->  0

    When a word appears on several lines, the last line wins.

    Args:
        path: Location of the dictionary file.
        encoding: Text encoding of the file.
        threshold: Absolute polarity above which a word is labelled 1 / -1.

    Returns:
        dict mapping word (str) to label (int: 1, 0 or -1).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the fourth field of a line is not a number.
    """
    dic = {}

    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            columns = line.rstrip('\r\n').split(':')
            if len(columns) < 4:
                # Blank or truncated line (e.g. a trailing newline at the end
                # of the file): nothing to read, so skip it instead of crashing.
                continue

            polarity = float(columns[3])  # parsed once, compared twice
            if polarity > threshold:
                dic[columns[0]] = 1
            elif polarity < -threshold:
                dic[columns[0]] = -1
            else:
                dic[columns[0]] = 0

    return dic

In [None]:
def load_meisi_dict(path='./pn.csv.m3.120408.dic', encoding='utf-8'):
    """
    Build a word -> polarity label dictionary from a tab-separated file.

    The first tab-separated field of each line is the word and the second is
    a tag: 'p' maps to 1, 'n' maps to -1 and any other tag maps to 0.
    Further fields on the line are ignored.

    Args:
        path: Location of the dictionary file.
        encoding: Text encoding of the file.

    Returns:
        dict mapping word (str) to label (int: 1, 0 or -1).

    Raises:
        OSError: If the file cannot be opened.
    """
    labels = {'p': 1, 'n': -1}
    dic = {}

    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            columns = line.rstrip('\r\n').split("\t")
            if len(columns) < 2:
                # Blank line or line without a tag: skip rather than crash.
                continue
            # Strip the tag so a two-column line ("word\tp") is still
            # recognised instead of silently falling through to 0.
            dic[columns[0]] = labels.get(columns[1].strip(), 0)
    return dic

In [None]:
def load_yogen_dict(path='./wago.121808.dic', encoding='utf-8'):
    """
    Build an expression -> polarity label dictionary from a tab-separated file.

    The first tab-separated field of each line is a category label and the
    second is the expression. Surrounding whitespace and inner ASCII spaces
    are removed from the expression before it is used as the key. Lines whose
    label is "ネガ（評価）" or "ネガ（経験）" map to -1; every other label
    maps to 1.

    Args:
        path: Location of the dictionary file.
        encoding: Text encoding of the file.

    Returns:
        dict mapping expression (str) to label (int: 1 or -1).

    Raises:
        OSError: If the file cannot be opened.
    """
    negative_labels = ("ネガ（評価）", "ネガ（経験）")
    dic = {}

    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            columns = line.split("\t")
            if len(columns) < 2:
                # Blank line or line without a tab: skip rather than crash.
                continue
            word = columns[1].strip().replace(" ", "")
            dic[word] = -1 if columns[0] in negative_labels else 1
    return dic
            
            #"ポジ（評価）"　"ポジ（経験）"

 # 辞書の読み込み

In [None]:
# 感情極性対応表のロード
pn_dic = load_pn_dict()
meisi_dic = load_meisi_dict()
yogen_dic = load_yogen_dict()

In [None]:
pn_dic

# トークンにスコアを割り振る

In [None]:
def get_pn_scores(tokens, pn_dic):
    """
    Look up the base form of every token in ``pn_dic``.

    Args:
        tokens: Iterable of objects exposing a ``base_form`` attribute.
        pn_dic: Mapping from base form to polarity value.

    Returns:
        A ``(scores, score_dic)`` tuple. ``scores`` lists the polarity value
        of each token found in ``pn_dic``, in token order (repeats included).
        ``score_dic`` maps each distinct matched base form to its value.
    """
    matched = [
        (word, pn_dic[word])
        for word in [tok.base_form for tok in tokens]
        if word in pn_dic
    ]
    scores = [value for _, value in matched]
    score_dic = dict(matched)
    return scores, score_dic

text = '時は内乱の最中である。秘密基地を発った反乱軍の宇宙船団が、邪悪な銀河帝国軍に対して初の勝利を収めた。'\
    'この戦いで、反乱軍のスパイは帝国軍の究極兵器の設計図を奪取することに成功する。'\
    'それはデス・スターと呼ばれる、惑星をも完全に破壊できる力を持った巨大宇宙ステーションだった。'\
    '設計図を受け取ったレイア姫は、人々を救い、銀河系に平和を取り戻すべく、自船で故郷へと向かうが'\
    '帝国軍の密使に発見されてしまったのだった・・・'

#実装では複数のテキスト分かち書きを読み込む事前提の実装
pn_scores = []
pn_scores_dic = []

dics = [pn_dic, meisi_dic, yogen_dic]

for i, dic in enumerate(dics):
    a, b = get_pn_scores(analyzer.analyze(text), dic)
    pn_scores.append(a)
    pn_scores_dic.append(b)
    print(pn_scores_dic[i])

In [None]:
# トークンリストから極性値リストを得る
def get_pn_scores(tokens, dic):
    """
    Collect the polarity values of the tokens whose base form is in ``dic``.

    This definition has the same name as an earlier one in the notebook and
    replaces it once this cell has run.

    Args:
        tokens: Iterable of objects exposing a ``base_form`` attribute.
        dic: Mapping from base form to polarity value.

    Returns:
        A ``(scores, score_dic)`` tuple. ``scores`` lists the value of every
        matched token in token order; ``score_dic`` maps each distinct
        matched base form to its value.
    """
    scores, score_dic = [], {}
    for key in [token.base_form for token in tokens]:
        if key not in dic:
            continue
        value = dic[key]
        scores.append(value)
        score_dic[key] = value
    return scores, score_dic

In [None]:
%%time
pn_scores_Toko = []
pn_scores_Toko_dic = []
for i in range(len(formated_customer_df['analysis_text'])):
    #pn_scores_Toko.append(get_pn_scores(tokens[i], pn_dic))
    a, b = get_pn_scores(analyzer.analyze(formated_customer_df['analysis_text'][i]), pn_dic)
    pn_scores_Toko.append(a)
    pn_scores_Toko_dic.append(b)

In [None]:
# Score every text against meisi_dic: one list of hit values and one
# {base form: value} dict per row of 'analysis_text'.
# NOTE(review): [i] is a label lookup on the Series; this assumes the index is 0..n-1 — confirm.
pn_scores_meisi, pn_scores_meisi_dic = [], []
for i in range(len(formated_customer_df['analysis_text'])):
    text = formated_customer_df['analysis_text'][i]
    hits, hit_dic = get_pn_scores(analyzer.analyze(text), meisi_dic)
    pn_scores_meisi.append(hits)
    pn_scores_meisi_dic.append(hit_dic)

In [None]:
# Score every text against yogen_dic: one list of hit values and one
# {base form: value} dict per row of 'analysis_text'.
# NOTE(review): [i] is a label lookup on the Series; this assumes the index is 0..n-1 — confirm.
pn_scores_yogen, pn_scores_yogen_dic = [], []
for i in range(len(formated_customer_df['analysis_text'])):
    text = formated_customer_df['analysis_text'][i]
    hits, hit_dic = get_pn_scores(analyzer.analyze(text), yogen_dic)
    pn_scores_yogen.append(hits)
    pn_scores_yogen_dic.append(hit_dic)

In [None]:
print(pn_scores_Toko_dic[0])

In [None]:
print(pn_scores_Toko[0])

In [None]:
print(pn_scores_meisi_dic[0])

In [None]:
print(pn_scores_yogen_dic[0])

# スコアを算出する

In [None]:
# Mean hit value per text for the pn_dic scores; a text with no hits gets 0.
pn_score_Toko = [
    sum(hits) / len(hits) if len(hits) != 0 else 0
    for hits in pn_scores_Toko
]

In [None]:
# Mean hit value per text for the meisi_dic scores; a text with no hits gets 0.
pn_score_meisi = [
    sum(hits) / len(hits) if len(hits) != 0 else 0
    for hits in pn_scores_meisi
]

In [None]:
# Mean hit value per text for the yogen_dic scores; a text with no hits gets 0.
pn_score_yogen = [
    sum(hits) / len(hits) if len(hits) != 0 else 0
    for hits in pn_scores_yogen
]

In [None]:
len(pn_score_Toko)

In [None]:
len(pn_score_meisi)

In [None]:
len(pn_score_yogen)

# スコアをデータフレームにまとめる

In [None]:
score_df = pd.DataFrame(pn_score_Toko, columns=['pn_score_Toko'])

In [None]:
score_df['pn_score_meisi'] = pn_score_meisi

In [None]:
score_df['pn_score_yogen'] = pn_score_yogen

In [None]:
score_df.head()

In [None]:
formated_customer_df = formated_customer_df.join(score_df)

In [None]:
#pre_customer_df = pre_customer_df.join(score_df)

# 日時カラムをdayだけの型にする

In [None]:
formated_customer_df['created_at'] = pd.to_datetime(formated_customer_df['created_at'])
type(formated_customer_df.created_at[0])

In [None]:
formated_customer_df['created_at'] = formated_customer_df['created_at'].dt.date

In [None]:
#pre_customer_df['created_at'] = pd.to_datetime(pre_customer_df['created_at'])
#pe(pre_customer_df.created_at[0])

In [None]:
#pre_customer_df['created_at'] = pre_customer_df['created_at'].dt.date

# 問い合わせ件数をまとめる

In [None]:
freq_of_app_df = formated_customer_df.customer_name.value_counts().reset_index() #Series
freq_of_app_df.columns = ['customer_name', 'freq_of_app']

In [None]:
freq_of_app_df.head()

In [None]:
master_df = formated_customer_df.reset_index().merge(freq_of_app_df, on='customer_name', how='outer', sort=False).sort_values('index').set_index('index')

In [None]:
#master_df = pre_customer_df.reset_index().merge(freq_of_app_df, on='customer_name', how='outer', sort=False).sort_values('index').set_index('index')

pre_customer_df.reset_index().merge(freq_of_app_df, on='customer_name', how='outer', sort=False).sort_values('index').set_index('index')

# 東工大のスコアにオフセットを加える

In [None]:
# Offset for the pn_score_Toko column: the absolute value of its mean.
# The mean is computed once and reused instead of being recomputed for the
# print, the sign test and the assignment.
mean_toko = master_df['pn_score_Toko'].mean()
print(mean_toko)
offset_toko = abs(mean_toko)

In [None]:
offset_toko

In [None]:
master_df['pn_score_Toko_offset'] = master_df['pn_score_Toko'] + offset_toko

# スコアの重み付け

In [None]:
%%time
total_score = []
offset_num = abs(pd.DataFrame.from_dict(pn_dic, orient='index').mean()[0])

def offset(n):
    return n + offset_num

for i in tqdm(range(len(pn_scores_Toko))):
    offset_score = list(map(offset, np.array(pn_scores_Toko)))

for i in tqdm(range(len(pn_scores_Toko))):
    value = np.array([sum(pn_scores_Toko[i]), sum(pn_scores_meisi[i]), sum(pn_scores_yogen[i])])
    weight = np.array([len(pn_scores_Toko[i]), len(pn_scores_meisi[i]), len(pn_scores_yogen[i])])

    if sum(weight) == 0:
        total_score.append(0)
    else:
        wt_avg = (value * weight).sum() / weight.sum()
        total_score.append(wt_avg)

In [None]:
pd.DataFrame(total_score)

In [None]:
master_df['wt_avg_score'] = pd.DataFrame(total_score)

# スコアの合計値

In [None]:
#master_df['total_score'] = master_df['pn_score_Toko_offset'] + master_df['pn_score_meisi'] + master_df['pn_score_yogen']

# マスター表

In [None]:
master_df

In [None]:
master_df.describe()

In [None]:
sns.distplot(master_df['pn_score_Toko'], bins=100)

In [None]:
sns.distplot(master_df['pn_score_Toko_offset'], bins=100)

In [None]:
sns.distplot(master_df['pn_score_meisi'], bins=100)

In [None]:
sns.distplot(master_df['pn_score_yogen'], bins=100)

In [None]:
sns.distplot(master_df['wt_avg_score'], bins=100)

# のスコア

In [None]:
master_df[master_df.customer_name == ""].plot(x='created_at', y='pn_score_Toko', figsize=(10,4), linestyle='--', marker='o')

In [None]:
# pn_score_meisi over created_at for customers whose freq_of_app exceeds 90,
# one hue per customer, drawn without point markers.
# The former `line` and `subplots` keywords are not pointplot options and made
# the call fail; `alpha` is dropped as well because only seaborn releases that
# forward extra keywords to matplotlib accept it.
sns.pointplot(x='created_at', y='pn_score_meisi',
              data=master_df[master_df.freq_of_app > 90],
              hue='customer_name', markers='')

In [None]:
master_df.query('freq_of_app > 90').plot(subplots=True, x='created_at', y='pn_score_Toko', legend=True, figsize=(10,4))

# スコアの推移

In [None]:
companys = ["", "", ]
cols = ['created_at', 'wt_avg_score','customer_name', 'total_history']

# For each listed customer: take its rows in created_at order and add the
# running total of wt_avg_score as 'total_history'. The per-customer frames
# are collected in a list and concatenated once, instead of growing temp2_df
# inside the loop.
company_frames = []
for company in companys:
    temp_df = (master_df.loc[master_df.customer_name == company,
                             ['created_at', 'wt_avg_score', 'customer_name']]
               .sort_values('created_at')
               .reset_index(drop=True))
    # skipna=False keeps the behaviour of a plain running sum: once a missing
    # score is met, every later total is missing too.
    temp_df['total_history'] = temp_df['wt_avg_score'].cumsum(skipna=False)
    company_frames.append(temp_df)

if company_frames:
    temp2_df = pd.concat(company_frames, ignore_index=True)
else:
    # No customer listed: keep an empty frame with the expected columns.
    temp2_df = pd.DataFrame(index=[], columns=cols)

In [None]:
temp_df

In [None]:
temp2_df.groupby('customer_name').plot(x='created_at', y='total_history',figsize=(10, 4))

In [None]:
temp_df.plot(x='created_at', y=['total_history', 'wt_avg_score'], figsize=(10,4), linestyle='--', marker='o')

In [None]:
company_name = '' #部分一致可能

In [None]:
# Running total of wt_avg_score for the selected customer, in master_df row
# order. The customer's rows are selected once rather than once per iteration.
selected_scores = master_df.loc[master_df.customer_name == "", 'wt_avg_score']
# skipna=False: a missing score makes every later total missing, as a plain
# running sum would.
total_history = selected_scores.cumsum(skipna=False).tolist()
total = total_history[-1] if total_history else 0
X = np.arange(len(total_history))
plt.plot(X, total_history)

In [None]:
master_df[master_df['customer_name'].str.contains(company_name, na=False)]

In [None]:
def plot_totalhistory(customer_name):
    """
    Plot one customer's wt_avg_score and its running total against created_at.

    Rows of the notebook-level ``master_df`` whose ``customer_name`` equals
    ``customer_name`` exactly are sorted by ``created_at``; the running total
    of ``wt_avg_score`` is then computed in that same order, so every total
    lines up with the date it is plotted against.

    Args:
        customer_name: Exact value to match in ``master_df.customer_name``.

    Returns:
        None. The figure is drawn as a side effect.
    """
    # A stable sort keeps rows that share a created_at value in master_df order.
    temp_df = (master_df[master_df.customer_name == customer_name]
               .sort_values('created_at', kind='mergesort')
               .reset_index(drop=True))
    # Accumulate AFTER sorting. skipna=False mirrors a plain running sum: a
    # missing score makes every later total missing as well.
    temp_df['total_history'] = temp_df['wt_avg_score'].cumsum(skipna=False)
    temp_df.plot(x='created_at', y=['total_history', 'wt_avg_score'], figsize=(10,4), linestyle='--', marker='o')

In [None]:
plot_totalhistory(company_name)

# テキスト、スコアチェック

In [None]:
master_df

In [None]:
number = 340 #master_dfのindexを入力してshift + enter

In [None]:
master_df['analysis_text'][number]

In [None]:
print([token.base_form for token in analyzer.analyze(formated_customer_df['analysis_text'][number])])

In [None]:
print(pn_scores_Toko_dic[number])

In [None]:
print(pn_scores_meisi_dic[number])

In [None]:
print(pn_scores_yogen_dic[number])

# Matched words of the selected text with the scalar offset added to each value.
# offset_num is the scalar that offset() adds; offset_score is a per-text
# collection and cannot be added to a single value.
dic = {word: value + offset_num
       for word, value in pn_scores_Toko_dic[number].items()}

In [None]:
master_df[master_df['report_definition_name'].str.endswith('')]

In [None]:
master_df.to_csv(".csv")