In [None]:
# import package
import pandas as pd
import numpy as np
import itertools
import nltk
from sklearn.data

In [None]:
class NBC():
    """Two-class token scorer in the style of a naive Bayes classifier.

    Two independent workflows share this class:

    * Mail workflow: ``input_data`` -> ``tokenizer`` -> ``classify``
      (optionally ``fit``), working on ``self.tokens_freq`` with the columns
      ``token``, ``spam`` and ``ham``.
    * N-gram workflow: ``data_transformer`` builds ``self.df``, a count table
      indexed by n-gram with one column per target label.
    """

    def __init__(self):
        # Raw training mails, each stored as a one-item dict {mail_class: text}.
        self.mails = []
        # Every distinct whitespace-separated token seen by tokenizer().
        self.tokens = set()
        # One row per token with its number of occurrences in each class.
        self.tokens_freq = pd.DataFrame([], columns=['token', 'spam', 'ham'], dtype='int32')

    def input_data(self, mail, mail_class):
        """Store one training mail as ``{mail_class: mail}``.

        ``tokenizer`` only knows the classes ``'spam'`` and ``'ham'``.
        """
        self.mails.append({mail_class: mail})

    def data_transformer(self, datas):
        """Build a per-target n-gram count table from a two-column frame.

        Args:
            datas: DataFrame whose first column holds the n-grams of each
                document (a comma-joined string or an already split
                sequence) and whose second column holds the target label.
                As before, the first column is replaced in place by lists.

        Returns:
            DataFrame indexed by n-gram (first-appearance order) with one
            count column per distinct target; also stored as ``self.df``.
        """
        from collections import Counter

        word_column = datas.columns[0]
        target_column = datas.columns[1]
        # Accept both comma-joined strings and lists, so that calling this
        # twice on the same frame no longer fails on the second call.
        datas[word_column] = [
            entry.split(',') if isinstance(entry, str) else list(entry)
            for entry in datas[word_column]
        ]
        total_ngram = itertools.chain.from_iterable(datas[word_column])
        # dict.fromkeys de-duplicates while keeping a reproducible order.
        unique_ngram = list(dict.fromkeys(total_ngram))
        result_df = pd.DataFrame(unique_ngram, columns=[word_column]).set_index(word_column)
        for target in datas[target_column].drop_duplicates().tolist():
            rows = datas.loc[datas[target_column] == target, word_column]
            counts = Counter(itertools.chain.from_iterable(rows))
            # A Counter yields 0 for n-grams that never occur with this target.
            result_df[target] = [counts[ngram] for ngram in unique_ngram]
        self.df = result_df
        return self.df

    def tokenizer(self):
        """Tokenize all stored mails and rebuild ``self.tokens_freq``.

        Mails are split on whitespace. Whole tokens are counted (not
        substrings of the raw text), and the table is rebuilt from scratch
        so repeated calls do not duplicate rows.

        Raises:
            KeyError: if a mail was stored with a class other than
                ``'spam'`` or ``'ham'``.
        """
        from collections import Counter

        class_counts = {'spam': Counter(), 'ham': Counter()}
        for mail in self.mails:
            (mail_class, text), = mail.items()
            words = text.split()
            self.tokens.update(words)
            class_counts[mail_class].update(words)
        # Build the frame in one go; DataFrame.append no longer exists in
        # pandas 2.x and was quadratic when called once per row.
        rows = [
            {
                'token': token,
                'spam': class_counts['spam'][token],
                'ham': class_counts['ham'][token],
            }
            for token in sorted(self.tokens)
        ]
        self.tokens_freq = pd.DataFrame(rows, columns=['token', 'spam', 'ham'])

    def fit(self, n, pos_col=None, neg_col=None, random_state=None):
        """Average a polarity ratio over ``n`` random 90% samples.

        For every sample the two count columns are normalised by their sum
        within the sample and divided (positive / negative). The mean of the
        per-sample ratios is stored in ``self.tokens_freq['avg_score']`` and
        the first 50 rows are printed.

        Args:
            n: number of resampling rounds.
            pos_col: numerator count column. Defaults to ``1`` when that
                column exists, otherwise ``'spam'``.
            neg_col: denominator count column. Defaults to ``-1`` when that
                column exists, otherwise ``'ham'``.
            random_state: optional seed for reproducible sampling; ``None``
                keeps pandas' default random source.

        Note:
            A token whose negative count is 0 in a sample gets an infinite
            (or NaN) ratio for that sample.
        """
        freq = self.tokens_freq
        if pos_col is None:
            pos_col = 1 if 1 in freq.columns else 'spam'
        if neg_col is None:
            neg_col = -1 if -1 in freq.columns else 'ham'
        # One shared generator so successive rounds draw different samples.
        rng = None if random_state is None else np.random.RandomState(random_state)

        scores = {}
        for i in range(n):
            # Random 90% of the rows.
            train = freq.sample(frac=0.9, random_state=rng)
            pos_share = train[pos_col] / train[pos_col].sum()
            neg_share = train[neg_col] / train[neg_col].sum()
            scores['score{}'.format(i)] = pos_share / neg_share
        # Aligning on the full index leaves NaN where a row was not sampled;
        # mean(axis=1) skips those.
        for_avg = pd.DataFrame(scores, index=freq.index)
        self.tokens_freq['avg_score'] = for_avg.mean(axis=1)
        print(self.tokens_freq.head(50))

    @staticmethod
    def _log_likelihood(counts):
        """Return add-one smoothed log relative frequencies of ``counts``."""
        counts = counts.astype(float)
        return np.log((counts + 1.0) / (counts.sum() + len(counts)))

    def classify(self, mail):
        """Return the spam score of ``mail`` in ``[0, 1]``.

        Log weights of the known tokens are summed per class (repeated
        tokens count repeatedly) and turned into
        ``exp(spam) / (exp(ham) + exp(spam))``. Precomputed ``wham`` /
        ``wspam`` columns in ``self.tokens_freq`` are used when present;
        otherwise the weights are add-one smoothed log frequencies derived
        from the ``ham`` / ``spam`` counts. Both classes get equal priors.

        Returns:
            float: 0.5 when the mail contains no known token.
        """
        freq = self.tokens_freq.set_index('token')
        known = [
            token for token in mail.split()
            if token in self.tokens and token in freq.index
        ]
        if not known:
            return 0.5
        if 'wham' in freq.columns and 'wspam' in freq.columns:
            w_ham = freq['wham'].astype(float)
            w_spam = freq['wspam'].astype(float)
        else:
            w_ham = self._log_likelihood(freq['ham'])
            w_spam = self._log_likelihood(freq['spam'])
        p_ham = float(w_ham.loc[known].sum())
        p_spam = float(w_spam.loc[known].sum())
        # exp(s) / (exp(h) + exp(s)) == exp(-log(1 + exp(h - s))); this form
        # does not underflow to 0/0 on long mails.
        return float(np.exp(-np.logaddexp(0.0, p_ham - p_spam)))


In [None]:
# Meeting-minutes data
# datetime is needed for the cut-off date below and is not imported elsewhere.
import datetime

test_data = pd.read_json(r'C:\Users\student\Desktop\newsdata\test_ngram_datas.json')
# Split each comma-joined 'ngram' string into a list of n-grams.
test_data['ngram'] = [ngrams.split(',') for ngrams in test_data['ngram']]
# Keep only the calendar date of every 'date' entry.
test_data['date'] = [stamp.date() for stamp in test_data['date']]
# Keep documents dated up to 2017-12-31. The copy makes the result an
# independent frame, so later column assignments do not act on a slice.
test_data = test_data[test_data['date'] <= datetime.date(2017, 12, 31)].copy()
# Requires a hawkish/dovish dictionary (a table with an 'avg_score' column).
# A zero denominator means that none of the n-grams is in either the hawkish
# or the dovish part of the dictionary; such documents get NaN so that they
# can be dropped afterwards.
def tone_sent(x, scores=None, hawkish_min=1.3, dovish_max=10 / 13):
    """Return the tone ``(hawkish - dovish) / (hawkish + dovish)`` of ``x``.

    Args:
        x: iterable of n-grams of one document.
        scores: table indexed by n-gram with an ``avg_score`` column.
            Defaults to the module-level ``ppp``.
        hawkish_min: n-grams scoring above this value count as hawkish.
        dovish_max: n-grams scoring below this value count as dovish.

    Returns:
        A value in ``[-1, 1]``, or ``np.nan`` when no n-gram is hawkish or
        dovish.
    """
    if scores is None:
        scores = ppp
    # Build both lookups once per call instead of once per n-gram.
    hawkish = scores[scores['avg_score'] > hawkish_min].index
    dovish = scores[scores['avg_score'] < dovish_max].index
    a = 0
    b = 0
    for ngram in x:
        if ngram in hawkish:
            a += 1
        elif ngram in dovish:
            b += 1
    if a + b == 0:
        return np.nan
    return (a - b) / (a + b)
# Score every document. tone_sent returns NaN when no n-gram hits either
# polarity list, and dropna removes those rows.
test_data['tone'] = [tone_sent(ngrams) for ngrams in test_data['ngram']]
test_data.dropna(inplace=True)
# A tone of exactly 0 is neutral: label it NaN so that it is dropped as well.
test_data['HD'] = [
    'H' if tone > 0 else 'D' if tone < 0 else np.nan
    for tone in test_data['tone']
]
test_data.dropna(inplace=True)
test_data['H'] = [1 if label == 'H' else 0 for label in test_data['HD']]
test_data['D'] = [1 if label == 'D' else 0 for label in test_data['HD']]
# Select the two indicator columns before summing, so the n-gram lists and
# the label strings are not aggregated as well.
final_tone = test_data.groupby('date')[['H', 'D']].sum()
final_tone['tone'] = (final_tone['H'] - final_tone['D']) / (final_tone['H'] + final_tone['D'])
sr_df = pd.read_json(r'C:\Users\student\Desktop\newsdata\rate\standard_rate.json').set_index('date')
# NOTE(review): this assignment aligns on the index. Confirm that sr_df's
# 'date' index holds the same kind of values as final_tone's index,
# otherwise 'rate' comes out as NaN.
final_tone['rate'] = sr_df['rate']
corr = final_tone[['tone', 'rate']].corr(method='pearson')
print(corr)
import matplotlib.pyplot as plt
plt.plot(final_tone['tone'])
def norm(x, rates=None):
    """Rescale ``x`` around the mean of ``rates`` into ``[-1, 1]``.

    Values above the mean are divided by ``max - mean`` and values at or
    below the mean by ``mean - min``, so the maximum maps to 1, the mean to
    0 and the minimum to -1.

    Args:
        x: the value to rescale.
        rates: Series supplying mean, max and min. Defaults to the
            module-level ``final_tone['rate']``.

    Returns:
        The rescaled value, or 0.0 when the relevant span is zero (no rate
        lies below the mean).
    """
    if rates is None:
        rates = final_tone['rate']
    mean = rates.mean()
    offset = x - mean
    span = rates.max() - mean if offset > 0 else mean - rates.min()
    if span == 0:
        return 0.0
    # The original computed this quotient but never returned it.
    return offset / span
# Apply norm() to every rate, store the result next to the tone series and
# draw it.
normalised_rates = [norm(rate) for rate in final_tone['rate']]
final_tone['norm_rate'] = normalised_rates
plt.plot(final_tone['norm_rate'])
plt.show()