In [4]:
import copy
import os
import pickle
import pprint
import re
from datetime import datetime, timedelta
from functools import reduce

import jieba
import jieba.posseg as pseg
import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.cluster import DBSCAN, KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

Load Data

In [5]:
# Load the pickled news DataFrame. pickle can execute arbitrary code while
# loading, so only open files that come from a trusted source.
with open("news_all_with_event", "rb") as fp:
    a = pickle.load(fp).reset_index(drop=True)

# Keep each row's position (after the index reset) as an explicit column.
a["org_index"] = a.index
# Parse the timestamps once so later cells can do date arithmetic on them.
a["datetime"] = pd.to_datetime(a["datetime"])

# Plain-list views of the article text and the parsed timestamps.
contents = list(a["content"])
datetimes = list(a["datetime"])


In [2]:
# Validation
with open("valid_news_all_with_event","rb") as fp:
    a = pickle.load(fp)

a = a.reset_index(drop=True)
a["org_index"] = a.index
contents = list(a["content"])
a["datetime"] = pd.to_datetime(a["datetime"])
datetimes = list(a["datetime"])

Jieba function definitions

In [1]:
### Load the stop-word file into a list (one entry per line, whitespace stripped).
### The file is opened in a `with` block so the handle is always closed.
with open('stopwords.txt', "r") as stopwords_file:
    stopwords_list = [line.strip() for line in stopwords_file]

### Load the synonym file into a dict.
### Each line is tab-separated: the first field is the replacement word and
### every following field is a word that should be mapped to it.
syn_dict = {}
with open("syn.txt", "r") as f:
    for line in f:
        # Split each line once instead of re-splitting it for every word.
        replacement, *variants = line.strip("\n").split("\t")
        for word in variants:
            syn_dict[word] = replacement

### Load the dictionary jieba uses for Chinese word segmentation.
jieba.set_dictionary("dict.txt")

In [14]:
def stopwords(w):
    """Return ``w`` unless it is a stop word.

    ``w`` is looked up in the module-level ``stopwords_list``. A word found
    there yields ``None`` so that callers can drop it; any other word is
    returned unchanged.
    """
    return None if w in stopwords_list else w

def syn(w):
    """Return the replacement registered for ``w`` in ``syn_dict``.

    Words that have no entry in the module-level ``syn_dict`` are returned
    unchanged.
    """
    return syn_dict.get(w, w)

def cut(news):
    """Segment ``news`` with ``jieba.cut`` using ``cut_all=False`` and ``HMM=True``.

    Returns the iterable of tokens produced by ``jieba.cut`` without
    consuming it.
    """
    return jieba.cut(news, HMM=True, cut_all=False)

def regular(w):
    """Extract the runs of characters in the range U+4E00..U+9FA5 from ``w``.

    Returns a list with every maximal run, in order of appearance, or
    ``None`` when ``w`` contains no such character.
    """
    runs = re.findall('[\u4e00-\u9fa5]+', w)
    # An empty list is falsy, so "no match" collapses to None.
    return runs or None

In [15]:
def text_cleaning(paragraph):
    """Segment ``paragraph`` and return its cleaned words joined by spaces.

    A token produced by ``cut`` is kept only when all of the following hold:

    * it does not start with one of the listed Chinese numerals,
    * it is longer than one character,
    * ``regular`` finds at least one run of Chinese characters in it
      (only the first run is used),
    * that run is not rejected by ``stopwords``.

    Every kept word is passed through ``syn`` before it is emitted.

    Returns:
        A single string with the surviving words separated by one space
        (empty when nothing survives).
    """
    # str.startswith accepts a tuple of prefixes. The previous expression
    # `"一" or "三" or ...` evaluates to just "一", so only that one numeral
    # was ever tested.
    # NOTE(review): "二" is absent from the original list - confirm whether
    # that omission is intentional before adding it.
    numeral_prefixes = ("一", "三", "四", "五", "六", "七", "八", "九", "十")

    me_words = []
    for w in cut(paragraph):
        if len(w) <= 1 or w.startswith(numeral_prefixes):
            continue
        chinese_runs = regular(w)
        if chinese_runs is None:
            continue
        kept = stopwords(chinese_runs[0])
        if kept is not None:
            me_words.append(syn(kept))
    return " ".join(me_words)

In [16]:
def cleaned_news(news_list):
    """Return cleaned copies of the items in ``news_list`` that have a 'message'.

    The input is deep-copied first, so the caller's objects are never
    modified. Items without a ``'message'`` key are left out of the result;
    for the others the ``"message"`` value is replaced by the output of
    ``text_cleaning``.
    """
    result = []
    for item in copy.deepcopy(news_list):
        if 'message' not in item:
            continue
        item["message"] = text_cleaning(item["message"])
        result.append(item)
    return result

In [17]:
def noun_cleaned(content):
    """Return the cleaned nouns of ``content`` joined by spaces.

    ``content`` is tagged with ``pseg.cut`` and only tokens whose flag is
    exactly ``"n"`` are considered. Such a token is kept only when all of
    the following hold:

    * it does not start with one of the listed Chinese numerals,
    * it is longer than one character,
    * ``regular`` finds at least one run of Chinese characters in it
      (only the first run is used),
    * that run is not rejected by ``stopwords``.

    Every kept word is passed through ``syn`` before it is emitted.

    Returns:
        A single string with the surviving nouns separated by one space
        (empty when nothing survives).
    """
    # str.startswith accepts a tuple of prefixes. The previous expression
    # `"一" or "三" or ...` evaluates to just "一", so only that one numeral
    # was ever tested.
    # NOTE(review): "二" is absent from the original list - confirm whether
    # that omission is intentional before adding it.
    numeral_prefixes = ("一", "三", "四", "五", "六", "七", "八", "九", "十")

    me_words = []
    for word, flag in pseg.cut(content):
        if flag != "n":
            continue
        if len(word) <= 1 or word.startswith(numeral_prefixes):
            continue
        chinese_runs = regular(word)
        if chinese_runs is None:
            continue
        kept = stopwords(chinese_runs[0])
        if kept is not None:
            me_words.append(syn(kept))
    return " ".join(me_words)

Finding the important words (nouns) in every article

In [2]:
# Reduce every article to a space-separated string of its cleaned nouns.
# This reuses noun_cleaned() instead of repeating its body inline, so the
# filtering rules live in one place only.
news_important_noun = [noun_cleaned(content) for content in contents]

In [4]:
# Load the cached per-article noun strings from disk; this rebinds
# `news_important_noun`. pickle can execute arbitrary code while loading, so
# only open files that come from a trusted source.
with open('news_all_important_noun',"rb") as f:
    # Left over from writing the cache; it would need the file opened with "wb":
    # pickle.dump(news_important_noun,f)
    news_important_noun = pickle.load(f)


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Term counts: one row per article, one column per vocabulary term.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(news_important_noun)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name on older installations.
# Either way `words` ends up as a plain list of strings.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

# Re-weight the raw counts into TF-IDF scores.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
# print(tfidf)

Run DBSCAN on the TF-IDF matrix

In [8]:
# 1st DBSCAN: density clustering of the articles.
db = DBSCAN(
    eps=0.7,             # neighbourhood radius
    min_samples=3,       # points required in a neighbourhood to form a core point
    metric="euclidean",
    n_jobs=-1,           # use every available CPU core
)

In [None]:
# Fit the first-pass DBSCAN on the TF-IDF vectors. toarray() converts the
# TF-IDF matrix into a dense array before fitting.
# NOTE(review): the dense copy may need a lot of memory for a large corpus -
# confirm whether passing the sparse matrix directly would be acceptable.
db.fit(tfidf.toarray())

In [5]:
# Check how many articles fall into each cluster.
a["tag"] = pd.Series(db.labels_)
# Sorted so the output order is deterministic; -1 (DBSCAN noise) comes first.
labels = sorted(set(db.labels_))

# Iterate over the label values themselves. Indexing `labels` with a label
# value (labels[i]) only works when the list order happens to match the
# label values. The running total is not called `sum`, so the builtin stays
# usable in later cells.
total = 0
for label in labels:
    cluster_size = len(a[a["tag"] == label])
    total += cluster_size
    print(cluster_size)
print(total)
# Check how many clusters there are.
# print(labels)


In [4]:
# Second pass: inside every first-pass cluster, group the articles by
# publication time and store the result in the "subtag" column.
labels = sorted(set(db.labels_))
a["tag"] = pd.Series(db.labels_)
a["subtag"] = -1  # default for rows that never receive a sub-cluster label

for label in labels:
    if label == -1:
        continue  # -1 is DBSCAN noise, not a cluster

    # Select by label value; indexing `labels` with a label value (labels[i])
    # only works when the list order happens to match the label values.
    cluster_pd = a[a["tag"] == label]

    # Whole days elapsed since the first row of the cluster, as an (n, 1)
    # array. The reference timestamp is read once instead of being rebuilt
    # for every row.
    first_time = cluster_pd["datetime"].iloc[0]
    cluster_array = (
        (cluster_pd["datetime"] - first_time).dt.days.to_numpy().reshape(-1, 1)
    )

    # Shows every relative time in the cluster.
    cluster_time_list = cluster_array.tolist()
    print(cluster_time_list)

    # -----------------2nd DBSCAN----------------------------------------
    cluster_db = DBSCAN(n_jobs=-1, eps=3, min_samples=2)
    cluster_db.fit(cluster_array)

    # Write the sub-labels back by column name and row label. The previous
    # hard-coded column position (14) silently targets another column
    # whenever the frame's column layout differs.
    a.loc[cluster_pd.index, "subtag"] = cluster_db.labels_
    print("insert"+str(label)+"cluster subtag")


In [37]:
# Persist the DataFrame, including the "tag" and "subtag" columns added above,
# as a pickle file.
with open("news_with_event_eps0.7_with_tags_3days","wb")as f:
    pickle.dump(a,f)