# imports

In [None]:
import spacy
import codecs
from spacy.lang.zh import Chinese
from gensim.models import Word2Vec
import jieba
import pandas as pd
import string
import numpy as np
import re

from collections import Counter
import random
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# Load pre-trained word vectors from the word2vec-format file 'wiki.zh.vec'.
# NOTE(review): presumably the fastText Chinese Wikipedia vectors — confirm the file's origin.
model = KeyedVectors.load_word2vec_format('wiki.zh.vec')
# scikit-learn imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report


In [None]:
# Comma-separated stop-word list read from disk; the `with` block guarantees
# the file handle is closed once the contents have been read.
# NOTE(review): `stopwords` is not referenced by any visible cell — token
# filtering in text_clean uses jieba_stop_words instead.
with codecs.open('stopwords-zh.txt', 'r', 'utf-8') as stopwords_file:
    stopwords = stopwords_file.read().split(',')


# Tokens removed by text_clean: function words (simplified and traditional
# forms), curly quotes, the full-width question mark, whitespace and '%'.
jieba_stop_words = [
    '的', '了', '和', '是', '就', '都', '而', '及', '與', 
    '著', '或', '一個', '沒有', '我們', '你們', '妳們', 
    '他們', '她們', '是否', '“','”','‘','’','？','\n',' ',
    '有','这','让','今年','目前','要','%'
]


In [None]:
def zng(paragraph):
    """Yield the sentences of ``paragraph`` in their original order.

    A sentence is a maximal run of characters other than ``!``, ``?``, ``。``
    and ``.``, optionally followed by a single closing character taken from
    ``!``, ``?``, ``。``, ``.``, ``’``, ``“`` or ``”``.  Quote characters are
    allowed inside the run, so a closing quote that follows a terminator
    becomes the first character of the next sentence.

    NOTE(review): the full-width ``！`` and ``？`` are not terminators here.

    Args:
        paragraph: Text to split.

    Yields:
        Each matched sentence as a string, terminator included when present.
    """
    # Raw string: the pattern value is unchanged, but `\.`, `\!` and `\?` are
    # no longer invalid escape sequences in a normal string literal.
    for sent in re.findall(r'[^!?。\.\!\?]+[!?。\.\!\?’“”]?', paragraph, flags=re.U):
        yield sent

#  Model(Logistic Regression)

In [None]:
# Single-step pipeline: one-vs-rest logistic regression fitted with the SAGA
# solver, a fixed random seed and a high iteration cap.
text_clf = Pipeline(steps=[
    (
        'clf',
        LogisticRegression(
            max_iter=10000,
            multi_class='ovr',
            solver='saga',
            random_state=1,
        ),
    ),
])

# Text Cleaner

In [None]:
from spacy.tokenizer import Tokenizer
# Blank Chinese spaCy pipeline; text_clean below calls it to tokenize documents.
nlp = Chinese()
# Add the 'sentencizer' component to the pipeline.
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# NOTE(review): `tokenizer` is not referenced by any visible cell.
tokenizer = Tokenizer(nlp.vocab)

# text = u"你要去那我也 我 去爸“”“。大哥你今天要去学校吗“”“。"
def text_clean(text):
    """Return ``text`` with every stop-word token removed.

    The text is tokenized by the module-level ``nlp`` pipeline.  Tokens whose
    text appears in ``jieba_stop_words`` are dropped, and the texts of the
    remaining tokens are concatenated with no separator.

    Args:
        text: Document to clean.

    Returns:
        The filtered document as a single string.
    """
    kept_tokens = [tok.text for tok in nlp(text) if tok.text not in jieba_stop_words]
    return ''.join(kept_tokens)



# Data

In [None]:
# Load only the three columns the notebook works with.
corpus_df = pd.read_csv(
    "chinese_news.csv",
    encoding='utf-8',
    usecols=['tag', 'content', 'headline'],
)
corpus_df.head()

In [None]:
# Number of rows loaded from the CSV.
len(corpus_df)

In [None]:

# Discard rows that are missing any of the fields used below.
corpus_df = corpus_df.dropna(subset=['tag', 'headline', 'content'])
# Model input: headline and article body joined by a single space.
corpus_df['text'] = corpus_df['headline'] + " " + corpus_df['content']
# DataFrame.drop returns a new frame, so the result must be assigned back for
# the source columns to actually be removed from corpus_df.
corpus_df = corpus_df.drop(columns=['headline', 'content'])
corpus_df.head()

In [None]:
# Replace every document with its stop-word-filtered form (see text_clean).
corpus_df['text'] = corpus_df['text'].apply(text_clean)
corpus_df.head()

In [None]:
# Partition the corpus by category tag; each frame keeps every column.
train_headline_df = corpus_df.loc[corpus_df['tag'] == '详细全文']
train_international_df = corpus_df.loc[corpus_df['tag'] == '国际']
train_domestic_df = corpus_df.loc[corpus_df['tag'] == '国内']

# Report how many articles carry each tag.
print(len(train_headline_df))
print(len(train_international_df))
print(len(train_domestic_df))

# Training Data

In [None]:
# The first 2000 rows of each category form the training pool; every slice
# is renumbered from 0 before the three are stacked.
headline_df = train_headline_df[0:2000].reset_index(drop=True)
international_df = train_international_df[0:2000].reset_index(drop=True)
domestic_df = train_domestic_df[0:2000].reset_index(drop=True)

train_df = pd.concat([headline_df, international_df, domestic_df])

In [None]:
# Per-category row counts of the training pool.
print(len(headline_df))
print(len(international_df))
print(len(domestic_df))

In [None]:
# Total number of rows in the combined training pool.
print(len(train_df))

# Test Data

In [None]:
# Rows 2000-2999 of each category form the held-out test set; every slice is
# renumbered from 0 before the three are stacked.
test_headline_df = train_headline_df[2000:3000].reset_index(drop=True)
test_international_df = train_international_df[2000:3000].reset_index(drop=True)
test_domestic_df = train_domestic_df[2000:3000].reset_index(drop=True)

test_df = pd.concat([test_headline_df, test_international_df, test_domestic_df])

In [None]:
# Per-category row counts of the test set, followed by the combined total.
print(len(test_headline_df))
print(len(test_international_df))
print(len(test_domestic_df))
print(len(test_df))

In [None]:
# Pull the text and label columns out as plain Python lists.
# NOTE(review): X_test / Y_test are not used by any visible cell.
X = train_df['text'].tolist()
Y = train_df['tag'].tolist()
X_test = test_df['text'].tolist()
Y_test = test_df['tag'].tolist()

In [None]:
# Number of training-pool documents.
print(len(X))

# Sentence Tokenizer using Regex

In [None]:
def zng(paragraph):
    """Yield the sentences of ``paragraph`` in their original order.

    A sentence is a maximal run of characters other than ``!``, ``?``, ``。``
    and ``.``, optionally followed by a single closing character taken from
    ``!``, ``?``, ``。``, ``.``, ``’``, ``“`` or ``”``.  Quote characters are
    allowed inside the run, so a closing quote that follows a terminator
    becomes the first character of the next sentence.

    NOTE(review): this repeats the ``zng`` defined in an earlier cell, and
    the full-width ``！`` and ``？`` are not terminators here.

    Args:
        paragraph: Text to split.

    Yields:
        Each matched sentence as a string, terminator included when present.
    """
    # Raw string: the pattern value is unchanged, but `\.`, `\!` and `\?` are
    # no longer invalid escape sequences in a normal string literal.
    for sent in re.findall(r'[^!?。\.\!\?]+[!?。\.\!\?’“”]?', paragraph, flags=re.U):
        yield sent
        

# get_vector (Hierarchical)

In [None]:
#write get_vector function
#needs to take in a string
#separate sentences into lists of strings
#vectorize list of strings
#take average of all vectors
#output stack of vectors as one vector
# NOTE(review): `ast` and `Doc` are not referenced by any visible cell.
import ast
from spacy.tokenizer import Tokenizer
from spacy.lang.zh import Chinese
from spacy.tokens import Doc
# Rebuild the blank Chinese pipeline with a 'sentencizer' component; this
# rebinds the module-level `nlp` and `tokenizer` names set in an earlier cell.
nlp = Chinese()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
tokenizer = Tokenizer(nlp.vocab)

# Sample text; no visible cell passes it to a function (the `text` parameter
# of get_vector_ap shadows this name inside the function).
text = u"我很喜欢广州。广州是我最喜欢的中国城市。三年前，我飞到广州见朋友了。我们一起过中秋节"
def get_vector_ap(text):
    """Return one document vector for ``text`` by hierarchical averaging.

    The text is split into sentences with ``zng``.  Each sentence vector is
    the mean of the embeddings (looked up in the module-level ``model``) of
    the sentence's in-vocabulary characters, and the document vector is the
    mean of the sentence vectors.

    Args:
        text: The document as a single string.

    Returns:
        A 1-D numpy array of length ``model.vector_size``.  Sentences with no
        in-vocabulary character are skipped; when no sentence contributes a
        vector (for example, empty text) a zero vector is returned.
    """
    sentence_vectors = []
    for sent in zng(text):
        # Iterating a str yields single characters, so the embedding lookup is
        # per character, not per segmented word.  Index the KeyedVectors
        # object directly: its `.wv` alias is not available in every gensim
        # release.
        char_vectors = [model[ch] for ch in sent if ch in model]
        if not char_vectors:
            # Averaging an empty list would yield NaN and poison the document
            # mean, so sentences without any known character are left out.
            continue
        sentence_vectors.append(np.mean(np.asarray(char_vectors), axis=0))

    if not sentence_vectors:
        # Nothing to average: fall back to a neutral all-zero vector.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(np.asarray(sentence_vectors), axis=0)
    


In [None]:
# Hold out 20% of the training pool for validation, stratified by label, with a fixed seed.
X_train, X_val, Y_train, Y_val  = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=882)


In [None]:
# Spot-check the first document and the first label of each split.
print(X_train[0])
print(X_val[0])
print(Y_train[0])
print(Y_val[0])
# 


In [None]:
# Sizes of the validation and training splits (documents, then labels).
print(len(X_val))
print(len(X_train))
print(len(Y_train))
print(len(Y_val))

# Get Vectors (Training Set)

In [None]:
# One document vector per training document; surrounding whitespace is
# stripped before vectorizing.
Vec_X_train = [get_vector_ap(document.strip()) for document in X_train]



In [None]:
# Number of training document vectors produced.
len(Vec_X_train)

In [None]:
# Fit the logistic-regression pipeline on the training document vectors.
text_clf.fit(Vec_X_train,Y_train)

# Spacy_ops Tokenizer

In [None]:
class spacy_ops_zh(object):
    """Callable tokenizer backed by a blank spaCy Chinese pipeline.

    An instance can be handed to anything that expects a
    ``str -> list of str`` tokenizer callable.
    """

    def __init__(self):
        """Build the Chinese pipeline and add a 'sentencizer' component."""
        pipeline = Chinese()
        pipeline.add_pipe(pipeline.create_pipe('sentencizer'))
        self.nlp = pipeline

    def __call__(self, some_text):
        """Return the token texts of ``some_text`` as a list of strings."""
        return [tok.text for tok in self.nlp(some_text)]

# TF-IDF Vectorizer

In [None]:


from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over token bigrams only (ngram_range=(2,2)), capped at 10 features,
# with tokenization delegated to the spaCy-backed spacy_ops_zh callable.
cv = TfidfVectorizer(
max_features = 10,
tokenizer = spacy_ops_zh(),
ngram_range = (2,2)

)
# NOTE(review): TfidfVectorizer.fit_transform appears to ignore its second
# argument, so passing Y_train would have no effect — confirm against the docs.
vectors = cv.fit_transform(X_train,Y_train)
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm which the installed version provides.
feature_names = cv.get_feature_names()
# Convert the TF-IDF matrix to nested lists so it can be shown as a DataFrame
# with one column per feature name.
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
print(len(feature_names))

In [None]:
# Preview the first rows of the TF-IDF feature table.
df.head()

In [None]:
# Show the feature names extracted by the TF-IDF vectorizer.
print(feature_names)

In [None]:
print("\n\nFEATURES\n\n")
# NOTE(review): text_clf is fitted on the averaged embedding vectors returned
# by get_vector_ap, so the indices printed below are embedding dimensions.
# They do not index into the TF-IDF `feature_names` list; clf_features is
# kept only so the name stays defined.
clf_features = feature_names

# Look up each category's coefficient row through classes_ instead of relying
# on the position the label happens to get when scikit-learn sorts the labels.
fitted_clf = text_clf.named_steps['clf']
class_rows = {label: row for row, label in enumerate(fitted_clf.classes_)}
clf_coeffs_headline = fitted_clf.coef_[class_rows['详细全文']]
clf_coeffs_domestic = fitted_clf.coef_[class_rows['国内']]
clf_coeffs_international = fitted_clf.coef_[class_rows['国际']]

# Indices of the k largest coefficients per category, largest first.
k = 25
highest_headline_features = clf_coeffs_headline.argsort()[-k:][::-1]
highest_domestic_features = clf_coeffs_domestic.argsort()[-k:][::-1]
highest_international_features = clf_coeffs_international.argsort()[-k:][::-1]

print(highest_headline_features)

print(highest_domestic_features)

print(highest_international_features)

# Get Vectors (Validation Set)

In [None]:
# One document vector per validation document; surrounding whitespace is
# stripped before vectorizing.
Vec_X_val = [get_vector_ap(document.strip()) for document in X_val]

In [None]:
# Predict a category label for every validation document vector.
predictions = text_clf.predict(Vec_X_val)

In [None]:
# Print scikit-learn's classification report for the validation predictions.
print(classification_report(y_true=Y_val, y_pred=predictions))