# NWPUMayDayMcm2018
# 2018西工大五一数模

# 相关资源
1. 赛题 <http://lxy.nwpu.edu.cn/info/1353/12029.htm>![baidu](../BackupSource/Pics/baidu.jpg)
2. 本渣重构的代码&文档 <https://github.com/Alex-Beng/NwpuMayDayMcm2018>

# 赛题

1. 通过利用120章回中主要人物名称出现的频率的不同，能否证明不同章回之间作者的异同；
2. 通过利用120章回中你感兴趣词语的词频（比如虚词或者常用高频词的词频）的不同，能否证明不同章回之间作者的异同；
3. 通过对词与词之间的相关性进行分析，能否证明不同章回之间作者的异同；
4. 除了上述三种方法以外，你是否有其他方法（比如语义分析等）来分析不同章回之间作者的异同？请建立模型并说明理由。

# 分析
0. 显然是 NLP 问题；从数学建模的角度看，属于数据分析类题目
1. 主要人名→需要分词，并以人名词频为特征进行分类
2. 感兴趣词→分词，并以感兴趣词为特征进行分类
3. 词与词相关性→向量相似度？
4. 语义→word2vec & doc2vec
5. 将各个问题都需要用到的功能封装成sdk

# 代码结构
![code structure](../BackupSource/Pics/代码结构.png)

## IO相关
## 文件可参考 项目路径/Code/sdk/IO.py

In [5]:

class IO:
    """Thin helper around text-file and NumPy-array reading/writing."""

    def __init__(self):
        # Stateless helper: there is nothing to initialise.
        pass

    def ReadFile(self, file_path):
        """Return the content of a UTF-8 text file as a list of lines.

        The lines are returned exactly as ``readlines()`` yields them, i.e.
        with their line endings still attached.
        """
        with open(file_path, mode='r', encoding='utf-8') as handle:
            lines = handle.readlines()
        return lines

    def ReadFiles(self, file_path, perfix, min_idx, max_idx):
        """Read the numbered files ``<file_path>/<perfix>-<i>``.

        ``i`` runs from ``min_idx`` to ``max_idx`` inclusive. The result is a
        list holding one ``ReadFile`` result per index, in index order.
        """
        return [
            self.ReadFile('%s/%s-%d' % (file_path, perfix, idx))
            for idx in range(min_idx, max_idx + 1)
        ]

    def WriteFile(self, data, file_path):
        """Write the string ``data`` to ``file_path`` as UTF-8, replacing any old content."""
        with open(file_path, mode='w', encoding='utf-8') as handle:
            handle.write(data)

    def WriteFiles(self, data_list, file_path, perfix, min_idx, max_idx):
        """Write each item of ``data_list`` to ``<file_path>/<perfix>-<i>``.

        Items are paired with the indices ``min_idx..max_idx`` (inclusive);
        pairing stops as soon as either the indices or the items run out.
        """
        numbered_items = zip(range(min_idx, max_idx + 1), data_list)
        for idx, payload in numbered_items:
            target = "%s/%s-%d" % (file_path, perfix, idx)
            self.WriteFile(payload, target)

    def SaveArray(self, arrays, save_path):
        """Store ``arrays`` at ``save_path`` using ``np.save``."""
        # NOTE: np.save appends ".npy" when save_path lacks that suffix, so
        # ReadArray must then be given the name including ".npy".
        np.save(save_path, arrays)

    def ReadArray(self, save_path):
        """Load and return the array stored at ``save_path`` using ``np.load``."""
        loaded = np.load(save_path)
        return loaded

# Script entry point: the guard body is a deliberate no-op, so nothing runs
# when this file is executed directly.
if __name__ == "__main__":
    pass


## 文本预处理
## 文件可参考 项目路径/Code/sdk/TextProc.py

In [4]:
import jieba
import re

class TextProc():
    """Text pre-processing helpers: chapter splitting, cleaning and word segmentation."""

    # The patterns are compiled once here instead of on every call.
    # Raw strings are used so that `\s` is not parsed as an (invalid) string
    # escape; the `re` module itself resolves `\uXXXX`, `\r` and `\n`.
    _CHAPTER_HEADING = re.compile(r'第.*.回 ')
    _NOT_TEXT = re.compile(r"[^\u4e00-\u9fa5a-zA-Z0-9\s]")
    _LINE_BREAK = re.compile(r"[\r\n]")

    def __init__(self):
        # Stateless helper: there is nothing to initialise.
        pass

    def Divide2Chapter(self, raw_novel):
        """Split the raw novel text into chapters.

        The text is split at every chapter heading of the form "第xx回 ".
        The piece before the first heading (title, author, etc.) is not part
        of the story and is dropped.

        Args:
            raw_novel: The whole novel as one string.

        Returns:
            A list with the text following each heading; empty when no
            heading is found.
        """
        # NOTE(review): `.*` is greedy and `.` does not cross line breaks, so
        # within one line the match runs from the first "第" to the last "回 ".
        # This assumes a heading line holds no other "回 " - confirm on the corpus.
        divided_parts = self._CHAPTER_HEADING.split(raw_novel)
        del divided_parts[0]

        return divided_parts

    def RmPuntuation(self, raw_text):
        """Replace punctuation with spaces.

        The pattern lists what is kept (CJK characters in U+4E00..U+9FA5,
        ASCII letters, digits and whitespace) and negates it, so every other
        character is replaced by a single space.
        """
        return self._NOT_TEXT.sub(' ', raw_text)

    def RmEnter(self, raw_text):
        """Replace every carriage return and line feed with a space."""
        # Both characters are handled because line endings differ between
        # platforms.
        return self._LINE_BREAK.sub(' ', raw_text)

    def Divide2Word(self, raw_text):
        """Segment ``raw_text`` with jieba and return the words joined by spaces."""
        # cut_all=False turns jieba's "full mode" off.
        divided_list = jieba.cut(raw_text, cut_all=False)
        return ' '.join(divided_list)

## 特征工程
## 文件可参考 项目路径/Code/sdk/Feature.py

In [6]:
import random
import numpy as np
import logging
import gensim

class FeatureDraw:
    """Feature extraction: word-frequency counts and gensim embedding training."""

    def __init__(self):
        # Stateless helper: there is nothing to initialise.
        pass

    def CountWord(self, divided_text, chapter_idx):
        """Count how often each word occurs in a segmented text.

        Args:
            divided_text: Text whose words are separated by whitespace.
            chapter_idx: Not used by this method; kept so existing callers
                keep working.

        Returns:
            A list of ``(word, count)`` tuples sorted by descending count.
            Words with equal counts keep the order of their first occurrence.
        """
        # Function-scope import so the block does not depend on the file's
        # import section.
        from collections import Counter

        # most_common() sorts the items by count, descending, with a stable
        # sort - the same ordering the previous hand-written loop produced.
        return Counter(divided_text.split()).most_common()

    def Word2Vec(self, sentences, save_path):
        """Train a gensim Word2Vec model on ``sentences`` and save it to ``save_path``."""
        model = gensim.models.Word2Vec(sentences)
        model.save(save_path)

    def Doc2Vec(self, sentences, save_path, file_handler):
        """Train a gensim Doc2Vec model and save it to ``save_path``.

        Args:
            sentences: Not used; the training corpus is always built from
                ``file_handler``. Kept so existing callers keep working.
            save_path: Where the trained model is saved.
            file_handler: Source handed to ``TaggedLineDocument``
                (presumably a path or open file with one document per line -
                confirm against the gensim documentation).
        """
        # A separate local name makes it explicit that the `sentences`
        # argument is not what the model is trained on.
        tagged_docs = gensim.models.doc2vec.TaggedLineDocument(file_handler)
        model = gensim.models.Doc2Vec(tagged_docs)
        model.save(save_path)

class DataProcessing:
    """Dataset splitting and simple per-column standardisation helpers."""

    def __init__(self):
        # Stateless helper: there is nothing to initialise.
        pass

    def DivideDataset(self, data_list, testset_ratio = 0.25):
        """Randomly split ``data_list`` into a test set and a training set.

        Args:
            data_list: Sequence of hashable items.
            testset_ratio: Fraction of ``len(data_list)`` to put in the test
                set; the size is rounded up to the next integer.

        Returns:
            ``(testset, trainset)`` as two disjoint sets whose union is the
            set of distinct items in ``data_list``.
        """
        # Function-scope import so the block does not depend on the file's
        # import section.
        import math

        # Sets cannot hold duplicates, so sample from the distinct items.
        # dict.fromkeys keeps first-seen order, which keeps the split
        # reproducible under a fixed random seed.
        unique_items = list(dict.fromkeys(data_list))

        # The old rejection-sampling loop never terminated when fewer distinct
        # items existed than the requested size (duplicates, or ratio > 1).
        # Clamping the size to the valid range removes that hang.
        wanted = math.ceil(len(data_list) * testset_ratio)
        testset_size = max(0, min(wanted, len(unique_items)))

        testset = set(random.sample(unique_items, testset_size))
        trainset = set(unique_items) - testset

        return testset, trainset

    def ZeroCenterd(self, raw_vecs):
        """Subtract the per-column mean (``axis=0``) from ``raw_vecs`` and return it."""
        # NOTE: `-=` updates a NumPy array argument in place, so the caller's
        # array is modified as well as returned.
        raw_vecs -= np.mean(raw_vecs, axis=0)
        return raw_vecs

    def Normalized(self, raw_vecs):
        """Divide ``raw_vecs`` by its per-column standard deviation (``axis=0``) and return it."""
        # NOTE: `/=` updates a NumPy array argument in place. A column with
        # zero standard deviation is divided by zero here.
        raw_vecs /= np.std(raw_vecs, axis=0)
        return raw_vecs



## 分类分析
## 文件可参考 项目路径/Code/sdk/Classifier.py

In [7]:
import pickle
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans


class Classifier:
    """Train a scikit-learn classifier on a fixed train/test split and report its accuracy."""

    def __init__(self, train_data, labels):
        """Split the samples into a training part and a test part.

        Args:
            train_data: Feature matrix; the training part must expose
                ``.shape`` because it is printed below.
            labels: One label per row of ``train_data``.
        """
        # 70% of the samples are used for training; random_state=1 makes the
        # split reproducible between runs.
        (self.train_data, self.test_data,
         self.train_label, self.test_label) = train_test_split(
            train_data, labels, train_size=0.7, random_state=1)
        # self.train_data = scale(self.train_data)
        # self.test_data = scale(self.test_data)

        # Report the size of the split.
        print(len(self.test_label))
        print(len(self.train_label))
        print(self.train_data.shape)

    def Train(self, save_path):
        """Fit the classifier on the training part and pickle it to ``save_path``."""
        # Alternative estimators, kept for switching between experiments:
        # self.classifier = KMeans(n_clusters=6)
        # self.classifier = AdaBoostClassifier()
        # self.classifier = MLPClassifier(hidden_layer_sizes=(100, 100, 100))
        # self.classifier = GradientBoostingClassifier()
        # self.classifier = KNeighborsClassifier(n_neighbors=5)
        # self.classifier = RandomForestClassifier()
        # self.classifier = DecisionTreeClassifier()
        self.classifier = SVC(kernel='linear', decision_function_shape='ovr')
        # self.classifier = LinearSVC()

        self.classifier.fit(self.train_data, self.train_label)
        # Label-free fit, for the KMeans alternative above:
        # self.classifier.fit(self.train_data)

        # The context manager closes the file even if pickling fails.
        with open(save_path, 'wb') as f:
            pickle.dump(self.classifier, f)

    def GetAccuracy(self):
        """Print the classifier's score on the training part, then on the test part.

        ``Train`` must have been called first, since it creates
        ``self.classifier``.
        """
        print(self.classifier.score(self.train_data, self.train_label))
        print(self.classifier.score(self.test_data, self.test_label))

## 聚类分析
## 文件可参考 项目路径/Code/sdk/Cluster.py

In [8]:
import pickle
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

class Cluster:
    """K-means clustering of feature vectors plus a 2-D scatter plot of the result."""

    def __init__(self, scalars):
        # Feature vectors to cluster, one row per sample.
        self.scalars = scalars

    def Cluster(self):
        """Fit a 6-cluster KMeans on the vectors and store the labels in ``self.labels``."""
        model = KMeans(n_clusters=6)
        model.fit(self.scalars)
        self.labels = model.labels_

    def ToPics(self):
        """Scatter-plot every sample after reducing the vectors to two PCA components.

        The marker is 'x' for the rows with index 0..79 and 'o' for all later
        rows. The colour is yellow for cluster label 0 and red for every other
        label. ``Cluster`` must have been called first, since it creates
        ``self.labels``. The figure is only drawn here, not shown.
        """
        projected = PCA(n_components=2).fit_transform(self.scalars)
        for idx, point in enumerate(projected):
            marker = 'x' if idx <= 79 else 'o'
            colour = 'yellow' if self.labels[idx] == 0 else 'red'
            plt.scatter(point[0], point[1], marker=marker, c=colour)
                    

## 数据展示
## 文件可参考 项目路径/Code/sdk/DataPresentation.py

In [9]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


class DataPresentation:
    """PCA-based dimensionality reduction and plotting of feature vectors."""

    def __init__(self, scalars):
        # Feature vectors, one row per sample.
        self.scalars = scalars

    def LowDim(self, n_dim):
        """Return the vectors reduced to ``n_dim`` PCA components."""
        pca = PCA(n_components=n_dim)
        return pca.fit_transform(self.scalars)

    def VecPic(self, split_idx=80):
        """Show a 2-D PCA scatter plot of the vectors, split into two groups.

        Args:
            split_idx: Number of leading rows drawn with the 'x' marker; all
                remaining rows are drawn with 'o'. The default of 80 matches
                the former-80 / latter-40 grouping this method was written for.
        """
        pca = PCA(n_components=2)
        scalars = pca.fit_transform(self.scalars)

        # Slice ends are exclusive: the previous [0:79] and [80:119] slices
        # silently dropped rows 79 and 119 from the plot.
        former = scalars[:split_idx, :]
        latter = scalars[split_idx:, :]
        plt.scatter(former[:, 0], former[:, 1], marker='x')
        plt.scatter(latter[:, 0], latter[:, 1], marker='o')
        plt.show()

# 终于可以开始做问题了！！！

## 文件可参考 项目路径/Code/Solutions/One.py

# 接下来...
# ppt莫得做完...请代码发言八...
# 主讲已经佛了