In [2]:
import pandas as pd

In [51]:
data = pd.read_csv('./data/new_data.csv', encoding='utf8')
data.head()

Unnamed: 0,1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme,nl
0,1 millón de afectados ante las inundaciones en...,es
1,1 millón de fans en facebook antes del 14 de f...,es
2,1 satellite galileo sottoposto ai test presso ...,it
3,10 der welt sind bei,de
4,10 jaar voor overval op juwelier bejaard echtp...,nl


### 查看语种分布情况

In [52]:
data['nl'].value_counts()

es    1562
fr    1551
it    1539
en    1505
ch    1500
de    1479
nl    1429
Name: nl, dtype: int64

### 切分训练集和测试集

In [53]:
from sklearn.model_selection import train_test_split
X, y = data['1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme'], data['nl']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

### 文本表示模型构建

**词袋模型+n-gram模型**

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(
    lowercase=True,  # 英文文本全小写
    analyzer='char_wb',  # 逐个字母解析
    ngram_range=(1, 3),  # 1=出现的字母以及每个字母出现的次数，2=出现的连续2个字母，和连续2个字母出现的次数
    # trump images are now... => 1gram=t,r,u,m,p... 2gram=tr,ru,um,mp...
    max_features=1000,   # keep the most common 1000 ngrams
)
vec.fit(X_train)

CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [55]:
result = vec.transform(["I love machine learning.I think this is a magic field"])
result

<1x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 74 stored elements in Compressed Sparse Row format>

In [56]:
print(vec.get_feature_names())
result.toarray()

[' ', ' 1', ' 2', ' 20', ' 3', ' a', ' a ', ' ab', ' ac', ' af', ' al', ' am', ' an', ' ap', ' ar', ' as', ' at', ' au', ' av', ' b', ' ba', ' be', ' bi', ' bl', ' bo', ' br', ' bu', ' c', ' ca', ' ce', ' ch', ' ci', ' cl', ' co', ' cr', ' d', ' da', ' de', ' di', ' do', ' du', ' dé', ' e', ' e ', ' ee', ' eg', ' ei', ' el', ' en', ' er', ' es', ' et', ' f', ' fa', ' fe', ' fi', ' fo', ' fr', ' g', ' ga', ' ge', ' gi', ' go', ' gr', ' h', ' ha', ' he', ' hi', ' ho', ' i', ' i ', ' il', ' im', ' in', ' is', ' it', ' j', ' ja', ' je', ' jo', ' k', ' ka', ' ki', ' ko', ' l', ' la', ' le', ' li', ' lo', ' m', ' ma', ' me', ' mi', ' mo', ' mu', ' n', ' na', ' ne', ' ni', ' no', ' nu', ' o', ' of', ' on', ' op', ' ou', ' p', ' pa', ' pe', ' pi', ' pl', ' po', ' pr', ' pu', ' q', ' qu', ' r', ' ra', ' re', ' ri', ' ro', ' s', ' sa', ' sc', ' se', ' si', ' so', ' sp', ' st', ' su', ' t', ' ta', ' te', ' th', ' ti', ' to', ' tr', ' tu', ' tw', ' u', ' un', ' us', ' v', ' va', ' ve', ' vi', ' vo

array([[20,  0,  0,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  2,  1,  0,  0,  0,  1,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  2,  0,  1,  0,  1,  2,  2,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  2,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  4,  1,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0,  0,
         0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

### 构建分类模型
#### 朴素贝叶斯

In [57]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(X_train), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [58]:
classifier.score(vec.transform(X_test), y_test)

0.9909159727479182

### 清洗一下数据，看下效果

In [36]:
import re
def remove_noise(document):
    noise_pattern = re.compile("|".join(["http\S+", "\@\w+", "\#\w+", "\d+", "，", "！", "。", '“', '”', "？", "\.+"]))
    clean_text = re.sub(noise_pattern, "", document)
    
    return clean_text.strip()


print(remove_noise("123Trump images are now more popular than cat gifs. @trump #trends http://www.trumptrends.html"))
print(remove_noise("我爱你，中国！China520"))
    

Trump images are now more popular than cat gifs
我爱你中国China


In [41]:
# 清理中文中的英文字母
def clean_chinese(document):
    pattern = re.compile('[A-Za-z]')
    clean_text = re.sub(pattern, "", document)
    return clean_text.strip()
print(clean_chinese("我爱你，中国！China"))
data.loc[data['nl'] == 'ch', '1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme'] = data.loc[data['nl'] == 'ch', '1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme'].apply(lambda x: clean_chinese(x))

我爱你，中国！


In [44]:
vec = CountVectorizer(
    lowercase=True,  # 英文文本全小写
    analyzer='char_wb',  # 逐个字母解析
    ngram_range=(1, 3),  # 1=出现的字母以及每个字母出现的次数，2=出现的连续2个字母，和连续2个字母出现的次数
    # trump images are now... => 1gram=t,r,u,m,p... 2gram=tr,ru,um,mp...
    max_features=1000,   # keep the most common 1000 ngrams
    preprocessor = remove_noise
)
vec.fit(X_train)

CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 3),
        preprocessor=<function remove_noise at 0x1a182a2840>,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [47]:
from sklearn.naive_bayes import MultinomialNB
classifier2 = MultinomialNB()
classifier2.fit(vec.transform(X_train), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
classifier2.score(vec.transform(X_test), y_test)

0.9931869795609387

In [59]:
# 数据清洗之前的结果
print(classification_report(y_test, classifier.predict(vec.transform(X_test))))

              precision    recall  f1-score   support

          ch       1.00      0.98      0.99       329
          de       0.99      1.00      0.99       402
          en       0.98      0.99      0.98       365
          es       0.99      0.99      0.99       402
          fr       0.99      1.00      0.99       390
          it       0.99      0.99      0.99       384
          nl       0.99      0.99      0.99       370

   micro avg       0.99      0.99      0.99      2642
   macro avg       0.99      0.99      0.99      2642
weighted avg       0.99      0.99      0.99      2642



In [50]:
from sklearn.metrics import classification_report

# 数据清洗之后的效果
print(classification_report(y_test, classifier2.predict(vec.transform(X_test))))

              precision    recall  f1-score   support

          ch       1.00      1.00      1.00       329
          de       0.99      1.00      0.99       402
          en       0.99      0.99      0.99       365
          es       0.99      0.99      0.99       402
          fr       0.99      1.00      0.99       390
          it       0.99      0.99      0.99       384
          nl       0.99      0.99      0.99       370

   micro avg       0.99      0.99      0.99      2642
   macro avg       0.99      0.99      0.99      2642
weighted avg       0.99      0.99      0.99      2642

