In [32]:
import numpy as np
import pandas as pd
import sklearn.naive_bayes
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_validate, GridSearchCV
from IPython.core.interactiveshell import InteractiveShell

# Data processing

## Function to load the data

In [8]:
# get the result of word segmentation
def get_data(file):
    """Load the word-segmentation result and the class labels for one workbook.

    Parameters
    ----------
    file : path of the Excel file. It is handed to the project helpers
        ``getting_words`` / ``getting_data`` and to ``pd.read_excel``.

    Returns
    -------
    seg : the segmentation result returned by ``getting_data``.
    y : values of the ``class`` column with the positions listed in the
        index returned by ``getting_data`` deleted (presumably the rows that
        were dropped during segmentation -- confirm against ``process``).
    """
    from process import getting_data, getting_words

    words = getting_words(filename=file)
    seg, dropped_idx = getting_data(file, words)

    # Remove the same positions from the labels so they stay aligned with seg.
    labels = pd.read_excel(file)['class'].values
    return seg, np.delete(labels, dropped_idx)

## Function for the TF-IDF transform

In [9]:
# calculate tf-idf weight
def data_process(seg):
    """Turn segmented texts into a TF-IDF representation.

    Parameters
    ----------
    seg : iterable of token lists, one list per text.

    Returns
    -------
    word_list : feature names learned by the ``CountVectorizer``.
    tfidf_weight : dense array; entry [i][j] is the TF-IDF weight of word j
        in text i.
    tfidf_matrix : the same weights as returned by ``TfidfTransformer``
        before densification.
    tf_vectorizer : the fitted ``CountVectorizer``.
    """
    from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

    # CountVectorizer expects one string per document, so re-join the tokens.
    documents = [' '.join(tokens) for tokens in seg]

    # Token counts: entry [i][j] is the number of occurrences of word j in text i.
    counter = CountVectorizer()
    count_matrix = counter.fit_transform(documents)

    # Re-weight the raw counts into a normalized TF-IDF matrix.
    weighted = TfidfTransformer().fit_transform(count_matrix)

    vocabulary = counter.get_feature_names_out()
    return vocabulary, weighted.toarray(), weighted, counter

## Split data

In [10]:
# split data into train and test sets
def split_data(X,y,test_per = 0.2):
    """Split features and labels into train and test parts.

    Parameters
    ----------
    X, y : features and labels, forwarded to ``train_test_split``.
    test_per : value passed as ``test_size`` (default 0.2).

    Returns
    -------
    (X_train, X_test, y_train, y_test), produced with ``random_state=42`` so
    the split is the same on every run.
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(X, y, test_size=test_per, random_state=42)
    # train_test_split yields a list; hand back a 4-tuple.
    return tuple(parts)

## Processing

In [11]:
# Load the segmented texts and their labels from the workbook, then preview
# the first five entries of each as the cell output.
seg, y = get_data('weibo_test.xlsx')
seg[:5], y[:5]

([['深圳', '疫情', '深圳'],
  ['深圳', '疫情', '提醒'],
  ['第一', '国家', '物质', '深圳', '疫情'],
  ['躲避', '深圳', '疫情', '结果', '自己', '上头', '市里'],
  ['深圳', '疫情', '停职', '绝望']],
 array([0, 0, 0, 1, 1], dtype=int64))

In [12]:
# Build TF-IDF features for every text, then hold out a test set using
# split_data's default test_per (0.2).
# NOTE(review): data_process fits the vocabulary and IDF weights on all texts
# before the split, so the test rows influence the features; consider fitting
# on the training texts only.
word_list, X, tfidf_matrix, tf_vectorizer = data_process(seg)
X_train, X_test, y_train, y_test = split_data(X,y)
X_train[:5,:]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Algorithm comparison

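* SVM and Gradient Boosting have the highest accuracy (0.6630 and 0.6458). Note that the Gradient Boosting figure is a mean cross-validation accuracy, not a held-out test accuracy.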

## Naive Bayes

In [15]:
# fit GaussianNB
def GNB(X_train, X_test, y_train, y_test):
    """Fit a Gaussian Naive Bayes classifier and evaluate it.

    Returns
    -------
    (score_train, score_test, y_pred): the value of ``score`` on the training
    data, the value of ``score`` on the test data, and the predictions for
    ``X_test``.
    """
    model = sklearn.naive_bayes.GaussianNB().fit(X_train, y_train)
    predictions = model.predict(X_test)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    return train_acc, test_acc, predictions

In [16]:
# fit CategoricalNB
def CNB(X_train, X_test, y_train, y_test):
    """Fit a Categorical Naive Bayes classifier and evaluate it.

    NOTE(review): scikit-learn documents CategoricalNB for categorically
    distributed (integer-encoded) features -- confirm it is appropriate for
    the inputs this notebook passes in.

    Returns
    -------
    (score_train, score_test, y_pred): the value of ``score`` on the training
    data, the value of ``score`` on the test data, and the predictions for
    ``X_test``.
    """
    model = sklearn.naive_bayes.CategoricalNB().fit(X_train, y_train)
    predictions = model.predict(X_test)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    return train_acc, test_acc, predictions

In [17]:
# fit MultinomialNB
def MNB(X_train, X_test, y_train, y_test):
    """Fit a Multinomial Naive Bayes classifier and evaluate it.

    Returns
    -------
    (score_train, score_test, y_pred): the value of ``score`` on the training
    data, the value of ``score`` on the test data, and the predictions for
    ``X_test``.
    """
    model = sklearn.naive_bayes.MultinomialNB().fit(X_train, y_train)
    predictions = model.predict(X_test)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    return train_acc, test_acc, predictions

## SVM

In [18]:
# different kernel
def SVM(X_train, X_test, y_train, y_test, k='rbf', c=None):
    """Fit a support vector classifier and evaluate it.

    Parameters
    ----------
    k : forwarded to ``SVC`` as ``kernel`` (default ``'rbf'``).
    c : forwarded to ``SVC`` as ``class_weight`` (default ``None``).

    Returns
    -------
    (score_train, score_test, y_pred): the value of ``score`` on the training
    data, the value of ``score`` on the test data, and the predictions for
    ``X_test``.
    """
    classifier = SVC(kernel=k, class_weight=c).fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    train_acc = classifier.score(X_train, y_train)
    test_acc = classifier.score(X_test, y_test)
    return train_acc, test_acc, predictions

## Boosting

In [19]:
# XGBoost
def XGB(X_train, X_test, y_train, y_test):
    """Fit an XGBoost classifier and predict the test set.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the model.
    X_test : features to predict.
    y_test : unused; kept so the signature matches the other model helpers.

    Returns
    -------
    The predictions of the fitted ``XGBClassifier`` for ``X_test``.
    """
    # The scikit-learn style wrapper is fitted on the arrays directly, so the
    # DMatrix objects that used to be built here were never used.
    model = xgb.XGBClassifier(learning_rate=0.5, n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    return model.predict(X_test)


In [20]:
# AdaBoost
def ada(X_train, X_test, y_train, y_test):
    """Fit an AdaBoost classifier and evaluate it on the test set.

    Returns
    -------
    (pred, score): the predictions for ``X_test`` and the value of ``score``
    on the test data. The classifier uses ``random_state=0``.
    """
    booster = AdaBoostClassifier(random_state=0).fit(X_train, y_train)
    predictions = booster.predict(X_test)
    return predictions, booster.score(X_test, y_test)

In [21]:
# Gradient Boosting
def gb(X_train, X_test, y_train, y_test):
    """Cross-validate a Gradient Boosting classifier on the training data.

    The model is evaluated with 5-fold ``cross_validate`` on
    ``X_train`` / ``y_train`` only; ``X_test`` and ``y_test`` are accepted but
    not used, so the "test" scores below are cross-validation scores.

    Returns
    -------
    (train_accuracy, test_accuracy, train_recall, test_recall, fit_time), each
    taken from the ``cross_validate`` result under the key of the same name.
    """
    estimator = GradientBoostingClassifier(n_estimators=100, learning_rate=0.2, random_state=0)
    cv_results = cross_validate(
        estimator,
        X_train,
        y_train,
        cv=5,
        scoring=('accuracy','recall'),
        return_train_score=True,
    )
    wanted = ('train_accuracy', 'test_accuracy', 'train_recall', 'test_recall', 'fit_time')
    return tuple(cv_results[key] for key in wanted)

## Algorithm comparison

In [27]:
# Naive Bayes
# Fit the three Naive Bayes variants on the same split and keep their
# train/test scores and test-set predictions.
gnb_train, gnb_test, gnb_y_pred = GNB(X_train, X_test, y_train, y_test)
cnb_train, cnb_test, cnb_y_pred = CNB(X_train, X_test, y_train, y_test)
mnb_train, mnb_test, mnb_y_pred = MNB(X_train, X_test, y_train, y_test)

# Report only the held-out test accuracy of each variant.
print("The Gaussian NB test accuracy is: %.4f" % (gnb_test))
print("The Categorical NB test accuracy is: %.4f" % (cnb_test))
print("The Multinomial NB test accuracy is: %.4f" % (mnb_test))

The Gaussian NB test accuracy is: 0.6413
The Categorical NB test accuracy is: 0.5870
The Multinomial NB test accuracy is: 0.6196


In [28]:
# SVM
# Polynomial kernel with class_weight='balanced'; report the held-out test accuracy.
svm_train, svm_test, svm_y_pred = SVM(X_train, X_test, y_train, y_test, k='poly', c='balanced')
print("Using poly kernel, the SVM test accuracy is: %.4f" % (svm_test))

Using poly kernel, the SVM test accuracy is: 0.6630


In [31]:
# Boosting
# XGBoost
# XGB returns predictions only, so its test accuracy is computed here.
bo_pred = XGB(X_train, X_test, y_train, y_test)
xgb_acc = accuracy_score(y_test, bo_pred)
# AdaBoost
pred, ada_acc = ada(X_train, X_test, y_train, y_test)
# Gradient Boosting
# gb() returns per-fold cross-validation arrays computed on the training data.
gb_train, gb_acc, train_recall, test_recall, time = gb(X_train, X_test, y_train, y_test)

# NOTE(review): the Gradient Boosting figure is the mean cross-validation
# accuracy, whereas the XGBoost and AdaBoost figures are held-out test
# accuracies, so the three numbers are not directly comparable.
print("XGBoost accuracy is: %.4f" % (xgb_acc))
print("AdaBoost accuracy is: %.4f" % (ada_acc))
print("Gradient Boosting accuracy is: %.4f" % (np.average(gb_acc)))

XGBoost accuracy is: 0.6196
AdaBoost accuracy is: 0.6087
Gradient Boost accuracy is: 0.6458


# Best Model selection

## Gradient Boosting

In [38]:
# Gradient Boosting Model Selection
# 5-fold grid search over n_estimators and learning_rate. Accuracy and recall
# are both scored, and the best model is refit according to accuracy.
param_gird = {'n_estimators':[10,50,100,150],'learning_rate':[0.1,0.2,0.5,0.7,1.0]}
gb_model = GradientBoostingClassifier(random_state=0)
gbcv = GridSearchCV(gb_model, param_gird, cv=5, scoring=('accuracy', 'recall'), refit='accuracy', return_train_score=True)
gbcv.fit(X_train, y_train)
# Evaluate the refit best estimator on the held-out test set.
y_pred = gbcv.best_estimator_.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("The best parameters: ", gbcv.best_params_)
print("The best test accuracy is: %.4f" % acc)

The best parameters:  {'learning_rate': 0.2, 'n_estimators': 50}
The best test accuracy is: 0.6413
