In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

#Read the data
train_df = pd.read_csv('train.csv', delimiter="\t")
test_df = pd.read_csv('test.csv', delimiter="\t")
test_label_df = pd.read_csv('sample_submission.csv')

In [3]:
new_train_df = train_df['text']
train_df['label']

0       1
1       1
2       0
3       0
4       0
       ..
4982    0
4983    0
4984    0
4985    0
4986    0
Name: label, Length: 4987, dtype: object

In [4]:
#Get shape and head
new_test_df = test_df['text']
test_df.head()

Unnamed: 0,id,text
0,2,The 2017 Teen Choice Awards ceremony was held ...
1,3,"The concert, part of “The Joshua Tree Tour,” w..."
2,4,Selena Gomez refuses to talk to her mother abo...
3,5,This is worse than a lump of coal in your stoc...
4,6,Luann De Lesseps is going to rehab after her a...


In [5]:
#Get shape and head
test_label_df.head()

Unnamed: 0,id,label
0,2,1
1,3,1
2,4,0
3,5,0
4,6,0


In [6]:
#去除停頓詞stop words
#使用sklearn內建的功能
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer_rmsw = CountVectorizer(stop_words='english')
# new_train_df = vectorizer_rmsw.fit_transform(train_df['text'])
# print(vectorizer_rmsw.get_feature_names())
#print(new_train_df)

In [7]:
# vectorizer_rmsw_2 = CountVectorizer(stop_words='english')
# new_test_df = vectorizer_rmsw_2.fit_transform(test_df['text'])
# print(vectorizer_rmsw_2.get_feature_names())

In [8]:
# Change the labels
train_df.loc[(train_df['label'] == 'label') , ['label']] = '0'
train_df['label'] = pd.to_numeric(train_df['label'])
train_label = train_df['label']
test_label_df['label'] = pd.to_numeric(test_label_df['label'])
test_label = test_label_df['label']
#去除停頓詞stop words
#文字探勘前處理，將文字轉換成向量，方法為tf-idf
#Initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
#Fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(new_train_df)
tfidf_test = tfidf_vectorizer.transform(new_test_df)
print(type(train_label[0]))

<class 'numpy.int64'>


In [9]:
import xgboost as xgb
import sklearn.metrics as metrics
tfidf_train_weight = tfidf_train.toarray()  
tfidf_test_weight = tfidf_test.toarray()
# import xgboost as xgb
# xgb_params = {'eta': 0.3, 
#               'max_depth': 5, 
#               'subsample': 0.8, 
#               'colsample_bytree': 0.8, 
#               'objective': 'binary:logistic', 
#               'eval_metric': 'auc', 
#               'seed': 23
#              }
# d_train = xgb.DMatrix(tfidf_train, label = train_label)
# d_test = xgb.DMatrix(tfidf_test, label = test_label)

# #xgboost模型構建
# watchlist = [(d_test, 'valid')]
# xgb_model = xgb.train(xgb_params, d_train, 200, watchlist, verbose_eval=False, early_stopping_rounds=30)

#基於Scikit-learn接口的分類
#訓練模型
model = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=100, objective='binary:logistic')
model.fit(tfidf_train_weight, train_label)
y_predict = model.predict(tfidf_test_weight)

#模型預測
# y_predict = xgb_model.predict(d_test)  
confusion_matrix = metrics.confusion_matrix(test_label, y_predict)
df = pd.DataFrame(confusion_matrix)
print('準確率：', metrics.accuracy_score(test_label, y_predict))
print('confusion_matrix:', df)
print(metrics.classification_report(test_label, y_predict))

準確率： 0.5012028869286287
confusion_matrix:      0    1
0  437  193
1  429  188
              precision    recall  f1-score   support

           0       0.50      0.69      0.58       630
           1       0.49      0.30      0.38       617

    accuracy                           0.50      1247
   macro avg       0.50      0.50      0.48      1247
weighted avg       0.50      0.50      0.48      1247



In [10]:
!pip install lightgbm



In [11]:
import lightgbm as lgb
#創建成lgb特徵的數據集格式
lgb_train = lgb.Dataset(tfidf_train_weight, train_label)
lgb_test = lgb.Dataset(tfidf_test_weight, test_label, reference=lgb_train)
#建lightGBM模型
params = {'max_depth': 5, 'min_data_in_leaf': 20, 'num_leaves': 35,
          'learning_rate': 0.1, 'lambda_l1': 0.1, 'lambda_l2': 0.2,
          'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
#設置迭代次數，默認為100，通常設置為100+
num_boost_round = 1000
#訓練lightGBM模型
gbm = lgb.train(params, lgb_train, num_boost_round, verbose_eval=100, valid_sets=lgb_test)
#預測
y_pred = gbm.predict(tfidf_test_weight, num_iteration=gbm.best_iteration)
y_predict = np.argmax(y_pred, axis=1)  # 獲得最大概率對應的標籤
confusion_matrix = metrics.confusion_matrix(test_label, y_predict)
df = pd.DataFrame(confusion_matrix)
print('準確率：', metrics.accuracy_score(test_label, y_predict))
print(df)
print(metrics.classification_report(test_label, y_predict))

[100]	valid_0's multi_logloss: 0.978093
[200]	valid_0's multi_logloss: 1.08498
[300]	valid_0's multi_logloss: 1.16466
[400]	valid_0's multi_logloss: 1.24869
[500]	valid_0's multi_logloss: 1.32035
[600]	valid_0's multi_logloss: 1.39599
[700]	valid_0's multi_logloss: 1.46823
[800]	valid_0's multi_logloss: 1.53637
[900]	valid_0's multi_logloss: 1.60088
[1000]	valid_0's multi_logloss: 1.66687
準確率： 0.4963913392141139
     0    1
0  409  221
1  407  210
              precision    recall  f1-score   support

           0       0.50      0.65      0.57       630
           1       0.49      0.34      0.40       617

    accuracy                           0.50      1247
   macro avg       0.49      0.49      0.48      1247
weighted avg       0.49      0.50      0.48      1247



In [12]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
      max_depth=1, random_state=0)
clf.fit(tfidf_train_weight, train_label)
clf.predict(tfidf_test_weight)
clf.score(tfidf_test_weight, test_label)
confusion_matrix = metrics.confusion_matrix(test_label, y_predict)
df = pd.DataFrame(confusion_matrix)
print('準確率：', clf.score(tfidf_test_weight, test_label))
print(df)
print(metrics.classification_report(test_label, y_predict))

準確率： 0.49478748997594224
     0    1
0  409  221
1  407  210
              precision    recall  f1-score   support

           0       0.50      0.65      0.57       630
           1       0.49      0.34      0.40       617

    accuracy                           0.50      1247
   macro avg       0.49      0.49      0.48      1247
weighted avg       0.49      0.50      0.48      1247

