In [1]:
#Step 1: Import libraries
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from google.colab import drive
# Attach Google Drive to the runtime so the dataset CSVs can be read.
drive.mount('/content/drive')

# Label columns; each one is handled as a separate target further down.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Step 2: Read the data
# NOTE: these paths are relative, so they resolve against the current working directory.
train, test, test_label = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drive/My Drive/Dataset/train.csv',
        'drive/My Drive/Dataset/test.csv',
        'drive/My Drive/Dataset/test_labels.csv',
    )
)

# Comment text columns that get vectorized later.
train_text = train.loc[:, 'comment_text']
test_text = test.loc[:, 'comment_text']

# Preview the first rows of each text column (train first, then test).
for text_column in (train_text, test_text):
    print(text_column.head())

# Step 3: Feature extraction
# Word-level TF-IDF over unigrams, with the vocabulary capped at 10,000 terms.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,  # scale term frequency as 1 + log(tf)
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',  # a token is any run of one or more word characters
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)

# Learn the vocabulary/IDF weights and vectorize the training text in a single
# pass; calling fit() and then transform() on the same corpus tokenizes it twice.
train_word_features = word_vectorizer.fit_transform(train_text)
# The test text is only transformed, so the vocabulary comes from train alone.
test_word_features = word_vectorizer.transform(test_text)

X_train = train_word_features  # independent variables of the training set
X_test = test_word_features  # independent variables of the test set

y_train = train[class_names]  # dependent variables of the training set
y_test = test_label[class_names]  # dependent variables of the test set

print(y_train)
print(y_test)

# Step 4: Build the models + evaluate
# One independent logistic-regression classifier is trained per label column.

# Fixed seed: the 'sag' solver shuffles the data, so without a seed the CV
# scores and predicted probabilities change from run to run.
RANDOM_STATE = 42

scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=RANDOM_STATE)

    # Mean ROC AUC over 3 cross-validation folds on the training set.
    cv_score = np.mean(cross_val_score(classifier, X_train, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set, then keep column 1 of predict_proba:
    # the probability of the classifier's second class (the positive label,
    # assuming 0/1 targets).
    classifier.fit(X_train, train_target)
    submission[class_name] = classifier.predict_proba(X_test)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

print(submission[class_names])

# Pick, for every row, the class with the highest predicted probability.
# The columns are selected by name so that all six classes take part: the
# previous positional slice [1:6] covered only five of them and could never
# pick the last class ('identity_hate').
class_probabilities = submission[class_names].to_numpy()
# argmax returns the first maximum on ties, i.e. the earliest class in class_names.
labels = [class_names[best] for best in class_probabilities.argmax(axis=1)]

submission['label'] = labels
print(submission)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object
0    Yo bitch Ja Rule is more succesful then you'll...
1    == From RfC == \n\n The title is fine as it is...
2    " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3    :If you have a look back at the source, the in...
4            I don't anonymously edit articles at all.
Name: comment_text, dtype: object
        toxic  severe_toxic  obscene  threat  insult  identity_hate
0           0             0        0       0       0              0
1           0             0        0       0       0              0
2           0             0        0       0     