# ライブラリの読み込み

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
from collections import Counter
import ipadic
import lightgbm as lgb
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import MeCab
import spacy

# データの読み込み

In [2]:
DATA_DIR = "../data/"

train_raw = pd.read_csv(f"{DATA_DIR}raw/train.csv")
train = train_raw.copy()
print(train.shape)
train.head(3)

(5256, 4)


Unnamed: 0,id,source,text,label
0,80074aa43,news4vip,まともに相手されてない人との関係なんて\nそんな大事にするものか？,0
1,6378fea6b,livejupiter,最近はアヘアヘQSマンやない？ ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0
2,c535f5613,livejupiter,日本人として生まれても無能な低学歴って分かったら日本人の権利剥奪して追放すべきやろ\n甘えるな,1


In [3]:
test_raw = pd.read_csv(f"{DATA_DIR}raw/test.csv")
test = test_raw.copy()
print(test.shape)
test.head(3)

(3223, 3)


Unnamed: 0,id,source,text
0,001026808,news4vip,上でも言ったけどオタクレベルの知識求めてる訳じゃない\nただ囲碁やります！って人が誰1人プロ...
1,00465ac96,livejupiter,たとえば、黒人なんかは、生物学的欠陥はないのに、文化的要因で、悪循環に陥り、実力をつけられず...
2,004674725,livejupiter,そうなんやろなあ色々と勿体ない感じしたわ\n終わり方と黒幕キャラは好きやったで\n\nちなワ...


# LightGBMの実装

## 形態素解析（MeCab）

In [4]:
df = pd.concat([train, test], axis=0)

# インデックスを振り直し
df = df.reset_index(drop=True)

In [5]:
print(df.shape)
df.head(3)

(8479, 4)


Unnamed: 0,id,source,text,label
0,80074aa43,news4vip,まともに相手されてない人との関係なんて\nそんな大事にするものか？,0.0
1,6378fea6b,livejupiter,最近はアヘアヘQSマンやない？ ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0.0
2,c535f5613,livejupiter,日本人として生まれても無能な低学歴って分かったら日本人の権利剥奪して追放すべきやろ\n甘えるな,1.0


In [6]:
df = df[["id", "text", "label"]].copy()

In [7]:
m = MeCab.Tagger("")
text_list = []
length_list = []

# それぞれの文書を取り出して形態素解析
for sentence in df["text"]:
    ma = m.parse(sentence)
    
    word_list = []
    
    # 形態素解析後の単語だけ抽出
    for text in ma.split("\n"):
        word_list.append(text.split("\t")[0])
    
    # 単語の数を集計
    length_list.append(len(word_list))
    
    # 単語の頻度を集計
    data = Counter(word_list)
    text_data = pd.DataFrame.from_dict(data, orient="index")
    text_list.append(text_data)

In [8]:
feature = pd.concat(text_list, axis=1)
#Nanを0に置換
feature = feature.fillna(0)
#Arrayに変換
feature_temp = feature.values.sum(axis=1)
#上位k件
K = 10000
#上位k件のインデックス
indices = np.argpartition(-feature_temp, K)[:K]

In [9]:
## 各文書に対して全体で頻出の上位k個の単語の出現数をその文書の単語出現数で割ったものを変数とする ##
modi_feature = []
for index, row in feature.iloc[indices].T.reset_index(drop=True).iterrows():
    modi_feature_temp = row/length_list[index]
    modi_feature.append(modi_feature_temp)

modi_feature = pd.concat(modi_feature, axis=1).T

# 各文書と作成した特徴量を結合
df = pd.concat([df, modi_feature], axis=1)

print(df.shape)
df.head(3)

(8479, 10003)


Unnamed: 0,id,text,label,憧れ,所々,相手,さ,れ,て,ない,...,ドアホ,撫で,現金,ﾜﾝｺ,あはれ,傷つけ,ははは,寝取ら,抑えれ,開花
0,80074aa43,まともに相手されてない人との関係なんて\nそんな大事にするものか？,0.0,0.0,0.0,0.047619,0.047619,0.047619,0.047619,0.047619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6378fea6b,最近はアヘアヘQSマンやない？ ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,c535f5613,日本人として生まれても無能な低学歴って分かったら日本人の権利剥奪して追放すべきやろ\n甘えるな,1.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
## Light gbm実装
df = df.drop(["id", "text"], axis=1)

df_train = df[df["label"].notnull()]
df_test = df[df["label"].isnull()]

df_train, df_val = train_test_split(df_train, test_size=0.2)

In [11]:
col = "label"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

test_x = df_test.drop(col, axis=1)

trains = lgb.Dataset(train_x.values, train_y)
valids = lgb.Dataset(val_x.values, val_y)

params = {
    "objective": "binary",
    "metrics": "binary_logloss"
}

model = lgb.train(params, trains, valid_sets=valids, num_boost_round=1000, early_stopping_rounds=100)



[LightGBM] [Info] Number of positive: 239, number of negative: 3965
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11046
[LightGBM] [Info] Number of data points in the train set: 4204, number of used features: 524
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.056851 -> initscore=-2.808798
[LightGBM] [Info] Start training from score -2.808798
[1]	valid_0's binary_logloss: 0.214636
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.201749
[3]	valid_0's binary_logloss: 0.194393
[4]	valid_0's binary_logloss: 0.189882
[5]	valid_0's binary_logloss: 0.185315
[6]	valid_0's binary_logloss: 0.182159
[7]	valid_0's binary_logloss: 0.179197
[8]	valid_0's binary_logloss: 0.176613
[9]	valid_0's binary_logloss: 0.175095
[10]	valid_0's binary_logloss: 0.173087
[11]	valid_0's binary_logloss: 0.170874
[12]	valid_0's binary_logloss: 0.169434
[13]	valid_0's 

# 予測の出力

In [12]:
##　予測：量的変数で返ってきているので0,1に振り分ける
threshold = 0.3

predict_list = []
for i in model.predict(val_x):
    if i > threshold:
        predict = 1
    else:
        predict = 0
    predict_list.append(predict)

f1_score(val_y, predict_list)

0.36363636363636365

In [13]:
# テストデータを予測する
y_pred = model.predict(test_x)

In [14]:
test["label_prob"] = y_pred

"""label付けを行う"""
test["label"] = 0
test.loc[test["label_prob"] > threshold, "label"] = 1

In [15]:
print(test[test["label"]==1].shape)
test[test["label"]==1]

(139, 5)


Unnamed: 0,id,source,text,label_prob,label
49,03266b65b,news4vip,そういう意味か\n女は平常運転が赤ちゃんプレイなんだなｗ,0.316671,1
54,03bc16d7d,livejupiter,お前ら朝鮮人はそういう揚げ足だけは得意だよなwwwwwwww自動変換で出ただけなんyが？え？...,0.331348,1
60,0414e8da6,news4vip,普通の日本人ならこう考えて当たり前なのに\nそういうこと言うあたりお前は在日朝鮮人か\nさっ...,0.437930,1
63,04310a7fb,newsplus,生協の食い物って\n韓国人の大腸菌入ってそうで食いたくないわ,0.525995,1
71,04962b6ef,livejupiter,韓国政府をギリシャ人辺りに任せよう\nあいつら暇やろ,0.338776,1
...,...,...,...,...,...
3054,f126bc335,newsplus,ノ\n\n5チョンにもちまちま周知活動するよ！,0.462347,1
3094,f50a641d4,news4vip,そりゃお前のような在日朝鮮人なら通じるだろうが、\nオレみたいなまともな日本人は朝鮮人などと...,0.529762,1
3101,f56c7a1dd,newsplus,まっとうな朝鮮人ならとっくに帰化済み。\n\n在日をやっているアホは、北朝鮮に叩き返すべき。,0.777657,1
3115,f695d1ea4,newsplus,在特会はここでさっさと在日チョンが立ちのくまでしつこくデモしろよ,0.488783,1


In [16]:
# 予測結果をcsvで出力
df_submit = test[["id", "label"]]
df_submit.to_csv(
    '../data/LightGBM/submit_LightGBM_03.csv',
    header=True, 
    index=False
    )