### 050 データの入手・整形 

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
publisher_opt = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Dail Mail']
index = ['id', 'title', 'url', 'publisher', 'category', 'story', 'hostname', 'timestamp']

df = pd.read_csv('../news_aggregator_data_set/newsCorpora.csv', sep='\t', names=index)
df = df[df['publisher'].isin(publisher_opt)]

train_x, test_x, train_y, test_y = train_test_split(df['title'], df['category'], test_size=0.2, random_state=1)
test_x, valid_x, test_y, valid_y = train_test_split(test_x, test_y, test_size=0.5, random_state=1)

train = pd.concat([train_x, train_y], axis=1)
test = pd.concat([test_x, test_y], axis=1)
valid = pd.concat([valid_x, valid_y], axis=1)

print('------------train------------')
print('business               :', (train['category'] == 'b').sum())
print('science and technology :', (train['category'] == 't').sum())
print('entertainment          :', (train['category'] == 'e').sum())
print('health                 :', (train['category'] == 'm').sum())
print('------------test------------')
print('business               :', (test['category'] == 'b').sum())
print('science and technology :', (test['category'] == 't').sum())
print('entertainment          :', (test['category'] == 'e').sum())
print('health                 :', (test['category'] == 'm').sum())
print('------------valid------------')
print('business               :', (valid['category'] == 'b').sum())
print('science and technology :', (valid['category'] == 't').sum())
print('entertainment          :', (valid['category'] == 'e').sum())
print('health                 :', (valid['category'] == 'm').sum())

------------train------------
business               : 4343
science and technology : 979
entertainment          : 3028
health                 : 518
------------test------------
business               : 547
science and technology : 106
entertainment          : 379
health                 : 77
------------valid------------
business               : 533
science and technology : 121
entertainment          : 398
health                 : 57


In [3]:
train.to_csv('../news_aggregator_data_set/train.txt', sep='\t')
test.to_csv('../news_aggregator_data_set/test.txt', sep='\t')
valid.to_csv('../news_aggregator_data_set/valid.txt', sep='\t')

### 051 特徴量抽出

In [4]:
train = pd.read_csv('../news_aggregator_data_set/train.txt', sep='\t', index_col=0)
test = pd.read_csv('../news_aggregator_data_set/test.txt', sep='\t', index_col=0)
valid = pd.read_csv('../news_aggregator_data_set/valid.txt', sep='\t', index_col=0)

In [5]:
import re
import string

def preprocessing(text: str):
  table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
  text = text.translate(table)
  text = text.lower()
  text = re.sub(r'[0-9]+', '0', text)
  return text

train['title'] = train['title'].map(lambda x: preprocessing(x))
test['title'] = test['title'].map(lambda x: preprocessing(x))
valid['title'] = valid['title'].map(lambda x: preprocessing(x))

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(train['title'])
test_x = vectorizer.transform(test['title'])
valid_x = vectorizer.transform(valid['title'])

In [7]:
train_x = pd.DataFrame(train_x.toarray(), columns=vectorizer.get_feature_names_out())
test_x = pd.DataFrame(test_x.toarray(), columns=vectorizer.get_feature_names_out())
valid_x = pd.DataFrame(valid_x.toarray(), columns=vectorizer.get_feature_names_out())
train_x

Unnamed: 0,0b,0bn,0c,0ct,0d,0f,0ff,0g,0headlines,0k,...,zoe,zombie,zombies,zone,zoosk,zpfa0mqti0qdrpfhqwjm,zynga,œf,œpiece,œwaist
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.446746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# train_x.to_csv('../news_aggregator_data_set/train.feature.txt', sep='\t')
# test_x.to_csv('../news_aggregator_data_set/test.feature.txt', sep='\t')
# valid_x.to_csv('../news_aggregator_data_set/valid.feature.txt', sep='\t')

### 052 学習

In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=1, max_iter=10000, n_jobs=-1)
model.fit(train_x, train_y)

### 053 予測

In [10]:
pred_y = model.predict(test_x)
test_y = test_y.tolist()

In [11]:
pd.DataFrame(model.predict_proba(test_x), columns=['b', 't', 'e', 'm'])

Unnamed: 0,b,t,e,m
0,0.936233,0.032987,0.011396,0.019384
1,0.887219,0.055766,0.024222,0.032792
2,0.092854,0.044273,0.025818,0.837055
3,0.825352,0.087370,0.037802,0.049476
4,0.739755,0.157955,0.037154,0.065136
...,...,...,...,...
1104,0.436474,0.200921,0.151126,0.211479
1105,0.835351,0.035786,0.031626,0.097236
1106,0.140185,0.776219,0.035115,0.048481
1107,0.384817,0.391132,0.123075,0.100976


### 054 正解率の計測

In [12]:
from sklearn.metrics import accuracy_score

print('accuracy :', accuracy_score(test_y, pred_y))

accuracy : 0.8890892696122633


### 055 混同行列の作成

In [13]:
from sklearn.metrics import confusion_matrix

confusion_matrix(test_y, pred_y)

array([[527,  17,   0,   3],
       [  4, 374,   0,   1],
       [ 21,  19,  35,   2],
       [ 39,  17,   0,  50]])

### 056 適合率、再現率、F1スコアの計測

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

labels = ['b', 't', 'e', 'm']

precision = precision_score(test_y, pred_y, labels=labels, average=None)
precision = np.append(precision, precision_score(test_y, pred_y, average='micro'))
precision = np.append(precision, precision_score(test_y, pred_y, average='macro'))

recall = recall_score(test_y, pred_y, labels=labels, average=None)
recall = np.append(recall, recall_score(test_y, pred_y, average='micro'))
recall = np.append(recall, recall_score(test_y, pred_y, average='macro'))

f1 = f1_score(test_y, pred_y, labels=labels, average=None)
f1 = np.append(f1, f1_score(test_y, pred_y, average='micro'))
f1 = np.append(f1, f1_score(test_y, pred_y, average='macro'))

pd.DataFrame({
  'precision': precision,
  'recall': recall,
  'f1': f1,
  }, index=['b', 't', 'e', 'm', 'micro average', 'macro avarage'])

Unnamed: 0,precision,recall,f1
b,0.891709,0.963437,0.926186
t,0.892857,0.471698,0.617284
e,0.875878,0.986807,0.92804
m,1.0,0.454545,0.625
micro average,0.889089,0.889089,0.889089
macro avarage,0.915111,0.719122,0.774127


### 057 特徴量の重みの確認

In [16]:
features = train_x.columns.values
index = [_ for _ in range(1,11)]

print(model.classes_)
display(pd.DataFrame(model.coef_))

for cla, coef in zip(model.classes_, model.coef_):
  print('category:',cla)
  best10 = pd.DataFrame(features[np.argsort(coef)[::-1][:10]], columns=['best'], index=index).T
  worst10 = pd.DataFrame(features[np.argsort(coef)[:10]], columns=['worst'], index=index).T
  result = pd.concat([best10, worst10], axis=0)
  display(result)

['b' 'e' 'm' 't']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10785,10786,10787,10788,10789,10790,10791,10792,10793,10794
0,0.3947,0.064382,-0.167349,0.097339,-0.447139,-0.059385,-0.028957,0.093696,-0.002064,0.169784,...,-0.101134,0.032333,0.097625,0.623645,0.148901,-0.008256,0.234707,-0.066414,-0.112081,-0.014555
1,-0.224573,-0.030451,-0.169893,-0.038611,-0.118305,-0.171528,0.058964,-0.013523,0.003911,0.012352,...,0.172249,-0.013648,-0.039883,-0.422255,-0.085794,0.015646,-0.129379,0.113072,0.15027,0.033705
2,-0.07448,-0.012451,-0.064189,-0.015689,0.136299,-0.027443,-0.015744,-0.011861,-0.000551,-0.065028,...,-0.027624,-0.008104,-0.011847,-0.167495,-0.020854,-0.002204,-0.038702,-0.016693,-0.014698,-0.00692
3,-0.095647,-0.02148,0.401431,-0.043039,0.429145,0.258356,-0.014263,-0.068312,-0.001296,-0.117109,...,-0.043491,-0.01058,-0.045895,-0.033895,-0.042253,-0.005186,-0.066626,-0.029965,-0.023491,-0.012231


category: b


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
best,fed,china,ecb,bank,stocks,euro,update,as,profit,oil
worst,study,ebola,the,video,apple,google,and,star,microsoft,facebook


category: e


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
best,chris,kardashian,miley,star,cyrus,paul,film,kim,movie,thrones
worst,update,us,google,says,china,gm,facebook,apple,study,ceo


category: m


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
best,ebola,study,drug,cancer,fda,mers,health,cdc,outbreak,virus
worst,gm,at,ceo,facebook,china,apple,fed,amazon,deal,as


category: t


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
best,google,apple,facebook,microsoft,climate,gm,tesla,mobile,fcc,comcast
worst,stocks,fed,drug,day,ecb,cancer,percent,bank,shares,obamacare
