## Data preparation

In [1]:
# categories
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [2]:
# obtain the documents containing the categories provided
from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)

In [3]:
import pandas as pd

# my functions
import helpers.data_mining_helpers as dmh

# construct dataframe from a list
X = pd.DataFrame.from_records(dmh.format_rows(twenty_train), columns= ['text'])

In [4]:
X['category'] = twenty_train.target

In [5]:
X

Unnamed: 0,text,category
0,From: sd345@city.ac.uk (Michael Collier) Subje...,1
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar) ...,1
2,From: djohnson@cs.ucsd.edu (Darin Johnson) Sub...,3
3,From: s0612596@let.rug.nl (M.M. Zwart) Subject...,3
4,From: stanly@grok11.columbiasc.ncr.com (stanly...,3
...,...,...
2252,From: roos@Operoni.Helsinki.FI (Christophe Roo...,2
2253,From: mhollowa@ic.sunysb.edu (Michael Holloway...,2
2254,From: sasghm@theseus.unx.sas.com (Gary Merrill...,2
2255,From: Dan Wallach <dwallach@cs.berkeley.edu> S...,2


### 1.

在創建新label _"category_name"_ 的時候，我認為不用特別把這個寫成一個function，只需要用list去做index批配就好。

同時用在其他dataset時，也不會dmh寫死target_names而不能用。

In [6]:
# origin:
# X['category_name'] = X.category.apply(lambda t: dmh.format_labels(t, twenty_train))

# after:
# 直接將index寫成for迴圈，增加可讀性
X['category_name'] = [categories[i] for i in list(X['category'])]
X

Unnamed: 0,text,category,category_name
0,From: sd345@city.ac.uk (Michael Collier) Subje...,1,soc.religion.christian
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar) ...,1,soc.religion.christian
2,From: djohnson@cs.ucsd.edu (Darin Johnson) Sub...,3,sci.med
3,From: s0612596@let.rug.nl (M.M. Zwart) Subject...,3,sci.med
4,From: stanly@grok11.columbiasc.ncr.com (stanly...,3,sci.med
...,...,...,...
2252,From: roos@Operoni.Helsinki.FI (Christophe Roo...,2,comp.graphics
2253,From: mhollowa@ic.sunysb.edu (Michael Holloway...,2,comp.graphics
2254,From: sasghm@theseus.unx.sas.com (Gary Merrill...,2,comp.graphics
2255,From: Dan Wallach <dwallach@cs.berkeley.edu> S...,2,comp.graphics


### 2.
在算有多少個null時，不用跑for迴圈去一一檢查，可以直接用內建函式算出。

In [7]:
# origin:
# X.isnull().apply(lambda x: dmh.check_missing_values(x))

# def check_missing_values(row):
#     """ functions that check and verifies if there are missing values in dataframe """
#     counter = 0
#     for element in row:
#         if element == True:
#             counter+=1
#     return ("The amoung of missing records is: ", counter)

# after:
X.isnull().sum()

text             0
category         0
category_name    0
dtype: int64

### 3.
如果可以介紹drop_duplicates裡面不同參數的意義會更好

drop_duplicates(subset = None, keep = 'first', inplace = False)

subset: 設定只看哪一個column

keep可以有3個值: 

_First_: 只視第一個為唯一值

_Last_:  只視最後一個為唯一值 

_False_: 將所有相同的都當作複製值
      
inplace: 為True，完整地刪除那一行

### 4.
我們可以透過以下轉換，找出某個doc中，有出現過得字詞

In [8]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X.text) #learn the vocabulary and return document-term matrix

In [9]:
# 在doc0裡，先轉換成 1d-array，檢查有哪些是非0項

indexes = np.nonzero(X_counts[0].toarray()[0])
indexes = indexes[0].tolist()
indexes

[177,
 230,
 587,
 2326,
 3062,
 3166,
 4017,
 4378,
 4808,
 5195,
 5201,
 5285,
 8696,
 9031,
 9338,
 9801,
 9805,
 9932,
 12014,
 12051,
 12541,
 12833,
 14085,
 14281,
 14676,
 14887,
 15576,
 15837,
 16082,
 16881,
 16916,
 16927,
 17302,
 17366,
 17389,
 17556,
 18268,
 18474,
 19458,
 19780,
 20198,
 20253,
 20459,
 21661,
 23122,
 23610,
 23915,
 24651,
 24677,
 25337,
 25361,
 25663,
 26175,
 27836,
 28619,
 29022,
 30623,
 31077,
 31915,
 32116,
 32135,
 32142,
 32270,
 32391,
 32493,
 33256,
 33572,
 33597,
 33915,
 34755,
 34775,
 35312,
 35416]

In [10]:
# 將非0項的代表字詞輸出

[count_vect.get_feature_names()[i] for i in indexes]

['071',
 '0hb',
 '14',
 '477',
 '8000',
 '8565',
 'ac',
 'advance',
 'also',
 'any',
 'anyone',
 'application',
 'city',
 'collier',
 'computer',
 'convert',
 'converting',
 'correct',
 'do',
 'does',
 'ec1v',
 'email',
 'fax',
 'files',
 'format',
 'from',
 'good',
 'group',
 'hampton',
 'host',
 'hp',
 'hpgl',
 'iii',
 'images',
 'img',
 'in',
 'into',
 'is',
 'know',
 'laserjet',
 'like',
 'lines',
 'london',
 'michael',
 'nntp',
 'of',
 'organization',
 'pc',
 'pd',
 'please',
 'plotter',
 'posting',
 'programmer',
 'response',
 'same',
 'sd345',
 'standard',
 'subject',
 'tel',
 'tga',
 'thanks',
 'the',
 'this',
 'tif',
 'to',
 'uk',
 'unit',
 'university',
 'utility',
 'way',
 'we',
 'would',
 'x3769']