In [20]:
# set file path
filepath = '../data/'
trainfile = 'train.csv'

In [21]:
# read train.csv
import pandas as pd
df = pd.read_csv(filepath+trainfile)

In [23]:
from sklearn.model_selection import train_test_split

df_train, df_test, y_train, y_test = train_test_split(
    df[['text']], df.target, stratify=df.target.values, random_state=0, test_size=0.1
)

In [4]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

stopwords = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

In [5]:
from collections import Counter
from itertools import chain, combinations
from tqdm import tqdm

class BitermConverter:
    
    def __init__(self, include_docs=False, min_df=1, max_df=1.0):
        self.biterms = Counter()
        self.include_docs = include_docs
        self.min_df = min_df
        self.max_df = max_df
        
    def fit_transform(self, docs):
        self.fit(docs)
        return self.transform(docs)
        
    def fit(self, docs):
        docs_add_biterm = []
        
        for doc in tqdm(docs):
            self.biterms.update(
                ['_'.join(sorted(biterm)) 
                 for biterm in combinations(doc.split(), 2) if biterm[0] != biterm[1]
                ]
            )
        
        sum_freq = sum(self.biterms.values())
        
        for b, f in list(self.biterms.items()):
            if f < self.min_df or f/sum_freq > self.max_df:
                del self.biterms[b]
    
    def transform(self, docs):
        docs_add_biterm = []
        
        if self.include_docs:
            get_doc = lambda doc, biterms: '{0} {1}'.format(doc, ' '.join(biterms))
        else:
            get_doc = lambda doc, biterms: ' '.join(biterms)
        
        
        for doc in tqdm(docs):
            biterms = [biterm for biterm in [
                '_'.join(sorted(biterm)) for biterm in combinations(doc.split(), 2) if biterm[0] != biterm[1]
            ] if biterm in self.biterms]
            
            docs_add_biterm.append(get_doc(doc, biterms))
            
        return docs_add_biterm

In [6]:
import re

def format_text(text):

    text=re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', '', text)
    text=re.sub('RT', '', text)
    text=re.sub(r'[!-/?:@]', '', text)#半角記号,数字,英字
    text=re.sub(r'[︰-＠]', '', text)#全角記号
    text=re.sub('\n', ' ', text)#改行文字

    return text

In [7]:
bc = BitermConverter(include_docs=True, min_df=10, max_df=1.0)

docs_train = [format_text(text) for text in df_train.text.values]
docs_train = [[word for word in word_tokenize(text) if word not in stopwords] for text in docs_train]
docs_train = [' '.join([wordnet_lemmatizer.lemmatize(word.lower()) for word in doc]) for doc in docs_train]
docs_train = bc.fit_transform(docs_train)

docs_test = [format_text(text) for text in df_test.text.values]
docs_test = [[word for word in word_tokenize(text) if word not in stopwords] for text in docs_test]
docs_test = [' '.join([wordnet_lemmatizer.lemmatize(word.lower()) for word in doc]) for doc in docs_test]
docs_test = bc.transform(docs_test)
len(bc.biterms)

100%|██████████| 6851/6851 [00:00<00:00, 12837.52it/s]
100%|██████████| 6851/6851 [00:00<00:00, 10153.48it/s]
100%|██████████| 762/762 [00:00<00:00, 13402.24it/s]


2384

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=5)
X_train = cv.fit_transform(docs_train)
X_test = cv.transform(docs_test)
X_train.shape, X_test.shape

((6851, 4713), (762, 4713))

In [12]:
X_train.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [17]:
df = pd.DataFrame(X_train.todense(), columns=['text_%s' % column for column in cv.get_feature_names()])
pd.concat([df_train.id, df, df_train.target], axis=1)

Unnamed: 0,text_05,text_05_august,text_05_issued,text_05_nw,text_05_severe,text_06,text_10,text_100,text_1000,text_11,...,text_ûªm,text_ûªs,text_ûªs_crash,text_ûªs_market,text_ûªs_stock,text_ûªt,text_ûªve,text_ûïwhen,text_ûò,text_ûó
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6848,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [225]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)
y_pred = naive_bayes.predict(X_test)
y_prob = naive_bayes.predict_proba(X_test)

In [226]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score

print('f1:{}'.format(f1_score(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))

f1:0.7557117750439368
[[408  27]
 [112 215]]


```
f1:0.7316017316017316
[[320  37]
[ 87 169]]
```

In [224]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.94      0.85       435
           1       0.89      0.66      0.76       327

    accuracy                           0.82       762
   macro avg       0.84      0.80      0.81       762
weighted avg       0.83      0.82      0.81       762



In [173]:
from joblib import dump

dump(bc, '../model/bc.pkl')
dump(cv, '../model/cv.pkl') 
dump(naive_bayes, '../model/nb.pkl') 

['../model/nb.pkl']