In [37]:
#spamデータ分類
import os
import pandas as pd
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT="http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL=DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL=DOWNLOAD_ROOT+ "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets","spam")

def fetch_spam_data(spam_url=SPAM_URL,spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2"),HAM_URL),("spam.tar.bz2", SPAM_URL):#()分割
        path=os.path.join(spam_path,filename)
        if not os.path.isfile(path):        
            urllib.request.urlretrieve(url,path)#url指定、保存先の名前指定
        tar_bz2_file=tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()
        
fetch_spam_data()#これ呼び出してファイル取得

HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR= os.path.join(SPAM_PATH,"spam")
ham_filenames=[name for name in sorted(os.listdir(HAM_DIR))
              if len(name)>20] #名前の長さが20以上のファイルを抽出
spam_filenames=[name for name in sorted(os.listdir(SPAM_DIR))
               if len(name)>20]
import email
import email.policy

#電子メールのメッセージヘッダのためのインターネット規定のフォーマット.RFC　822をパースする　emailモジュールを用いる
def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory="spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename),"rb")as f: ##with open ~:で使う
        return email.parser.BytesParser(policy=email.policy.default).parse(f)
    
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]    
spam_emails=[load_email(is_spam=True, filename=name) for name in spam_filenames]

#print(ham_emails[1].get_content().strip())#テキストファイル読み込み、strip()は両端の空白、改行を取り除く
#print(spam_emails[6].get_content().strip())

def get_email_structure(email):
    if isinstance(email,str):#型判定
        return email
    payload=email.get_payload()#現在のペイロードへの参照を返す。
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
                    ]))
    else:
        return email.get_content_type()#Return the message’s content type, coerced to lower case of the form maintype/subtype.
    
from collections import Counter

def structures_counter(emails):
    structures=Counter()
    for email in emails:
        structure=get_email_structure(email)
        structures[structure]+=1
    return structures

structures_counter(ham_emails).most_common()
structures_counter(spam_emails).most_common()

#for header, value in spam_emails[0].items():
    #print(header,":",value )
    
spam_emails[0]["Subject"]

import numpy as np
from sklearn.model_selection import train_test_split #シャッフル機能つき　デフォルトTrue
X=np.array(ham_emails+spam_emails)
y=np.array([0]*len(ham_emails)+[1]*len(spam_emails))
X_train,  X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42) #分類の分割では、この順番に出力

  
import re
from html import unescape

def html_to_plain_text(html): #*?直前の文字が₀回以上繰り返す　最短一致
    text=re.sub("<head.*?>.*?</head>", "",str(html), flags=re.M | re.S |re.I)#re.A (ASCII 限定マッチング)、 re.I (大文字・小文字を区別しない)、
    text=re.sub("<a\s.*?>","HYPERRINK",text, flags=re.M | re.S | re.I)                    #re.L (ロケール依存)、 re.M (複数行)、 re.S (ドットが全てにマッチ)、 re.U (Unicode マッチング)、 re.X (冗長)
    text=re.sub("<.*?>","",text,flags=re.M | re.S)
    text=re.sub(r"(\s*\n)+", "\n", text, flags=re.M | re.S)
    return unescape(text)

    
html_spam_emails = [email for email in X_train[y_train ==1 ] if get_email_structure(email)=="text/html"]
sample_html_spam=html_spam_emails[7]
#print(sample_html_spam.get_content().strip()[:1000], "...")
#email　html形式をテキスト形式に変更した

def email_to_text(email):
    html=None
    for part in email.walk():
        ctype=part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue #上記に対応するものはスキップする
        try:
            content = part.get_content()
        except: 
            content=str(part.get_payload())
        if ctype =="text/plain":
            return content
        else:
            html=content
    if html:
        return html_to_plain_text(html)
    html_to_plain_text(email)
    
try:
    import urlextract
    
    url_extractor=urlextract.URLExtract()
    urls=extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s")
except ImportError:
    print("error")    

try:
    import nltk
       
    stemmer=nltk.stem.PorterStemmer()   
except ImportError:
    print("error")

from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator,TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True,remove_punctuation=True,replace_urls=True,replace_numbers=True,stemming=True):
        self.strip_headers=strip_headers
        self.lower_case=lower_case
        self.remove_punctuation=remove_punctuation
        self.replace_urls=replace_urls
        self.replace_numbers=replace_numbers
        self.stemming=stemming
    def fit(self, X,y=None):
        return self
    def transform(self,X,y=None):
        X_transformed=[]
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text=text.lower()
            if self.replace_urls and url_extractor is not None:
                extractor=urlextract.URLExtract()
                urls=list(set(extractor.find_urls(text)))##重複をキャンセル
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text=text.replace(url," URL ")
            if self.replace_numbers:
                text=re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?',"NUMBER",text)
            if self.remove_punctuation:
                text = re.sub(r'\W+',' ',text,flags=re.M) #\Wで\w単語文字列の反対検索、　re.Mで複数行検索
            word_counts=Counter(text.split()) #前後カット
            if self.stemming and stemmer is not None:
                stemmed_word_counts=Counter(text.split())#countライブラリ
                for word,count in word_counts.items():
                    stemmed_word=stemmer.stem(word)
                    stemmed_word_counts[stemmed_word]+=count
                word_counts=stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)  #numpyに直す

X_few=X_train[:3]
X_few_wordcounts=EmailToWordCounterTransformer().fit_transform(X_few)
#print(X_few_wordcounts)    


from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator,TransformerMixin):
    def __init__(self,vocabulary_size=1000):
        self.vocabulary_size=vocabulary_size
    def fit(self, X,y=None):
        total_count=Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word]+=min(count,10)
        most_common=total_count.most_common()[:self.vocabulary_size]
        self.most_common_=most_common
        self.vocabulary_ = {word: index+1 for index, (word,count) in enumerate(most_common)}
        return self
    def transform(self, X,y=None):
        rows=[]
        cols=[]
        data=[]
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word,0))
                data.append(count)
        return csr_matrix((data,(rows, cols)), shape=(len(X), self.vocabulary_size+1)) 
vocab_transformer=WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors=vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

from sklearn.pipeline import Pipeline

preprocess_pipeline=Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformer = preprocess_pipeline.fit_transform(X_train)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logreg=LogisticRegression(solver="liblinear", random_state=42)
logistic=cross_val_score(logreg,X_train_transformer,y_train,cv=3,verbose=3)
logistic.mean()


from sklearn.metrics import precision_score, recall_score
X_test_transformer=preprocess_pipeline.transform(X_test)
log_clf=LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_train_transformer,y_train)
log_pred=log_clf.predict(X_test_transformer)##トレーニングデータで訓練したモデル（前処理）を用いてテストデータの前処理を行う。
print("Precision: {:.2f}%".format(100*precision_score(y_test,log_pred)))
print("Recall: {:.2f}%".format(100*recall_score(y_test,log_pred)))

        
        
    


                    
                
                
 

    
    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................. , score=0.98125, total=   0.2s
[CV]  ................................................................
[CV] .................................. , score=0.98125, total=   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ................................... , score=0.9925, total=   0.0s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.5s finished


Precision: 93.94%
Recall: 97.89%




In [None]:
from collections import defaultdict
df_box=[]
for i in range(len(X_train)):
    words=X_train[i]
    word_count=defaultdict(int)
    for word in words.values():
        for word in word.split():
            word_count[word]+=1    
    df=pd.DataFrame.from_dict(word_count, orient="index").T
    df.index=[i]
    df_box+=df

for df in df_box:
    email=df

    
    
    SPAM_PATH=os.path.join("datasets","spam")
def load_data(filename,spam_path=SPAM_PATH):
    file_path=os.path.join(spam_path,filename)
    data=glob.glob(file_path+"/*")
    mail=[open(mail,"r",encoding="utf-8_sig",errors="ignore").read() for mail in data]    
    return mail

spam_data=load_data("spam")
easy_data=load_data("easy_ham")
hard_data=load_data("hard_ham")
