<a href="https://colab.research.google.com/github/DallasAutumn/datawhale_salons/blob/master/NLP_s8_task4(b).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

from collections import defaultdict

from gensim import corpora, models, similarities

from itertools import chain

from pprint import pprint

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import LinearSVC

import csv
import logging
import nltk
import os
import pandas as pd
import pathlib

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
%pprint on

Pretty printing has been turned OFF


In [0]:
stoplist = stopwords.words('english')
def preprocessing(raw_text) -> str:
    #raw_text = raw_text.decode("utf8")
    tokens = [
        word for sent in sent_tokenize(raw_text)
        for word in word_tokenize(sent)
    ]
    tokens = [
        token for token in tokens if (token not in stoplist and len(token) > 3)
    ]
    tokens = [word.lower() for word in tokens]
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


def load_data(filepath):
    pass

In [0]:
sms_path = str(pathlib.Path(
    os.getcwd()).parent) + ('\\datasets\\smsspamcollection\\SMSSpamCollection')
sms_df = pd.read_csv(sms_path, sep='\t')
sms_df.columns = ['label', 'content']
X, y = sms_df['content'].apply(preprocessing), sms_df['label']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

In [0]:
vectorizer = TfidfVectorizer(min_df=2,
                             ngram_range=(1, 2),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [0]:
clf = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = clf.predict(X_test)
print(confusion_matrix(y_test, y_nb_predicted))
print(classification_report(y_test, y_nb_predicted))

[[1453    0]
 [  45  174]]
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1453
        spam       1.00      0.79      0.89       219

   micro avg       0.97      0.97      0.97      1672
   macro avg       0.98      0.90      0.94      1672
weighted avg       0.97      0.97      0.97      1672



In [0]:
svm_clf = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_clf.predict(X_test)
print(confusion_matrix(y_test, y_svm_predicted))
print(classification_report(y_test, y_svm_predicted))

[[1446    7]
 [  24  195]]
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1453
        spam       0.97      0.89      0.93       219

   micro avg       0.98      0.98      0.98      1672
   macro avg       0.97      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [0]:
n_clusters = 5
km = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1, n_jobs=-1, verbose=logging.INFO).fit(X_train)
clustering = defaultdict(list)
for idx, label in enumerate(km.labels_):
    clustering[label].append(idx)
clustering

defaultdict(<class 'list'>, {1: [0, 149, 172, 227, 232, 333, 364, 370, 375, 446, 469, 493, 498, 568, 594, 715, 863, 972, 1022, 1027, 1157, 1180, 1203, 1227, 1252, 1416, 1504, 1563, 1566, 1700, 1799, 1906, 1940, 1980, 1999, 2000, 2022, 2068, 2080, 2099, 2109, 2124, 2127, 2151, 2213, 2243, 2307, 2335, 2364, 2370, 2470, 2506, 2512, 2675, 2710, 2868, 3012, 3100, 3120, 3150, 3404, 3469, 3726, 3812], 0: [1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 121, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 145, 146, 147, 151, 152, 153,

In [0]:
documents = list(X)
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus=corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf

2019-07-18 17:06:03,343 INFO adding document #0 to Dictionary(0 unique tokens: [])
2019-07-18 17:06:03,428 INFO built Dictionary(7988 unique tokens: ['joking', '08452810075over18', '2005', '21st', '87121']...) from 5571 documents (total 37430 corpus positions)
2019-07-18 17:06:03,478 INFO collecting document frequencies
2019-07-18 17:06:03,480 INFO PROGRESS: processing document #0
2019-07-18 17:06:03,490 INFO calculating IDF weights for 5571 documents and 7987 features (35952 matrix non-zeros)


<gensim.interfaces.TransformedCorpus object at 0x000001CF632198D0>

In [0]:
n_topics = 5
lda = models.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=n_topics)
lda.print_topics()

2019-07-18 17:17:55,288 INFO using symmetric alpha at 0.2
2019-07-18 17:17:55,289 INFO using symmetric eta at 0.2
2019-07-18 17:17:55,292 INFO using serial LDA version on this node
2019-07-18 17:17:55,437 INFO running online (single-pass) LDA training, 5 topics, 1 passes over the supplied corpus of 5571 documents, updating model once every 2000 documents, evaluating perplexity every 5571 documents, iterating 50x with a convergence threshold of 0.001000
2019-07-18 17:17:55,544 INFO PROGRESS: pass 0, at document #2000/5571
2019-07-18 17:17:56,358 INFO merging changes from 2000 documents into a model of 5571 documents
2019-07-18 17:17:56,369 INFO topic #0 (0.200): 0.005*"well" + 0.004*"time" + 0.004*"waiting" + 0.004*"good" + 0.004*"call" + 0.004*"guess" + 0.003*"number" + 0.003*"know" + 0.003*"later" + 0.003*"think"
2019-07-18 17:17:56,371 INFO topic #1 (0.200): 0.005*"like" + 0.004*"know" + 0.003*"home" + 0.003*"call" + 0.003*"tell" + 0.003*"dont" + 0.003*"great" + 0.003*"come" + 0.003*

[(0, '0.005*"number" + 0.005*"time" + 0.004*"call" + 0.003*"waiting" + 0.003*"guess" + 0.003*"please" + 0.003*"stuff" + 0.003*"well" + 0.003*"liao" + 0.003*"detail"'), (1, '0.006*"home" + 0.005*"like" + 0.004*"back" + 0.004*"know" + 0.004*"tell" + 0.004*"call" + 0.004*"okie" + 0.003*"dear" + 0.003*"leave" + 0.003*"yeah"'), (2, '0.010*"later" + 0.008*"sorry" + 0.008*"anything" + 0.007*"call" + 0.006*"going" + 0.005*"tomorrow" + 0.004*"night" + 0.004*"come" + 0.004*"today" + 0.004*"take"'), (3, '0.008*"call" + 0.006*"free" + 0.006*"meeting" + 0.005*"text" + 0.004*"want" + 0.004*"happy" + 0.004*"stop" + 0.004*"fine" + 0.004*"princess" + 0.003*"mobile"'), (4, '0.006*"love" + 0.006*"come" + 0.005*"pick" + 0.004*"phone" + 0.004*"good" + 0.004*"want" + 0.004*"send" + 0.004*"dont" + 0.003*"home" + 0.003*"time"')]

In [0]:
lda.show_topics()[0][1]

'0.005*"number" + 0.005*"time" + 0.004*"call" + 0.003*"waiting" + 0.003*"guess" + 0.003*"please" + 0.003*"stuff" + 0.003*"well" + 0.003*"liao" + 0.003*"detail"'