In [249]:
import numpy as np
import pandas as pd
import string

#### 
import nltk
# to trzeba ściągnąć, w cmd:
# >>> import nltk
# >>> nltk.download()
# For central installation, set this to C:\nltk_data

import itertools
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# transformatory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# estymator
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# splitter
from sklearn.model_selection import train_test_split

# pipeline
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

## Przygotowanie danych tekstowych

### Usuwanie znaków interpunkcyjnych, takich jak:
'!"#$%&\'()*+,-./:;&lt;=>?@[\\]^_`{|}~'


In [250]:
# Zbiór znaków interpunkcyjnych
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [251]:
def remove_puncation(text):
    cleaned = ''.join([word for word in text if word not in string.punctuation])
    return cleaned

### Tokenizacja


In [252]:
def tokenize(text):

    # Usunięcie wielkich liter
    clean_text = text.lower()

    # Tokenizacja
    tokenized_text = nltk.word_tokenize(clean_text)
    return tokenized_text

### Usuwanie stopwords

In [253]:
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

179

In [254]:
def remove_stopwords(text):
    without_stopwords = [word for word in text if word not in stopwords]
    return without_stopwords

### Stemming
inaczej: proces tworzenia morfologicznych słów

In [255]:
stemmer = nltk.PorterStemmer()

def stemming(text):
    stemmed_words = [stemmer.stem(word) for word in text]
    return stemmed_words

### Lematyzacja

In [256]:
lemmatter = nltk.WordNetLemmatizer()
def lemmatizing(text):
    lemmatized_words = [lemmatter.lemmatize(word) for word in text]
    return lemmatized_words

### Surowe dane

In [257]:
spam_dataset = pd.read_csv('spam.csv', encoding = "ISO-8859-1", usecols=[0, 1], names=['Spam', 'Text'],
                           skiprows=1)
spam_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Spam    5572 non-null   object
 1   Text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [258]:
spam_dataset['Spam'] = spam_dataset['Spam'].replace(['ham','spam'],[0,1])
spam_dataset.head(3)

Unnamed: 0,Spam,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [259]:
spam_dataset['Spam'].value_counts(normalize=True)

Spam
0    0.865937
1    0.134063
Name: proportion, dtype: float64

Usuwanie znaków interpunkcyjnych, takich jak: '!"#$%&\'()*+,-./:;&lt;=>?@[\\]^_`{|}~'


In [260]:
spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(lambda x: remove_puncation(x))

Tokenizacja

In [261]:
spam_dataset['Tokenized_Text'] = spam_dataset['Cleaned_Text'].apply(lambda x: tokenize(x))

Usuwanie stopwords

In [262]:
spam_dataset['WithoutStop_Text'] = spam_dataset['Tokenized_Text'].apply(lambda x: remove_stopwords(x))

Stemming (nie zastosowano, operacja ta "psuje" dane). Nie działa dokładnie tak, jak na stronie kursu.

In [263]:
# spam_dataset['Stemmed_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: stemming(x))
spam_dataset['Stemmed_Text'] = spam_dataset['WithoutStop_Text']

Lematyzacja

In [264]:
spam_dataset['Lemmatized_Text'] = spam_dataset['Stemmed_Text'].apply(lambda x: lemmatizing(x))
spam_dataset.head(3)

Unnamed: 0,Spam,Text,Cleaned_Text,Tokenized_Text,WithoutStop_Text,Stemmed_Text,Lemmatized_Text
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."


Usunięcie etapów pośrednich

In [265]:
data = spam_dataset.drop(['Cleaned_Text', 'Tokenized_Text', 'WithoutStop_Text', 'Stemmed_Text'], axis=1)

In [266]:
data['features'] = data.Lemmatized_Text.apply(lambda x: " ".join(x))
data.head(5)

Unnamed: 0,Spam,Text,Lemmatized_Text,features
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]",nah dont think go usf life around though


## Uczenie modeli

### Dane wejściowe

In [267]:
X, y = data.features, data.Spam

In [268]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42, stratify=y)

### Model wstepny - obliczenie feature importance

#### CountVectorizer()

Pipeline - jakie n-gramy

In [269]:
def bestestimator(transformer):
    cv = KFold(n_splits=5, shuffle=False)
    
    classifier = RandomForestClassifier(max_depth=2, random_state=42)

    pipeline = Pipeline(steps=[('transformer', transformer),
                                        ('classifier', classifier)])

    params = {'transformer__min_df': [0.001, 0.01],
            'transformer__max_df': [0.25, 0.5, 0.75],
            'transformer__ngram_range': [(1, 1),(1,2),(2,2)]}
    
    gridsearch = GridSearchCV(pipeline,
                                params,
                                scoring='f1',
                                cv=cv,
                                n_jobs=-1)
    gridsearch.fit(X_train, y_train)
    print("\nNajlepsze hiperparametry:", gridsearch.best_params_, "\n")

    model_best = gridsearch.best_estimator_
    
    return model_best

In [270]:
model_count_best = bestestimator(transformer = CountVectorizer())
print(f'Score on train data: {model_count_best.score(X_train, y_train)},\nscore on test data: {model_count_best.score(X_test, y_test)}')


Najlepsze hiperparametry: {'transformer__max_df': 0.25, 'transformer__min_df': 0.01, 'transformer__ngram_range': (1, 1)} 

Score on train data: 0.8721055465805062,
score on test data: 0.8735199138858988


In [271]:
model_tfid_best = bestestimator(transformer=TfidfVectorizer())
print(f'Score on train data: {model_tfid_best.score(X_train, y_train)},\nscore on test data: {model_tfid_best.score(X_test, y_test)}')


Najlepsze hiperparametry: {'transformer__max_df': 0.25, 'transformer__min_df': 0.01, 'transformer__ngram_range': (1, 1)} 

Score on train data: 0.8729133010231557,
score on test data: 0.8740581270182992


Wybieram transformer TfidfVectorizer()

In [272]:
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.25, ngram_range=(1,1))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

classifier = RandomForestClassifier(max_depth=2, random_state=42)
classifier.fit(X_train_vectorized, y_train)

classifier.score(X_train_vectorized, y_train), classifier.score(X_test_vectorized, y_test)

(0.8729133010231557, 0.8740581270182992)

In [273]:
results = pd.DataFrame([[classifier.score(X_train_vectorized, y_train), classifier.score(X_test_vectorized, y_test)]], columns=['train_score', 'test_score'], index=['tfidf_raw'])

In [274]:
results

Unnamed: 0,train_score,test_score
tfidf_raw,0.872913,0.874058


Obliczam feature importance

In [275]:
classifier.feature_importances_[:20]

array([0.00000000e+00, 0.00000000e+00, 2.09358737e-04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00766848e-04, 0.00000000e+00,
       1.51801572e-01, 1.66151412e-04, 0.00000000e+00, 2.75148970e-02,
       8.24331376e-02, 0.00000000e+00, 1.98292835e-03, 3.86128000e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [276]:
vectorizer.vocabulary_

{'find': 25,
 'tell': 100,
 'buy': 7,
 'oh': 73,
 'cant': 9,
 'every': 23,
 'day': 18,
 'know': 46,
 'get': 28,
 'babe': 5,
 'go': 30,
 'ive': 44,
 'like': 52,
 'im': 43,
 'phone': 77,
 'free': 26,
 'call': 8,
 'tc': 99,
 'dear': 19,
 'good': 33,
 'morning': 64,
 'said': 86,
 'amp': 2,
 'msg': 65,
 'even': 22,
 'got': 34,
 'back': 6,
 'still': 95,
 'today': 108,
 'send': 89,
 'min': 61,
 'dont': 21,
 'pls': 81,
 'need': 68,
 'come': 14,
 'hi': 39,
 'want': 115,
 'work': 123,
 'reply': 84,
 'co': 13,
 'life': 51,
 'lot': 55,
 'love': 56,
 'friend': 27,
 'tomorrow': 109,
 'lor': 54,
 'sorry': 94,
 'ill': 42,
 'later': 49,
 'ask': 4,
 'message': 60,
 'please': 80,
 'stop': 96,
 'one': 75,
 'mobile': 63,
 'service': 91,
 'per': 76,
 'wish': 121,
 'yes': 127,
 'well': 119,
 'way': 117,
 'anything': 3,
 'ltgt': 57,
 'take': 98,
 'contact': 15,
 'da': 17,
 'sure': 97,
 'ur': 112,
 'hope': 41,
 'care': 10,
 'text': 101,
 'make': 58,
 'home': 40,
 'night': 71,
 'going': 31,
 'time': 107,
 'hey'

### Model docelowy

Pozostawiam cechy o ważności $10^{-3}$

In [277]:
features_to_leave = np.where(classifier.feature_importances_>1e-3)
features_to_leave

(array([  8,  11,  12,  14,  15,  23,  26,  42,  43,  60,  61,  63,  65,
         69,  76,  80,  82,  84,  89,  91,  92,  94,  96,  99, 101, 111,
        112, 113, 118, 120], dtype=int64),)

In [278]:
vocabulary_trimmed = dict(zip(list(np.array(list(vectorizer.vocabulary_.keys()))[features_to_leave]),
    list(np.array(list(vectorizer.vocabulary_.values()))[features_to_leave])))
vocabulary_trimmed

{'get': 28,
 'ive': 44,
 'like': 52,
 'phone': 77,
 'free': 26,
 'msg': 65,
 'back': 6,
 'love': 56,
 'friend': 27,
 'way': 117,
 'anything': 3,
 'take': 98,
 'da': 17,
 'care': 10,
 'hey': 38,
 'txt': 111,
 'show': 92,
 'ok': 74,
 'claim': 12,
 'give': 29,
 'gon': 32,
 'thanks': 102,
 'feel': 24,
 'place': 79,
 'happy': 37,
 'tonight': 110,
 'wat': 116,
 'yeah': 125,
 'keep': 45,
 'didnt': 20}

In [279]:
words_left = list(np.array(list(vectorizer.vocabulary_.keys()))[features_to_leave])
print(words_left)

['get', 'ive', 'like', 'phone', 'free', 'msg', 'back', 'love', 'friend', 'way', 'anything', 'take', 'da', 'care', 'hey', 'txt', 'show', 'ok', 'claim', 'give', 'gon', 'thanks', 'feel', 'place', 'happy', 'tonight', 'wat', 'yeah', 'keep', 'didnt']


"wyczyszczone" listy słów z poszczególnych dokumentów

In [280]:
data['clean'] = data['Lemmatized_Text'].apply(lambda x: [j for j in x if j in words_left])
data['clean_features'] = data['clean'].apply(lambda x: " ".join(x))
data

Unnamed: 0,Spam,Text,Lemmatized_Text,features,clean,clean_features
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...,[wat],wat
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni,[ok],ok
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...,"[free, txt]",free txt
3,0,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say,[],
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]",nah dont think go usf life around though,[],
...,...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, 2, contact, u, u, å£750, po...",2nd time tried 2 contact u u å£750 pound prize...,[claim],claim
5568,0,Will Ì_ b going to esplanade fr home?,"[ì, b, going, esplanade, fr, home]",ì b going esplanade fr home,[],
5569,0,"Pity, * was in mood for that. So...any other s...","[pity, mood, soany, suggestion]",pity mood soany suggestion,[],
5570,0,The guy did some bitching but I acted like i'd...,"[guy, bitching, acted, like, id, interested, b...",guy bitching acted like id interested buying s...,"[like, free]",like free


Czyszczę DataFrame z pustych cech

In [281]:
data = data.loc[data['clean_features'] != '']

In [282]:
data.head(3)

Unnamed: 0,Spam,Text,Lemmatized_Text,features,clean,clean_features
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...,[wat],wat
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni,[ok],ok
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...,"[free, txt]",free txt


Liczba danych spada o ponad połowę

dane treningowe i testowe z nowego zbioru

In [283]:
X, y = data.clean_features, data.Spam
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42, stratify=y)

obliczenia dla wybranych parametrów

In [284]:
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.25, ngram_range=(1,1))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

classifier = RandomForestClassifier(max_depth=2, random_state=42)
classifier.fit(X_train_vectorized, y_train)

classifier.score(X_train_vectorized, y_train), classifier.score(X_test_vectorized, y_test)

(0.8460166468489893, 0.8406658739595719)

docelowy Pipeline

In [285]:
cv = KFold(n_splits=5, shuffle=False)

transformer = CountVectorizer(min_df=0.01, max_df=0.25, ngram_range=(1,1))
# transformer = TfidfVectorizer(min_df=0.01, max_df=0.25, ngram_range=(1,1))
classifier = RandomForestClassifier(max_depth=2, random_state=42)

pipeline = Pipeline(steps=[('transformer', transformer),
                            ('classifier', classifier)])

# pipeline.get_params()

Gridsearch

In [286]:
# 'max_df corresponds to < documents than min_df'

params = {'transformer__min_df': [0.0001, 0.001, 0.005],
        # 'transformer__max_df': [0.4, 0.5, 0.6, 0.7, 0.8, 0.99, 1.0],
        'transformer__max_df': [0.4, 0.6, 0.8, 0.99, 1.0],
        'transformer__ngram_range': [(1,1),(1,2),(2,2)]}

gridsearch = GridSearchCV(pipeline,
                            params,
                            scoring='f1',
                            cv=cv,
                            n_jobs=-1)
gridsearch.fit(X_train, y_train)
print("\nNajlepsze hiperparametry:", gridsearch.best_params_, "\n")

model_best = gridsearch.best_estimator_


Najlepsze hiperparametry: {'transformer__max_df': 0.4, 'transformer__min_df': 0.0001, 'transformer__ngram_range': (1, 1)} 



Ocena docelowego modelu

In [287]:
model_best.score(X_train, y_train), model_best.score(X_test, y_test)

(0.8460166468489893, 0.8406658739595719)

In [288]:
new_row = pd.DataFrame([[model_best.score(X_train, y_train), model_best.score(X_test, y_test)]], columns=['train_score', 'test_score'], index=['optimized'])

In [289]:
results = pd.concat([results, new_row])

In [290]:
results

Unnamed: 0,train_score,test_score
tfidf_raw,0.872913,0.874058
optimized,0.846017,0.840666


Wynik jest porównywalny, biorąc pod uwagę, że trochę danych wypadło.