# Лабораторная работа №10. Основы обработки естественного языка (NLP). Задача тематического моделирования.

In [31]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [3]:
df = pd.read_csv("../data/spam.csv", encoding = "ISO-8859-1")
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


удаляю лишние столбцы, содержащие NaN

In [4]:
df.drop(df.columns[[2, 3, 4]], axis = 1, inplace = True )
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


Перевод категориальных данных столбца v1 в бинарные

In [5]:
df["v1"]= df['v1'].replace('ham', 0)
df["v1"]= df['v1'].replace('spam', 1)
df

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


## Bag of words

In [7]:
nlp = spacy.load("en_core_web_sm")

def lemmatize(string):
    return " ".join([word.lemma_ for word in nlp(string)])

In [8]:
lemmatize("City of stars, are you shining just for me?")

'city of star , be you shine just for I ?'

In [9]:
df["v2"] = df["v2"].apply(lemmatize)

count_vectorizer = CountVectorizer(max_features=25, stop_words='english')
bow = count_vectorizer.fit_transform(df['v2']).A
count_vectorizer.vocabulary_ # сопоставление слов с индексами признаков

{'ok': 13,
 'free': 2,
 'text': 18,
 'txt': 22,
 'say': 14,
 'think': 19,
 'like': 8,
 'send': 15,
 'home': 5,
 'want': 24,
 'today': 21,
 'day': 1,
 'time': 20,
 'make': 11,
 'ur': 23,
 'just': 6,
 'know': 7,
 'love': 9,
 'tell': 17,
 'need': 12,
 'lt': 10,
 'gt': 4,
 'good': 3,
 'come': 0,
 'stop': 16}

In [11]:
df_bow = pd.DataFrame(bow, columns=count_vectorizer.get_feature_names_out())
df_bow

Unnamed: 0,come,day,free,good,gt,home,just,know,like,love,...,send,stop,tell,text,think,time,today,txt,ur,want
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5568,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
target=df["v1"]
target

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5572, dtype: int64

In [16]:
# понижаю размерность предикторов
pca = PCA(n_components=7)
X_pca = pca.fit_transform(df_bow, target)

print(df_bow.shape)
print(X_pca.shape)

(5572, 25)
(5572, 7)


In [None]:
#провожу классификацию с помощью бэггинга
X_train, X_test, y_train, y_test = train_test_split(X_pca, target, test_size=0.33)
bag = BaggingClassifier().fit(X_train, y_train)
print(classification_report(y_test, bag.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      1590
           1       0.76      0.38      0.51       249

    accuracy                           0.90      1839
   macro avg       0.84      0.68      0.72      1839
weighted avg       0.89      0.90      0.88      1839



## TF-IDF

In [27]:
tfidf_vectorizer = TfidfVectorizer(max_features=25)
tfidf = tfidf_vectorizer.fit_transform(df['v2']).A
tfidf_vectorizer.vocabulary_

{'go': 8,
 'in': 10,
 'get': 7,
 'to': 20,
 'so': 17,
 'do': 5,
 'not': 13,
 'it': 11,
 'be': 1,
 'now': 14,
 'and': 0,
 'you': 23,
 'for': 6,
 'my': 12,
 'your': 24,
 'have': 9,
 'call': 3,
 'the': 19,
 'on': 16,
 'will': 22,
 'that': 18,
 'can': 4,
 'we': 21,
 'but': 2,
 'of': 15}

In [28]:
df_tfidf = pd.DataFrame(bow, columns=count_vectorizer.get_feature_names_out())
df_tfidf


Unnamed: 0,come,day,free,good,gt,home,just,know,like,love,...,send,stop,tell,text,think,time,today,txt,ur,want
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5568,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
pca_tf = PCA(n_components=7)
X_pca_tf = pca.fit_transform(df_tfidf, target)

print(df_tfidf.shape)
print(X_pca_tf.shape)

(5572, 25)
(5572, 7)


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_pca_tf, target, test_size=0.33)
bag_tf = BaggingClassifier().fit(X_train, y_train)
print(classification_report(y_test, bag_tf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96      1605
           1       0.90      0.46      0.61       234

    accuracy                           0.92      1839
   macro avg       0.91      0.72      0.78      1839
weighted avg       0.92      0.92      0.91      1839



## LDA

In [32]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

In [33]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [35]:
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(df_tfidf)

In [36]:
# матрица темы-слова
lda.components_.shape

(10, 25)

In [38]:
# матрица документы-темы
lda.transform(df_tfidf).shape

(5572, 10)

In [40]:
print_top_words(lda, tfidf_vectorizer.get_feature_names_out(), 10)

Topic #0:
and now so go to not be we do your
Topic #1:
for my you get so it now to of do
Topic #2:
be we of call get have can in you now
Topic #3:
call go the to on get now for have it
Topic #4:
your and that do get so for of to it
Topic #5:
can in to go my it now call but and
Topic #6:
not have it do be and to the get call
Topic #7:
of that on for but the now will you get
Topic #8:
you get of will go not so call have it
Topic #9:
but will you on that so for of we be
