### Выполнил: Молчанов АВ
### Задание: решить задачу классификации текстов, сформировав два варианта векторизации признаков — на основе CountVectorizer и на основе TfidfVectorizer. Использовать два классификатора: KNeighborsClassifier и Complement Naive Bayes (ComplementNB).

In [1]:
import os
import gzip
import shutil

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the labelled e-mail corpus from the CSV that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer='spam_or_not_spam.csv')

# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


Feature preparation

In [3]:
# Build the TF-IDF document-term matrix for every e-mail in the corpus.
# The vectorizer is created with default settings (no ngram_range is passed,
# despite the "ngram" in the variable name).
# NOTE(review): astype('U') stringifies every value, so a missing e-mail (NaN)
# would become the literal token 'nan' -- confirm whether the column has nulls.
tfidf_corpus = data['email'].values.astype('U')
tfidfv = TfidfVectorizer()
tfidf_ngram_features = tfidfv.fit_transform(tfidf_corpus)

# Show the sparse-matrix summary (shape, dtype, number of stored elements).
tfidf_ngram_features

<3000x34117 sparse matrix of type '<class 'numpy.float64'>'
	with 348377 stored elements in Compressed Sparse Row format>

In [4]:
# Build the raw term-count document-term matrix for every e-mail in the corpus.
# The vectorizer is created with default settings (no ngram_range is passed,
# despite the "ngram" in the variable name).
# NOTE(review): astype('U') stringifies every value, so a missing e-mail (NaN)
# would become the literal token 'nan' -- confirm whether the column has nulls.
count_corpus = data['email'].values.astype('U')
countvec = CountVectorizer()
countvec_ngram_features = countvec.fit_transform(count_corpus)

# Show the sparse-matrix summary (shape, dtype, number of stored elements).
countvec_ngram_features

<3000x34117 sparse matrix of type '<class 'numpy.int64'>'
	with 348377 stored elements in Compressed Sparse Row format>

KNeighborsClassifier

In [5]:
# TF-IDF features + KNeighborsClassifier
# NOTE(review): tfidf_ngram_features comes from a vectorizer fitted on the whole
# corpus before this split, so vocabulary/IDF statistics have already seen the
# test e-mails. Consider splitting the raw text first and fitting on train only.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_ngram_features, data['label'], test_size=0.3, random_state=1
)

# Fit k-NN with its default hyper-parameters and predict the held-out 30 %.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# target_names is intentionally omitted. The previous code built it from
# y_test.unique(), which lists labels in order of first appearance, whereas
# classification_report lays rows out in sorted label order -- the rows would be
# mislabelled whenever the first test sample is not the smallest class.
# Without target_names the report names each row after its (sorted) label.
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9872    0.9264    0.9558       747
           1     0.7236    0.9412    0.8182       153

    accuracy                         0.9289       900
   macro avg     0.8554    0.9338    0.8870       900
weighted avg     0.9424    0.9289    0.9324       900



In [6]:
# CountVectorizer features + KNeighborsClassifier
# NOTE(review): countvec_ngram_features comes from a vectorizer fitted on the
# whole corpus before this split, so the vocabulary has already seen the test
# e-mails. Consider splitting the raw text first and fitting on train only.
X_train, X_test, y_train, y_test = train_test_split(
    countvec_ngram_features, data['label'], test_size=0.3, random_state=1
)

# Fit k-NN with its default hyper-parameters and predict the held-out 30 %.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# target_names is intentionally omitted. The previous code built it from
# y_test.unique(), which lists labels in order of first appearance, whereas
# classification_report lays rows out in sorted label order -- the rows would be
# mislabelled whenever the first test sample is not the smallest class.
# Without target_names the report names each row after its (sorted) label.
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9645    0.9451    0.9547       747
           1     0.7560    0.8301    0.7913       153

    accuracy                         0.9256       900
   macro avg     0.8602    0.8876    0.8730       900
weighted avg     0.9290    0.9256    0.9269       900



Complement Naive Bayes

In [7]:
# TF-IDF features + Complement Naive Bayes
# NOTE(review): tfidf_ngram_features comes from a vectorizer fitted on the whole
# corpus before this split, so vocabulary/IDF statistics have already seen the
# test e-mails. Consider splitting the raw text first and fitting on train only.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_ngram_features, data['label'], test_size=0.3, random_state=1
)

# Fit ComplementNB with its default hyper-parameters and predict the held-out 30 %.
model = ComplementNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# target_names is intentionally omitted. The previous code built it from
# y_test.unique(), which lists labels in order of first appearance, whereas
# classification_report lays rows out in sorted label order -- the rows would be
# mislabelled whenever the first test sample is not the smallest class.
# Without target_names the report names each row after its (sorted) label.
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8830    1.0000    0.9379       747
           1     1.0000    0.3529    0.5217       153

    accuracy                         0.8900       900
   macro avg     0.9415    0.6765    0.7298       900
weighted avg     0.9029    0.8900    0.8671       900



In [8]:
# CountVectorizer features + Complement Naive Bayes
# NOTE(review): countvec_ngram_features comes from a vectorizer fitted on the
# whole corpus before this split, so the vocabulary has already seen the test
# e-mails. Consider splitting the raw text first and fitting on train only.
X_train, X_test, y_train, y_test = train_test_split(
    countvec_ngram_features, data['label'], test_size=0.3, random_state=1
)

# Fit ComplementNB with its default hyper-parameters and predict the held-out 30 %.
model = ComplementNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# target_names is intentionally omitted. The previous code built it from
# y_test.unique(), which lists labels in order of first appearance, whereas
# classification_report lays rows out in sorted label order -- the rows would be
# mislabelled whenever the first test sample is not the smallest class.
# Without target_names the report names each row after its (sorted) label.
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9907    0.9973    0.9940       747
           1     0.9865    0.9542    0.9701       153

    accuracy                         0.9900       900
   macro avg     0.9886    0.9758    0.9820       900
weighted avg     0.9900    0.9900    0.9899       900



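### Вывод: лучший результат показал Complement Naive Bayes на признаках CountVectorizer (accuracy 0.9900, macro F1 0.9820). На признаках TfidfVectorizer тот же классификатор оказался худшим по macro F1 (0.7298; recall по классу 1 — всего 0.3529). KNeighborsClassifier дал macro F1 0.8870 на TF-IDF и 0.8730 на CountVectorizer.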