### Задание:
Для произвольного набора данных, предназначенного для классификации текстов, решите задачу классификации текста двумя способами:

- Способ 1. На основе CountVectorizer или TfidfVectorizer.
- Способ 2. На основе моделей word2vec или Glove или fastText.
- Сравните качество полученных моделей.

In [1]:
import numpy as np
import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import time
import gensim.downloader as api
from tqdm import tqdm
import sys

In [2]:
# Загрузка данных
train_data = pd.read_csv('data/Corona_NLP_train.csv', encoding='latin1')
test_data = pd.read_csv('data/Corona_NLP_test.csv', encoding='latin1')

In [3]:
train_data.shape

(41157, 6)

In [4]:
train_data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [5]:
test_data.shape

(3798, 6)

In [6]:
test_data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [7]:
X_train = train_data['OriginalTweet']
y_train = train_data['Sentiment']

X_test = test_data['OriginalTweet']
y_test = test_data['Sentiment']

In [8]:
def check_missing(data, name):
    missing = data.isnull().sum()
    print(f'У {name} {missing} пропущенных строк')

In [9]:
check_missing(train_data, 'train_data')
check_missing(test_data, 'test_data')
check_missing(X_train, 'X_train')
check_missing(X_test, 'X_test')
check_missing(y_train, 'y_train')
check_missing(y_test, 'y_test')

У train_data UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64 пропущенных строк
У test_data UserName           0
ScreenName         0
Location         834
TweetAt            0
OriginalTweet      0
Sentiment          0
dtype: int64 пропущенных строк
У X_train 0 пропущенных строк
У X_test 0 пропущенных строк
У y_train 0 пропущенных строк
У y_test 0 пропущенных строк


In [10]:
# Векторизация с помощью CountVectorizer
count_vect = CountVectorizer() 
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# Векторизация с помощью TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [11]:
# Загрузка предобученных моделей
word2vec_model = api.load('word2vec-google-news-300')
glove_model = api.load('glove-twitter-200')
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

In [12]:
# Функция для усреднения векторов слов в тексте
def vectorize_text(text, model):
    words = text.split()
    vectors = [model[word] for word in words if word in model]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Векторизация данных
def vectorize_dataset(dataset, model, desc="Vectorizing"):
    return np.array([vectorize_text(text, model) for text in tqdm(dataset, desc=desc)])

In [13]:
X_train_w2v = vectorize_dataset(X_train, word2vec_model, desc="Vectorizing word2vec")
X_test_w2v = vectorize_dataset(X_test, word2vec_model, desc="Vectorizing word2vec")

X_train_glove = vectorize_dataset(X_train, glove_model, desc="Vectorizing Glove")
X_test_glove = vectorize_dataset(X_test, glove_model, desc="Vectorizing Glove")

X_train_fasttext = vectorize_dataset(X_train, fasttext_model, desc="Vectorizing fastText")
X_test_fasttext = vectorize_dataset(X_test, fasttext_model, desc="Vectorizing fastText")

Vectorizing word2vec: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41157/41157 [00:01<00:00, 30472.68it/s]
Vectorizing word2vec: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3798/3798 [00:00<00:00, 31254.35it/s]
Vectorizing Glove: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41157/41157 [00:01<00:00, 34204.91it/s]
Vectorizing Glove: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3798/3798 [00:00<00:00, 33523.72it/s]
Vectorizing fastText: 100%|█████████████████████████████████████████████████

In [14]:
# Функции для оценки точности для каждой метки
def accuracy_score_for_classes(y_true, y_pred):
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    classes = np.unique(y_true)
    res = dict()
    for c in classes:
        temp_data_flt = df[df['t'] == c]
        temp_acc = accuracy_score(temp_data_flt['t'].values, temp_data_flt['p'].values)
        res[c] = temp_acc
    return res

In [15]:
def print_accuracy_score_for_classes(y_true, y_pred):
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs) > 0:
        print('Метка \t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

In [16]:
# Оценка моделей
def evaluate_model(vectorizer_name, vectorizer_train, vectorizer_test, model, model_name):
    start_time = time.time()
    obj_model = model
    obj_model.fit(vectorizer_train, y_train)
    predictions = obj_model.predict(vectorizer_test)
    
    accuracy = accuracy_score(y_test, predictions)
    duration = (time.time() - start_time) / 60
    
    print(f'Точность: {accuracy:.4f}, время обучения классификатора: {duration:.2f} мин. ({vectorizer_name} + {model_name})')
    print_accuracy_score_for_classes(y_test, predictions)

In [17]:
classifiers = {
    "RandomForestClassifier": RandomForestClassifier(),
    "LinearSVC": LinearSVC(max_iter=10000),
    "LogisticRegression": LogisticRegression(max_iter=10000)
}

vectorizers = {
    "CountVectorizer": (X_train_counts, X_test_counts),
    "TfidfVectorizer": (X_train_tfidf, X_test_tfidf),
    "word2vec": (X_train_w2v, X_test_w2v),
    "glove": (X_train_glove, X_test_glove),
    "fastText": (X_train_fasttext, X_test_fasttext)
}

In [18]:
for vec_name, (train_vec, test_vec) in vectorizers.items():
    for clf_name, clf in classifiers.items():
        evaluate_model(vec_name, train_vec, test_vec, clf, clf_name)

Точность: 0.4513, время обучения классификатора: 1.26 мин. (CountVectorizer + RandomForestClassifier)
Метка 	 Accuracy
Extremely Negative 	 0.20270270270270271
Extremely Positive 	 0.24874791318864775
Negative 	 0.46397694524495675
Neutral 	 0.6252019386106623
Positive 	 0.6071805702217529
Точность: 0.5284, время обучения классификатора: 0.32 мин. (CountVectorizer + LinearSVC)
Метка 	 Accuracy
Extremely Negative 	 0.589527027027027
Extremely Positive 	 0.6260434056761269
Negative 	 0.43419788664745435
Neutral 	 0.6348949919224556
Positive 	 0.4625131995776135
Точность: 0.6087, время обучения классификатора: 0.49 мин. (CountVectorizer + LogisticRegression)
Метка 	 Accuracy
Extremely Negative 	 0.5472972972972973
Extremely Positive 	 0.6126878130217028
Negative 	 0.5629202689721422
Neutral 	 0.715670436187399
Positive 	 0.6251319957761352
Точность: 0.4379, время обучения классификатора: 1.09 мин. (TfidfVectorizer + RandomForestClassifier)
Метка 	 Accuracy
Extremely Negative 	 0.258445945

### Итог
Наилучший результат получен с использованием CountVectorizer и LogisticRegression.