## Training a sentiment analysis classifier based on supervised machine learning algorithms

In [1]:
import string

import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.svm import SVC

from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score

In [2]:
# Show full cell contents (no truncation) so whole tweets are readable
pd.options.display.max_colwidth = None

In [3]:
# Spanish stop words from NLTK, stored as a set.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm
stop_words = set(stopwords.words('spanish'))

In [4]:
def tokenizer(text):
    """Split *text* into tokens using NLTK's TweetTokenizer with default settings.

    A new TweetTokenizer is created on every call and its token list is returned.
    """
    return TweetTokenizer().tokenize(text)

### Loading labeled tweets

In [5]:
# Source spreadsheet: https://docs.google.com/spreadsheets/d/11_E2ngsEOyTQwbwVVRHY5urzFF95BQCV/edit#gid=1788161364
# Comma is read_csv's default separator, so it does not need to be spelled out.
tweets_df = pd.read_csv('./data/tweets_labeled.csv')

In [6]:
tweets_df.shape

(296, 2)

In [None]:
tweets_df.head()

In [None]:
# Relative frequency of each sentiment label; dropna=False also counts
# rows whose label is missing (NaN) as their own category.
tweets_df['sentiment'].value_counts(dropna = False, normalize = True)

### Leaving out unlabeled texts: this data is not useful for training or validating a supervised model

In [None]:
# Keep only the tweets that have a sentiment label
tweets_labeled_df = tweets_df.dropna(subset=['sentiment'])

In [None]:
tweets_labeled_df.shape

In [None]:
# Tweets whose sentiment label is missing
tweets_unlabeled_df = tweets_df[tweets_df['sentiment'].isna()]

In [None]:
tweets_unlabeled_df.shape

In [None]:
# Scenario 2: binary classification.
# Only rows labeled 'positive' or 'negative' are kept; every other label (e.g. neutral) is dropped.
tweets_labeled_df = tweets_labeled_df.query("sentiment in ['positive', 'negative']")

### Splitting train and test datasets

In [None]:
# 80/20 split, stratified on the label so both partitions keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    tweets_labeled_df['full_text'],
    tweets_labeled_df['sentiment'],
    test_size=0.2,
    stratify=tweets_labeled_df['sentiment'],
    random_state=1,
)


In [None]:
X_train.shape

In [None]:
# Class proportions in the training labels
y_train.value_counts(normalize=True)

In [None]:
X_test.shape

In [None]:
# Class proportions in the test labels
y_test.value_counts(normalize=True)

### Vectorizing texts

<table>
    <tbody>
        <tr>
            <td>
                <h4>Bag of Words</h4>
                <img src="imgs/bow.png" style="width: 500px;">
            </td>
            <td>
                <h4>TF-IDF</h4>
                <img src="imgs/tf-idf.png" style="width: 500px;">
            </td>
        </tr>
    </tbody>
</table>

In [None]:
# Bag-of-words vectorizer built on the tweet-aware tokenizer.
# scikit-learn documents `stop_words` as 'english', a list, or None, and recent
# releases reject a set during parameter validation, so the stop-word set is
# passed as a (sorted, hence reproducible) list.
bow = CountVectorizer(tokenizer = tokenizer, stop_words = sorted(stop_words))

In [None]:
# TF-IDF vectorizer built on the tweet-aware tokenizer.
# scikit-learn documents `stop_words` as 'english', a list, or None, and recent
# releases reject a set during parameter validation, so the stop-word set is
# passed as a (sorted, hence reproducible) list.
tfidf = TfidfVectorizer(tokenizer = tokenizer, stop_words = sorted(stop_words))

In [None]:
X_bow = bow.fit_transform(X_train)

In [None]:
X_tfidf = tfidf.fit_transform(X_train)

### Training and evaluating a model using BOW

In [None]:
# RBF-kernel support vector classifier with gamma='auto'.
# `degree` only affects the polynomial kernel (and 3 is its default), so it is not passed.
model = SVC(kernel='rbf', gamma='auto')

In [None]:
model.fit(X_bow, y_train)

In [None]:
# Predict on the training matrix and on the test texts encoded with the
# already-fitted bag-of-words vectorizer.
y_train_bow_predict, y_test_bow_predict = (
    model.predict(features) for features in (X_bow, bow.transform(X_test))
)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_train, y_train_bow_predict)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_test_bow_predict)

In [None]:
# Per-class metrics on the test set: average=None returns one score per label
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_test_bow_predict, average=None))

### Training and evaluating a model using TF-IDF

In [None]:
# RBF-kernel support vector classifier with gamma='auto'.
# `degree` only affects the polynomial kernel (and 3 is its default), so it is not passed.
model = SVC(kernel='rbf', gamma='auto')

In [None]:
model.fit(X_tfidf, y_train)

In [None]:
# Predictions of the model trained on TF-IDF features.
# The test texts must be encoded with the same fitted `tfidf` vectorizer used
# for training; encoding them with the bag-of-words vectorizer would feed the
# model features from a different representation and invalidate the evaluation.
y_train_tfidf_predict = model.predict(X_tfidf)
y_test_tfidf_predict = model.predict(tfidf.transform(X_test))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_train, y_train_tfidf_predict)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_test_tfidf_predict)

In [None]:
# Per-class metrics on the test set: average=None returns one score per label
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_test_tfidf_predict, average=None))

### How to interpret the results?

### Analyzing errors Bag of Words

In [None]:
# Side-by-side table of test tweet text, true label ('actual') and the
# bag-of-words model's prediction ('predicted'), on a fresh 0..n-1 index.
error_df1 = (
    pd.concat([X_test, y_test], axis=1)
    .reset_index(drop=True)
    .rename(columns={'sentiment': 'actual'})
    .assign(predicted=y_test_bow_predict)
)

In [None]:
error_df1.shape

In [None]:
# First 20 test tweets the bag-of-words model got wrong
error_df1[error_df1['actual'].ne(error_df1['predicted'])].head(20)

### Analyzing errors TF-IDF

In [None]:
# Side-by-side table of test tweet text, true label ('actual') and the
# stored TF-IDF predictions ('predicted'), on a fresh 0..n-1 index.
error_df2 = (
    pd.concat([X_test, y_test], axis=1)
    .reset_index(drop=True)
    .rename(columns={'sentiment': 'actual'})
    .assign(predicted=y_test_tfidf_predict)
)

In [None]:
error_df2.shape

In [None]:
# First 20 rows where the stored prediction differs from the true label
error_df2[error_df2['actual'].ne(error_df2['predicted'])].head(20)