In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Column labels shared by both splits. Because `names` is passed without
# `header`, pandas treats the first line of each file as data, not a header.
COLUMN_NAMES = ['number', 'source', 'sentiment', 'text']

train = pd.read_csv(
    "/twitter-entity-sentiment-analysis/twitter_training.csv",
    names=COLUMN_NAMES,
)
val = pd.read_csv(
    "/twitter-entity-sentiment-analysis/twitter_validation.csv",
    names=COLUMN_NAMES,
)


In [None]:
def classify(s):
    """Map a sentiment label to its integer class id.

    "Negative" -> 0, "Positive" -> 1, "Neutral" -> 2. Any other value
    (a different label, differently-cased text, or a missing value)
    falls through to 3.
    """
    if s == "Positive":
        return 1
    if s == "Negative":
        return 0
    if s == "Neutral":
        return 2
    return 3

# Encode the string sentiment labels of both splits as integer class ids.
label_column = "sentiment"

y_train = train[label_column].apply(classify)
y_val = val[label_column].apply(classify)

In [None]:
# TF-IDF over the raw tweet text. The vocabulary is capped at 900 terms and
# sublinear_tf replaces raw term counts with 1 + log(tf).
tfidf = TfidfVectorizer(max_features=900, sublinear_tf = True)

# Missing tweets are replaced with an empty string before vectorizing.
# The previous `.values.astype('U')` cast turned NaN into the literal text
# "nan", which the vectorizer then tokenised as a real word, and it also
# built a fixed-width unicode array padded to the longest tweet.
# The vocabulary is fitted on the training split only and reused for validation.
X_train = tfidf.fit_transform(train["text"].fillna("").astype(str))
X_val = tfidf.transform(val["text"].fillna("").astype(str))

I found that removing stopwords, punctuation, hashtags, and URLs had a negative overall effect on model performance, so I have omitted this step. Instead, I rely on truncated SVD (T-SVD) to reduce the dimensionality of the sparse matrix created by the TF-IDF step.

In [None]:
## apply dimensionality reduction - TSVD is useful for applying to sparse matrices

# TruncatedSVD uses a randomized solver by default, so without a fixed
# random_state the components (and every downstream metric) change on each
# Restart & Run All. Pinning the seed makes the notebook reproducible.
tsvd = TruncatedSVD(n_components = 280, random_state = 42)

# Fit the projection on the training features only, then apply the same
# projection to the validation features.
X_train_svd = tsvd.fit_transform(X_train)
X_val_svd = tsvd.transform(X_val)

In [None]:
# Gradient-boosted tree classifier with the library's default
# hyperparameters, trained on the SVD-reduced training features.
model = XGBClassifier()

model.fit(X=X_train_svd, y=y_train)

In [None]:
# Predict class ids for the validation split from its SVD-reduced features.
y_pred = model.predict(X_val_svd)

In [None]:
# Confusion matrix normalised over the true labels, so each row sums to 1
# and the diagonal holds per-class recall. Rows and columns follow the sorted
# class ids produced by `classify`: 0 = Negative, 1 = Positive, 2 = Neutral,
# 3 = everything else.
confusion_matrix(y_true=y_val, y_pred=y_pred, normalize="true")

array([[0.95488722, 0.01879699, 0.02255639, 0.0037594 ],
       [0.05054152, 0.87725632, 0.05054152, 0.02166065],
       [0.07719298, 0.02807018, 0.88421053, 0.01052632],
       [0.06395349, 0.06395349, 0.04069767, 0.83139535]])

In [None]:
# Fraction of validation tweets whose predicted class id equals the true one.
accuracy_score(y_true=y_val, y_pred=y_pred)

0.892