In [1]:
import nltk  
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords  
from nltk import word_tokenize  
from nltk.data import load  
from nltk.stem import SnowballStemmer  
from string import punctuation  
from sklearn.feature_extraction.text import CountVectorizer       
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

# Spanish sentiment analysis
Classification of political sentiment is extremely difficult task. It is also highly questionable to use simplistic sentiment scales combined with classifying algorithms to represent real political stances. However our purpose here is to mainly just try what kind of results simple algorithms like scikit-learn's linear SVC could yield us with very limited amount of training data. 

### Training data
Models are only as good as the training data that they have been trained with.
As the spanish training data we are using the TASS dataset which is annual machine learning workshop for spanish natural language processing enthusiastics and researchers arranged by Spanish Society for Natural Language Processing [SEPLN]. This training data set is collected from 2011-2013 and it has global coverage of spanish-speaking countries. Messages also cover various different areas of life from Spain's interior politics to world politics and economics. 

* This is based on [Manuel Garrido's great blog-post.](http://blog.manugarri.com/sentiment-analysis-in-spanish/) 

Let's start by reading in all the labeled training data we have + the dataset that we want classify

In [3]:
train_data1 = pd.read_csv("general-tweets-train-tagged.csv")
train_data2 = pd.read_csv("general-tweets-train2-tagged.csv")
train_data3 = pd.read_csv("socialtv-tweets-train-tagged.csv")
train_data4 = pd.read_csv("stompol-tweets-train-tagged.csv")

In [None]:
# Set col width
pd.options.display.max_colwidth = 300

##### Merge training data to one set

In [4]:
train_data_set = pd.concat([train_data1,train_data2,train_data3,train_data4])

### Pre-Processing TASS training data

In [5]:
from sklearn.utils import shuffle
train_data_set = shuffle(train_data_set, random_state=224)

In [6]:
train_data_set = train_data_set.query('agreement != "DISAGREEMENT" and polarity != "NONE"')

In [7]:
train_data_set = train_data_set[train_data_set.polarity != 'NEU']

train_data_set['pol_polarity'] = [0 for i in range(len(train_data_set))]
train_data_set.pol_polarity[train_data_set.polarity.isin(['P', 'P+'])] = 1
train_data_set.pol_polarity.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1    0.543326
0    0.456674
Name: pol_polarity, dtype: float64

In [8]:
# train_data, test_data = train_test_split(train_data_set, test_size=0.2)
train_data = train_data_set

In [9]:
train_data.shape

(7986, 4)

### Pre-Processing the data from Catalonia
Our classifier is based on spanish training data, so we want to classify only the portion of our real-world data that is spanish and whose coordinates are within the geographical area of spain. For this purpose we will filter based on Spain's coordinates:

In [31]:
data = pd.read_json("8oct_pre_processed_stemmed.json", lines=True, orient='records')
data_es = data[data['lang'] == 'es']

In [32]:
max_lat = 43.780513
min_lat = 36.317416
max_lon = 3.139475
min_lon = -9.278242

In [33]:
df_real = data[(data['coordinates'].notnull()) & (data['lang'] == 'es')]

In [34]:
bool_array = []
for i in range(df_real.shape[0]):
    coord = df_real.iloc[i,0]['coordinates']
    if (coord[0] > min_lon) & (coord[0] < max_lon) & (coord[1] > min_lat) & (coord[1] < max_lat):
        bool_array.append(True)
    else:
        bool_array.append(False)

In [35]:
df_real = df_real[bool_array]

In [36]:
df_real.shape

(49, 29)

### Setting up a pipeline for text processing

In [25]:
text_stemmed = data['text']

In [14]:
#stopword list to use
spanish_stopwords = stopwords.words('spanish')

#spanish stemmer
stemmer = SnowballStemmer('spanish')

#punctuation to remove
non_words = list(punctuation)  

#we add spanish punctuation
non_words.extend(['¿', '¡'])  
non_words.extend(map(str,range(10)))

stemmer = SnowballStemmer('spanish')  

def stem_tokens(tokens, stemmer):  
    stemmed = []

    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):  
    
    # remove punctuation
    text = ''.join([c for c in text if c not in non_words])
    
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

vectorizer = CountVectorizer(  
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

## GridsearchCV
We used a parameter optimization tool that is integrated into the sklearn library. 

Here we set up the parameter-search with GridsearchCV. 

In [27]:
from sklearn.model_selection import GridSearchCV

In [21]:
pipeline = Pipeline([  
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])

# here we define the parameter space to iterate through
parameters = {  
#     'vect__max_df': (0.5, 1.9),
    'vect__max_df': (0.3, 0.6, 1.4),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (250, 500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__C': (0.2, 0.5, 0.7),
    'cls__loss': ('hinge', 'squared_hinge'),
    'cls__max_iter': (500, 1000)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1 , scoring='roc_auc')  
grid_search.fit(train_data.content, train_data.pol_polarity)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.3, 0.6, 1.4), 'vect__min_df': (10, 20, 50), 'vect__max_features': (250, 500, 1000), 'vect__ngram_range': ((1, 1), (1, 2)), 'cls__C': (0.2, 0.5, 0.7), 'cls__loss': ('hinge', 'squared_hinge'), 'cls__max_iter': (500, 1000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

## Best parameter-values
We had to search through a couple of different runs by gridsearch. This was because at first we only tried to use the labeled 'political' training data offered by TASS. However that provided poor results with the test set, accuracy of 0.64. After that we combined the training datasets and in the end managed to acquire a 0.82 accuracy.

In [22]:
grid_search.best_params_

{'cls__C': 0.2,
 'cls__loss': 'hinge',
 'cls__max_iter': 1000,
 'vect__max_df': 0.6,
 'vect__max_features': 1000,
 'vect__min_df': 10,
 'vect__ngram_range': (1, 1)}

In [24]:
# New patch with old parameters gives us 0.79
model = LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
              random_state=None,
              penalty='l2',
              tol=0.0001
)

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 50,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

train_data_features = vectorizer.fit_transform(train_data.content)
train_data_features_nd = train_data_features.toarray()

In [25]:
scores = cross_val_score(
    model,
    train_data_features_nd[0:len(train_data)],
    y=train_data.pol_polarity,
    scoring='roc_auc',
    cv=5
    )

scores.mean()

0.7932710446072424

In [16]:
# New patch with new gridsearch parameters gives us: 0.82
model = LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
              random_state=None,
              penalty='l2',
              tol=0.0001
)

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 10,
    max_df = 0.6,
    ngram_range=(1, 1),
    max_features=1000
)

train_data_features = vectorizer.fit_transform(train_data.content)
train_data_features_nd = train_data_features.toarray()

In [17]:
scores = cross_val_score(
    model,
    train_data_features_nd[0:len(train_data)],
    y=train_data.pol_polarity,
    scoring='roc_auc',
    cv=5
    )

scores.mean()

0.8257057505433586

In [26]:
saved_params = model.get_params()

# Classification of our data
Let's classify a sample

In [72]:
test_data2 = pd.DataFrame(columns=('content', 'polarity'))
test_data2['content'] = list(df_real['text'])

In [73]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 0.6,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             tol=0.0001
             )),
])

pipeline.fit(train_data.content, train_data.pol_polarity)

test_data2['polarity'] = pipeline.predict(test_data2.content)

In [114]:
test_data2[1:50]

Unnamed: 0,content,polarity,coordinates,tweet_id
1,📈 Top 10 de búsquedas en ⒼⓄⓄⒼⓁⒺ durante las últimas 24 horas:\n\n➊ Memes Puigdemont\n➋ Portugal\n\n2017/10/11 00:04 CEST,0,,917873384762908672
2,"📈 Con al menos 10000 búsquedas ""Memes Puigdemont"" es tendencia en Google España https://t.co/lwH0BP7JlW",1,,917872894452965376
3,🇪🇸CATALUÑ🅰️ Y LA ÚLTIMA H⭕️RA ESPAÑOL🅰️\n▪️Mariano Rajoy y Pedro Sánchez se reúnen en Moncloa… https://t.co/wEvH7IicP9,0,,917871440610938881
4,"🇪🇸ESPAÑ🅰️ — EL DEB🅰️TE‼️🇪🇸\n¿QUÉ ⭕️PINAS DE LA DECLARACIÓN DE INDEPENDENCIA EN C🅰️TALUÑ🅰️❓\n—Yo,… https://t.co/e595vYeYCx",0,,917895608362442752
5,"🇪🇸ESPAÑ🅰️ — EL DEB🅰️TE‼️🇪🇸\n¿QUÉ ⭕️PINAS DE LA DECLARACIÓN DE INDEPENDENCIA EN C🅰️TALUÑ🅰️❓\n—Yo,… https://t.co/lfz9XoQ4lb",0,,917895422470967296
6,"Lo siento , pero me lo pusieron a huevo :) #memes #humor #comedia #comedy #catalonia #catalunya… https://t.co/qOtJQ3uoD0",0,,917895099752869888
7,"🇪🇸ESPAÑ🅰️ — EL DEB🅰️TE‼️🇪🇸\n¿QUÉ ⭕️PINAS DE LA DECLARACIÓN DE INDEPENDENCIA EN C🅰️TALUÑ🅰️❓\n—Yo,… https://t.co/g2sdIJiRcc",0,,917895088176488448
8,"FANS DEL JJ! 🙌🙌 en Santa María De Palautordera, Cataluna, Spain https://t.co/kuOTVpNG20",0,,917869495695192065
9,@Santi_ABASCAL cuando nos convoca una manifestación contra el aliado de los golpistas ;RAJOY.,0,,917868862175096837
10,La independencia es muy independiente y mucho independiente,1,,917867505523281921


In [74]:
test_data2['polarity'].sum()

37

In [75]:
len(test_data2[test_data2['polarity'] == 1]) / test_data2.shape[0]

0.3274336283185841

In [76]:
len(test_data2[test_data2['polarity'] == 0]) / test_data2.shape[0]

0.672566371681416

The initial patch seems to work.
## Classification of all data

In [37]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 0.6,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             tol=0.0001
             )),
])

pipeline.fit(train_data.content, train_data.pol_polarity)

data_es['polarity'] = pipeline.predict(data_es.text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [38]:
data_es[data_es['polarity'] == 1]['polarity'].sum()

179318

In [39]:
data_es.to_json('8oct_es_sentiment.json', orient='records', lines=True)