In [1]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
import nltk
from nltk.corpus import stopwords
# Spanish stop-word set; it is later handed to TfidfVectorizer in the SVC pipeline cell.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
stop_words = set(stopwords.words('spanish'))
from nltk.stem import WordNetLemmatizer 
# NOTE(review): `lemmatizer` is not referenced by any other cell visible in this notebook.
lemmatizer = WordNetLemmatizer()
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
import seaborn as sns

In [2]:
# Read the labelled training split and the labelled development split.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('E-c-Es-train.csv', 'E-c-Es-dev.csv')
)

In [3]:
# Tally, for every label column, how many training tweets are tagged with it.
df_new = df.drop(columns=['ID', 'Tweet'])
categories = df_new.columns.tolist()
counts = [(emotion, df[emotion].sum()) for emotion in categories]
df_stats = pd.DataFrame(counts, columns=['Emotion', 'no. of tweets'])
df_stats

Unnamed: 0,Emotion,no. of tweets
0,anger,1155
1,anticipation,415
2,disgust,521
3,fear,373
4,joy,1085
5,love,260
6,optimism,377
7,pessimism,578
8,sadness,845
9,surprise,169


In [4]:
# Show full tweet text instead of truncating long cells.
# `None` is the documented "no limit" value; the legacy `-1` is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
pd.DataFrame(df['Tweet']).head()

Unnamed: 0,Tweet
0,"@aliciaenp Ajajjaa somos del clan twitteras perdidas pa eventos ""importantes"""
1,@AwadaNai la mala suerte del gato fichame la cara de help me pls
2,@audiomano A mí tampoco me agrado mucho eso. Especialmente por tratarse de él. No hay justificación.
3,"Para llevar a los bebes de un lugar a otro debemos cantarles canciones... Quiero cantarles Gunaa' nibiina (La llorona, en Zapoteco)"
4,"@DalasReview me encanta la terrible hipocresia y doble moral que tiene esta gente, claro, cuando ella te lo quita ILEGALMENTE no importa..."


In [5]:
import emoji

# Emoji-to-text conversion helper
def convert_emojis(text):
    """Return ``text`` after passing it through ``emoji.demojize``."""
    return emoji.demojize(text)

In [6]:
# Run the emoji conversion over the tweet column of both splits.
for frame in (df, df_test):
    frame['Tweet'] = frame['Tweet'].map(convert_emojis)

In [7]:
def clean_text(text):
    """Normalise a tweet into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace every non-word character with a space;
      3. drop any token that contains a digit;
      4. drop stand-alone single ASCII letters;
      5. collapse runs of whitespace and trim the ends.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned string, with tokens separated by exactly one space.
    """
    text = text.lower()
    # Raw strings keep the regex escapes from being read as (invalid)
    # Python string escapes.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)
    # Collapse whitespace last so the removals above cannot leave
    # doubled spaces behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [8]:
# Normalise the tweet column of both splits with clean_text.
for frame in (df, df_test):
    frame['Tweet'] = frame['Tweet'].map(clean_text)

In [9]:
# Peek at the first ten cleaned tweets.
df['Tweet'].iloc[:10]

0    aliciaenp ajajjaa somos del clan twitteras perdidas pa eventos importantes                                                          
1    awadanai la mala suerte del gato fichame la cara de help me pls                                                                     
2    audiomano  mí tampoco me agrado mucho eso especialmente por tratarse de él no hay justificación                                     
3    para llevar  los bebes de un lugar  otro debemos cantarles canciones quiero cantarles gunaa nibiina la llorona en zapoteco          
4    dalasreview me encanta la terrible hipocresia  doble moral que tiene esta gente claro cuando ella te lo quita ilegalmente no importa
5    en venezuela el tráfico aéreo esta tan peligroso como el terrestre ya cayó otra avioneta                                            
6    me voy  morir sin saber por qué   me dieron block  los espantaré cada noche hasta que se arrepientan                                
7    tctelevision terrible esto   

In [10]:
# Alias the preprocessed frames under the names the modelling cells use.
train, test = df, df_test

In [11]:
# Model inputs are the cleaned tweet texts of each split.
X_train, X_test = train['Tweet'], test['Tweet']
for split in (X_train, X_test):
    print(split.shape)

(3559,)
(679,)


In [12]:
# Emotion label columns, one binary classifier is trained per entry.
categories = [
    'anger',
    'anticipation',
    'disgust',
    'fear',
    'joy',
    'love',
    'optimism',
    'pessimism',
    'sadness',
    'surprise',
    'trust',
]

In [31]:
# Load the frame whose emotion columns will receive the predicted probabilities.
test_df_no_values = pd.read_csv(filepath_or_buffer='spanish.csv')
test_df_no_values.head(n=5)

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,No me pienso perder la pelea de McGregor contra Myweather,,,,,,,,,,,
1,Yo preocupada pensando que mi papá ya había llegado y me estaba llamando para decirme que el vuelo todavía no había salido 👍,,,,,,,,,,,
2,Pucha ya no me sirven todos los carros a Rondizzoni.... #ironía jajajjajaja #unañomas,,,,,,,,,,,
3,"Si estar contigo es un delito, hago cien años en prision 😃",,,,,,,,,,,
4,@macacifuentesC @sergmujica Perfect... Y de una mansa ni que terrible PLR,,,,,,,,,,,


In [32]:
# Train one calibrated linear SVM per emotion, report accuracy on the dev
# split, and store the positive-class probability in `test_df_no_values`.

# The probabilities are computed from X_test but written into
# `test_df_no_values`, so both must have the same number of rows.
# NOTE(review): this also assumes the rows are in the same order - confirm.
if len(X_test) != len(test_df_no_values):
    raise ValueError(
        'X_test has {} rows but test_df_no_values has {}; '
        'predictions cannot be aligned.'.format(len(X_test), len(test_df_no_values))
    )

SVC_pipeline = Pipeline([
    # scikit-learn validates `stop_words` as 'english', a list or None;
    # a set is rejected, so pass a (sorted, hence deterministic) list.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', CalibratedClassifierCV(OneVsRestClassifier(LinearSVC(), n_jobs=1))),
])

for category in categories:
    print('Emotion: {}'.format(category))
    # Refit the whole pipeline (vectorizer + classifier) on this emotion's labels.
    SVC_pipeline.fit(X_train, train[category])
    # Accuracy of the hard predictions on the labelled dev split.
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy: {}'.format(accuracy_score(test[category], prediction)))
    # Column 1 of predict_proba is kept as the probability for this emotion.
    y_proba = SVC_pipeline.predict_proba(X_test)[:, 1]
    test_df_no_values[category] = y_proba

Emotion: anger
Test accuracy: 0.7835051546391752
Emotion: anticipation
Test accuracy: 0.865979381443299
Emotion: disgust
Test accuracy: 0.8630338733431517
Emotion: fear
Test accuracy: 0.9381443298969072
Emotion: joy
Test accuracy: 0.8114874815905744
Emotion: love
Test accuracy: 0.9528718703976435
Emotion: optimism
Test accuracy: 0.9042709867452136
Emotion: pessimism
Test accuracy: 0.8350515463917526
Emotion: sadness
Test accuracy: 0.8483063328424153
Emotion: surprise
Test accuracy: 0.9513991163475699
Emotion: trust
Test accuracy: 0.9528718703976435


In [33]:
# Inspect the first rows now that the emotion columns hold probabilities.
test_df_no_values.head(n=5)

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,No me pienso perder la pelea de McGregor contra Myweather,0.19998,0.414678,0.145186,0.029909,0.40284,0.050938,0.092737,0.140734,0.072775,0.033665,0.025959
1,Yo preocupada pensando que mi papá ya había llegado y me estaba llamando para decirme que el vuelo todavía no había salido 👍,0.094834,0.266992,0.106314,0.529578,0.233677,0.010745,0.048165,0.197795,0.067138,0.035956,0.10872
2,Pucha ya no me sirven todos los carros a Rondizzoni.... #ironía jajajjajaja #unañomas,0.825208,0.057629,0.354276,0.011999,0.046281,0.019495,0.049967,0.759569,0.049031,0.044744,0.033558
3,"Si estar contigo es un delito, hago cien años en prision 😃",0.073446,0.109115,0.109282,0.008123,0.593154,0.093148,0.215158,0.109015,0.047394,0.11988,0.080803
4,@macacifuentesC @sergmujica Perfect... Y de una mansa ni que terrible PLR,0.567438,0.078598,0.456891,0.033054,0.229019,0.010924,0.031769,0.311343,0.3332,0.033312,0.016576


In [14]:
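# NOTE: as written, this disabled snippet would pickle a freshly constructed,
# unfitted LinearSVC rather than the trained SVC_pipeline; if it is re-enabled,
# dump the fitted pipeline instead.
#import pickle
#model = LinearSVC()
#filename = 'spanish_model.sav'
#pickle.dump(model, open(filename, 'wb'))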