In [1]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('french'))
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
import seaborn as sns

In [2]:
# Load the French emotion lexicon from the working directory.
df = pd.read_csv(filepath_or_buffer='french_lexicon.csv')

In [3]:
# Peek at the first five rows to confirm the column layout.
df.head(n=5)

Unnamed: 0,id,word,polarity,joy,fear,sadness,anger,surprise,disgust
0,1,à ce endroit là,positive,0,0,0,0,0,0
1,2,à le hâte,negative,0,1,0,0,1,0
2,3,à part,negative,0,0,1,0,0,0
3,4,à pic,negative,0,1,0,0,0,0
4,5,à rallonge,negative,0,0,1,0,0,0


In [4]:
# Drop the coarse polarity label; the cells below only use the per-emotion columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError for the missing column.
df = df.drop(columns=['polarity'], errors='ignore')

In [5]:
# Tally, for every emotion column, how many lexicon entries carry that label.
df_new = df.drop(columns=['id', 'word'])
categories = list(df_new.columns)
counts = [(emotion, df[emotion].sum()) for emotion in categories]
df_stats = pd.DataFrame(counts, columns=['Emotion', 'no. sentences'])
df_stats

Unnamed: 0,Emotion,no. sentences
0,joy,521
1,fear,3199
2,sadness,2512
3,anger,2103
4,surprise,1182
5,disgust,2013


In [6]:
# Show cell contents without truncation. None is the supported "no limit" value;
# the old -1 sentinel has been deprecated since pandas 1.0 and is no longer
# accepted by newer releases.
pd.set_option('display.max_colwidth', None)
# Display only the text column, as a one-column frame.
df[['word']].head()

Unnamed: 0,word
0,à ce endroit là
1,à le hâte
2,à part
3,à pic
4,à rallonge


In [7]:
def clean_text(text):
    """Normalise a lexicon entry or sentence before vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. replace every non-word character (punctuation, apostrophes, ...) with a space;
      3. drop any token that contains a digit;
      4. remove every "à" character;
      5. remove isolated single ASCII letters (e.g. the "j" left over from "j'aime");
      6. collapse runs of whitespace and trim both ends.

    Whitespace is normalised last so that the removals in steps 2-5 cannot leave
    leading, trailing or doubled spaces behind.

    NOTE(review): step 4 removes "à" everywhere, including inside words
    ("là" becomes "l", which step 5 then drops). Confirm whether only the
    standalone preposition was meant to be removed.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. None or NaN from a missing CSV cell)
        are treated as empty.

    Returns
    -------
    str
        The cleaned text; tokens are separated by single spaces.
    """
    if not isinstance(text, str):
        # Missing cells arrive as None/NaN; treat them as empty text.
        return ""
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)
    text = re.sub(r"\à", "", text)
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Normalise whitespace only after all removals so no stray gaps remain.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [8]:
# Run every lexicon entry through the cleaning routine.
df['word'] = df['word'].map(clean_text)

In [9]:
# Inspect the first ten cleaned entries.
df['word'].head(10)

0    ce endroit 
1    le hâte    
2    part       
3    pic        
4    rallonge   
5    abasourdir 
6    ablation   
7    abominable 
8    abrupt     
9    absent     
Name: word, dtype: object

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    df,
    shuffle=True,
    random_state=1,
    test_size=0.20,
)

In [11]:
# The cleaned text column is the only model input.
X_train, X_test = train['word'], test['word']
print(X_train.shape, X_test.shape, sep='\n')

(11301,)
(2826,)


In [12]:
# Target emotion columns; one binary classifier is trained for each.
categories = [
    'joy',
    'fear',
    'sadness',
    'anger',
    'surprise',
    'disgust',
]

In [13]:
# Define a pipeline combining a text feature extractor with a classifier.
# The multi-label task is handled as independent binary problems: the same
# pipeline object is refit from scratch for each emotion column.
SVC_pipeline = Pipeline([
               # scikit-learn >= 1.2 validates `stop_words` and accepts a list but
               # not a set, so convert the NLTK stop-word set (sorted for a stable order).
               ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words), ngram_range = (1, 2))),
               # CalibratedClassifierCV wraps the SVM so the pipeline exposes
               # predict_proba; a fixed random_state makes the LinearSVC fit repeatable.
               ('clf', CalibratedClassifierCV(OneVsRestClassifier(LinearSVC(random_state=1), n_jobs=1))),
           ])

for category in categories:
    print('Emotion: {}'.format(category))
    # Fit on the training text against this emotion's 0/1 labels.
    SVC_pipeline.fit(X_train, train[category])
    # Compute accuracy on the held-out split.
    # NOTE: the labels are heavily imbalanced (e.g. joy is positive for 521 of the
    # ~14k lexicon rows), so an always-negative classifier would already score
    # about 0.96 on joy; read these accuracies against that baseline.
    prediction = SVC_pipeline.predict(X_test)
    print(' {}'.format(accuracy_score(test[category], prediction)))

Emotion: joy
 0.9670912951167728
Emotion: fear
 0.7745930644019816
Emotion: sadness
 0.8152866242038217
Emotion: anger
 0.8627034677990092
Emotion: surprise
 0.9150743099787686
Emotion: disgust
 0.8563340410474168


In [14]:
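# NOTE: left disabled. As written, this would pickle a new, unfitted LinearSVC()
# rather than the trained SVC_pipeline.
#import pickle
#model = LinearSVC()
#filename = 'french_model.sav'
#pickle.dump(model, open(filename, 'wb'))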

In [15]:
# Load the hand-written evaluation sentences and clean them exactly like the training text.
df_unseen = pd.read_csv('French_unseen.csv', encoding='latin-1')
df_unseen['word'] = df_unseen['word'].map(clean_text)
df_unseen.head()

Unnamed: 0,English,word,joy,fear,sadness,anger,surprise,disgust
0,I am very happy,je suis très heureux,,,,,,
1,I'm really upset with you,je suis vraiment énervé contre toi,,,,,,
2,I'm so frustrated with how it turned out,je suis tellement frustré de la façon dont cela est avéré,,,,,,
3,"I love my daughter so much, she is the light of my life",aime tellement ma fille elle est la lumière de ma vie,,,,,,
4,"I can't believe my car broke down, it's the worst!",je ne peux pas croire que ma voiture est tombée en panne est la pire,,,,,,


In [16]:
# Model input for the unseen sentences: the cleaned French text column.
X_unseen = df_unseen['word']

In [17]:
# Refit the shared pipeline for each emotion, then score the unseen sentences.
for emotion in categories:
    SVC_pipeline.fit(X_train, train[emotion])
    # Keep column 1 of predict_proba: the calibrated probability of the second
    # class (label 1, given the 0/1 emotion flags) for each sentence.
    df_unseen[emotion] = SVC_pipeline.predict_proba(X_unseen)[:, 1]

In [18]:
# Show the predicted emotion probabilities for the first ten unseen sentences.
df_unseen.head(10)

Unnamed: 0,English,word,joy,fear,sadness,anger,surprise,disgust
0,I am very happy,je suis très heureux,0.060627,0.133283,0.126832,0.110711,0.088339,0.078149
1,I'm really upset with you,je suis vraiment énervé contre toi,0.032433,0.444769,0.228042,0.551586,0.062921,0.217069
2,I'm so frustrated with how it turned out,je suis tellement frustré de la façon dont cela est avéré,0.031746,0.11729,0.096338,0.094265,0.059296,0.093165
3,"I love my daughter so much, she is the light of my life",aime tellement ma fille elle est la lumière de ma vie,0.206529,0.175877,0.110508,0.163776,0.043002,0.12862
4,"I can't believe my car broke down, it's the worst!",je ne peux pas croire que ma voiture est tombée en panne est la pire,0.03002,0.497187,0.519275,0.078834,0.052405,0.2099
5,I hate croissants so much!,je déteste tellement les croissants,0.036292,0.22041,0.168335,0.146812,0.07958,0.141054
6,"The flowers in the garden are so beautiful, every time I see them I smile",les fleurs du jardin sont si belles chaque fois que je les vois je souris,0.081963,0.255728,0.079869,0.077828,0.051434,0.0788
7,I am so worried about my interview tomorrow!,je suis tellement inquiet pour mon entretien de demain,0.032059,0.140489,0.112195,0.078139,0.062863,0.105776
8,"I think sea urchins are so disgusting, yuck!",je pense que les oursins sont si dégoûtants beurk,0.033087,0.147342,0.117637,0.110972,0.065176,0.108782
9,I look forward to the day when Donald Trump is charged!,attends avec impatience le jour où donald trump sera mis en accusation,0.029196,0.137689,0.074788,0.623556,0.066499,0.309654


In [19]:
## As expected, generalizability is poor given the limited nature of the dataset: the model is trained on a lexicon of individual words and short phrases, not on full sentences.