## Détermination des sentiments
 Inspiré et adapté de l'exemple suivant : http://dimension.usherbrooke.ca/dimension/exempleTMSentimentsFB.html

In [2]:
#Importation des packages pour le traitement des données
library(quanteda)
library(preText)
library(RCurl)
library(jsonlite)
library(tm)

#Fonction pour enlever certaines parties de mots spécifiques au français
enleve_mots=function(tk){
  tk=gsub("l'", "",tk)
  tk=gsub("d'", "",tk)
  tk=gsub("qu'", "", tk)
  tk=gsub("j'", "", tk)
  return(tk)
}


Package version: 1.3.0
Parallel computing: 2 of 4 threads used.
See https://quanteda.io for tutorials and examples.

Attaching package: ‘quanteda’

The following object is masked from ‘jupyter:irkernel’:

    View

The following object is masked from ‘package:utils’:

    View

preText: Diagnostics to Assess the Effects of Text Preprocessing Decisions
Version 0.6.3 created on 2018-01-12.
copyright (c) 2017, Matthew J. Denny, Penn State University
                    Arthur Spirling, NYU
Type vignette('getting_started_with_preText') to get started.
Development website: https://github.com/matthewjdenny/preText
Loading required package: bitops
Loading required package: NLP

Attaching package: ‘tm’

The following objects are masked from ‘package:quanteda’:

    as.DocumentTermMatrix, stopwords



In [3]:
#Chargement des commentaires
fichier="/home/user/Documents/UASB03/comment_fr.json"
Comment=fromJSON(fichier)
sapply(Comment,class)
nrow(Comment)


In [4]:
# Suppression des lignes où la probabilté que ce soit du français est inférieure à 80%
Comment <- Comment[Comment$lg_proba >=0.8,]
nrow(Comment)

In [5]:
# Chargement du  dictionnaire des sentiments
dictfile <- tempfile()
dic_ton <- dictionary(file ="AirBnBSentiment.cat", sep = "/")
# Chargement des stopwords
new_stopwords <- read.csv("AirBnBStopwords-fr.txt", header = FALSE)
list_stopwords <- as.character(new_stopwords$V1)
stopwords <- c(list_stopwords)

In [6]:
#Création du corpus 
monCorpus <- corpus((char_tolower(as.character(Comment$comments))))
#Détour inélégant pour éliminer certaines particularités propre à la langue française  du type: l' d' (liste incomplète)
#création d'une matrice provisoire
mm=matrix(nrow=dim(Comment)[1], ncol=1)
for(j in 1:dim(Comment)[1]){
  mm[j,1]=enleve_mots(monCorpus[j])
}
monCorpus=corpus(mm)

#Nettoyage
monCorpus=tokens(monCorpus, remove_numbers = TRUE, remove_punct = TRUE,remove_separators = TRUE,remove_hyphens = TRUE)
monCorpus=tokens_remove(monCorpus, c(stopwords))

#Application du dictionnaire des sentiments: obtient l'occurence des termes positifs et des termes négatifs par commentaire
analyseSentiments <- dfm(monCorpus, dictionary = dic_ton)


In [7]:
textstat_frequency(analyseSentiments)

feature,frequency,rank,docfreq,group
POSITIVE,965353,1,194798,all
NEGATIVE,83240,2,50981,all


In [8]:
# Transformation en dataframe avec ajout du nombre de mots "conservé"
BaseTemp=data.frame(analyseSentiments,nb=ntoken(monCorpus))
# Constitution du tag POSITIF/NEGATIF/NEUTRE en fonction des autres indicateurs


BaseTemp$Negatif=(BaseTemp$NEGATIVE>BaseTemp$POSITIVE |(BaseTemp$NEGATIVE/BaseTemp$POSITIVE>0.5 & BaseTemp$NEGATIVE >2 &BaseTemp$NEGATIVE<BaseTemp$POSITIVE))
BaseTemp$Neutre = (BaseTemp$POSITIVE==BaseTemp$NEGATIVE)
BaseTemp$Positif= !(BaseTemp$Negatif|BaseTemp$Neutre)
BaseTemp$Flag_positif <- ifelse(BaseTemp$Negatif==TRUE,0,ifelse(BaseTemp$Positif==TRUE,1,2))

“'as.data.frame.dfm' is deprecated.
Use 'convert(x, to "data.frame")' instead.
See help("Deprecated")”

In [9]:
BaseTemp

Unnamed: 0,document,NEGATIVE,POSITIVE,nb,Negatif,Neutre,Positif,Flag_positif
text1,text1,0,4,17,FALSE,FALSE,TRUE,1
text2,text2,1,6,21,FALSE,FALSE,TRUE,1
text3,text3,0,5,17,FALSE,FALSE,TRUE,1
text4,text4,3,13,89,FALSE,FALSE,TRUE,1
text5,text5,1,8,38,FALSE,FALSE,TRUE,1
text6,text6,0,5,23,FALSE,FALSE,TRUE,1
text7,text7,0,0,1,FALSE,TRUE,FALSE,2
text8,text8,0,4,11,FALSE,FALSE,TRUE,1
text9,text9,1,1,18,FALSE,TRUE,FALSE,2
text10,text10,0,4,22,FALSE,FALSE,TRUE,1


In [10]:
# Constitution du dataframe final en mergeant les données initiales e tles calculs
result=data.frame(Comment,BaseTemp)
result$document <- NULL

In [11]:
summary(result)

      X_id               date            reviewer_id        reviewer_name     
 Min.   :    31876   Length:197372      Min.   :      251   Length:197372     
 1st Qu.: 48693856   Class :character   1st Qu.: 13638765   Class :character  
 Median : 77819584   Mode  :character   Median : 29655692   Mode  :character  
 Mean   : 79190377                      Mean   : 37698377                     
 3rd Qu.:115283679                      3rd Qu.: 53735180                     
 Max.   :141885349                      Max.   :123929885                     
                                                                              
   comments            langue             lg_proba      review_scores_accuracy
 Length:197372      Length:197372      Min.   :0.8000   Min.   : 2.000        
 Class :character   Class :character   1st Qu.:0.9690   1st Qu.: 9.000        
 Mode  :character   Mode  :character   Median :0.9825   Median :10.000        
                                       Mean   :0.971

In [12]:
# Export des lignes globalement négatives pour analyse
write.csv2(result[result$Negatif,], file="comment_fr_sentiment_NEG.csv", quote=TRUE,  fileEncoding= "utf-8" )

In [13]:
# Export des données complètes
write.csv2(result, file="comment_fr_sentiment.csv", quote=TRUE, fileEncoding= "utf-8" )


In [14]:
# Export des données complètes
result_modele = result[result$Flag_positif!=2,c("X_id","comments","Flag_positif")]
names(result_modele)[names(result_modele) == 'X_id'] <- 'id'
write.table(result_modele, file="comment_fr_modele.csv", quote=TRUE, sep="#", fileEncoding= "utf-8" ,
            row.names = FALSE,col.names=FALSE)


In [20]:
summary(result_modele)

       id              comments          Flag_positif   
 Min.   :    31876   Length:192744      Min.   :0.0000  
 1st Qu.: 48374538   Class :character   1st Qu.:1.0000  
 Median : 77440254   Mode  :character   Median :1.0000  
 Mean   : 78922441                      Mean   :0.9794  
 3rd Qu.:115015478                      3rd Qu.:1.0000  
 Max.   :141885349                      Max.   :1.0000  