# **Coronavirus tweets-Text Classification**

#### Autore: Martina Cavallucci
#### Email: nome.cognome@studio.unibo.it
#### Release: Gennaio, 2020

*Questo script R esegue un'analisi e una classificazione di testi di tweet durante il periodo di Marzo 2020 e Aprile 2020.
Tali tweet si riferiscono ad un topic specifico: Covid-19.
L'obiettivo è quindi comprendere le relazioni tra i termini utilizzati, e la classificazione dei tweet rispetto al sentiment (Positive, Negative, Neutral).*



---

#### Import delle librerie R e di Text Mining
##### This operation takes a few minutes
---




In [None]:
install.packages("lsa")
install.packages("tm")
install.packages("RWeka")
install.packages("data.table") 
install.packages("checkmate")
install.packages("stringr") 
install.packages("caret") 
install.packages("stm") 
install.packages("stminsights") 
install.packages("quanteda") 
install.packages("quanteda.textmodels")
install.packages('e1071', dependencies=TRUE)
library(data.table)
library(tidyverse)
library(tm)
library(lsa)
library(RWeka)
library(tidyverse)
if(!require("R.utils"))
  install.packages("R.utils")
library("R.utils")
library(checkmate)
library(stringr)
library(caret)
library(stm)              # For structural topic models
library(stminsights)      # For visual exploration of STM
library(quanteda)
library(quanteda.textmodels)

---

#### Import del text set su Github

---


In [3]:
# Fetch the gzipped test and train CSVs from GitHub and unpack them into the
# working directory as test.csv and train.csv.
# mode = "wb" forces a binary transfer: the URLs end in "?raw=true" rather
# than ".gz", so the file extension cannot be used to pick binary mode and a
# text-mode transfer can corrupt the archive on Windows.
# overwrite = TRUE lets this cell be re-run when the decompressed CSV is
# already present from an earlier run.
download.file(
  "https://github.com/CavallucciMartina/Coronavirus-tweets-Text-Classification/blob/main/input/Corona_NLP_test.csv.gz?raw=true",
  destfile = "test.csv.gz",
  mode = "wb"
)
gunzip("test.csv.gz", overwrite = TRUE)

download.file(
  "https://github.com/CavallucciMartina/Coronavirus-tweets-Text-Classification/blob/main/input/Corona_NLP_train.csv.gz?raw=true",
  destfile = "train.csv.gz",
  mode = "wb"
)
gunzip("train.csv.gz", overwrite = TRUE)


---

#### Prima visualizzazione del train set

---


In [None]:
# Load the two decompressed CSV files into data frames and preview the first
# rows of the training data.
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")
head(x = train)

---

#### Dimensioni del train set e del test set

---


In [None]:
# Show the number of rows and columns of the train set, then of the test set.
dim(train)
dim(test)


---

#### Preparazione analisi risultati di Sentiment in Train

---

In [None]:
# Tally how many training tweets carry each Sentiment value, then display
# only the second column of the tally (the counts).
library(tidyverse)
districtSentiment <- train %>% count(Sentiment)
districtSentiment[, 2]

---

#### Grafico delle percentuali di sentiment nel train set

---

In [None]:
# Pie chart of the sentiment class distribution in the training set.
# The slice labels are read from the same count table as the frequencies, so
# every slice carries the name of the class it actually represents; a
# hand-written label vector can silently disagree with the row order of
# districtSentiment.
sentiment <- districtSentiment[[2]]
lbls <- as.character(districtSentiment[[1]])
pct <- round(sentiment / sum(sentiment) * 100)
lbls <- paste0(lbls, " ", pct, "%") # "<class> <percentage>%"
pie(
  sentiment,
  labels = lbls,
  col = rainbow(length(lbls)),
  main = "Pie Chart of sentiment"
)

In [None]:
# Keep a separate copy of the original five-class Sentiment column.
y <- copy(train$Sentiment)

---

#### Preparazione train per classificazione; trasformazione da 5 classi a 3: Positive, Negative, Neutral

---

In [138]:
# Reduce both data sets to the two columns used for classification: the tweet
# text and its sentiment label. Strings are kept as character, not factor.
new_train <- data.frame(
  text = train$OriginalTweet,
  labels = train$Sentiment,
  stringsAsFactors = FALSE
)

new_test <- data.frame(
  text = test$OriginalTweet,
  labels = test$Sentiment,
  stringsAsFactors = FALSE
)


In [139]:
# Data has 5 classes, let's convert them to 3

#' Collapse a five-level sentiment label into a three-class code.
#'
#' @param x A single sentiment label (character scalar).
#' @return "2" for "Extremely Positive" or "Positive", "0" for
#'   "Extremely Negative" or "Negative", and "1" for any other value.
classes_def <- function(x) {
  if (x == "Extremely Positive" || x == "Positive") {
    return("2")
  }
  if (x == "Extremely Negative" || x == "Negative") {
    return("0")
  }
  "1"
}


# Recode the five sentiment labels of both sets into the three class codes.
# vapply() returns a plain character vector (one string per tweet); lapply()
# would store a list column, which then has to be unlist()-ed everywhere the
# labels are used. USE.NAMES = FALSE stops the input strings from being
# attached as element names.
new_train$labels <- vapply(
  new_train$labels, classes_def, character(1), USE.NAMES = FALSE
)
new_test$labels <- vapply(
  new_test$labels, classes_def, character(1), USE.NAMES = FALSE
)
# To inspect the resulting class proportions:
# prop.table(table(new_train$labels))

---

#### Creazione del corpus

---

In [None]:
# Build the training corpus from the text/labels data frame.
mycorpus <- corpus(new_train)

# Assigns a unique identifier to each text.
# seq_len() yields an empty sequence for an empty corpus, whereas
# 1:ndoc(mycorpus) would produce c(1, 0) and make the assignment fail.
docvars(mycorpus, "Textno") <-
  sprintf("%02d", seq_len(ndoc(mycorpus)))

mycorpus

In [None]:
# Save the corpus summary statistics in "mycorpus.stats"
mycorpus.stats <- summary(mycorpus)

# And print the statistics of the first 10 observations
head(mycorpus.stats, n = 10)

In [None]:
# Keyword-in-context listing for "covid" in the training corpus, with a
# context window of 4.
kwic(mycorpus, "covid", window=4)

In [None]:
# Keyword-in-context listing for "pandemic" in the training corpus, with a
# context window of 6.
kwic(mycorpus, "pandemic", window=6)

---

#### Text-preprocessing

---

In [None]:
# Preprocess the text

# Create tokens from the training corpus, dropping numbers, punctuation,
# symbols and URLs at tokenisation time and keeping the document variables.
# `split_hyphens` is the current name of the former `remove_hyphens` option.
# The obsolete `remove_twitter` flag is dropped: hashtags and @-mentions are
# removed by the regex patterns in the token-cleaning step that follows.
token <- tokens(
  mycorpus,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE,
  split_hyphens = TRUE,
  include_docvars = TRUE
)

In [None]:
# Drop noise tokens: anything matching one of the regular expressions below
# (digits or hyphens, URLs, HTML-like tags, hashtags, mentions, whitespace).
token_ungd <- tokens_remove(
  token,
  pattern = c(
    "[\\d-]",
    "(http|https)://([^\\s]+)",
    "<.*?>",
    "#\\w+",
    "@\\w+",
    "\\d+",
    "\\s+"
  ),
  valuetype = "regex",
  verbose = TRUE
)

# Then drop English stop words and show the resulting tokens.
toks_nostop <- tokens_remove(token_ungd, pattern = stopwords("en"))
print(toks_nostop)


---

#### Creazione matrice documenti-termini

---

In [None]:
# Build the document-feature matrix for the training set.
# The matrix is built from `toks_nostop`, the tokens that already had English
# stop words removed in the cleaning step, instead of relying on the
# deprecated `remove` (and `stem`) arguments of dfm(). Features are
# lower-cased.
doc_term_matrix <- dfm(toks_nostop, tolower = TRUE)

# Re-weight the raw counts with tf-idf and display the result.
doc_term_matrix <- dfm_tfidf(doc_term_matrix)
doc_term_matrix

In [None]:
# NOTE(review): no trimming is applied here; the tf-idf matrix is bound
# unchanged to a second name, which the rest of the notebook uses.
doc_term_matrix.trim <-doc_term_matrix
doc_term_matrix.trim

In [None]:
# Sort the DFM in decreasing order along both margins, then print the first
# 10 observations and first 10 features.
head(dfm_sort(doc_term_matrix.trim, decreasing = TRUE, margin = "both"),
     n = 10,
     nf = 10) 

---

#### Generazione wordcloud

---

In [None]:
# Fix the random seed (presumably so the word-cloud layout is reproducible).
set.seed(100)
# Word cloud of the training DFM, restricted by min_count = 300 and coloured
# with the 8-colour "Dark2" ColorBrewer palette.
textplot_wordcloud(doc_term_matrix.trim, min_count = 300, random_order = FALSE,
                   rotation = .25, 
                   color = RColorBrewer::brewer.pal(8,"Dark2"))

In [None]:
# Collect the feature names (the vocabulary) of the training DFM and show them.
dict <- featnames(doc_term_matrix)
dict

---

## **Classificazione**

---

Supervised machine learning - Naive Bayes (NB) 

In [None]:
# Cleaning test set: same tokenisation and filtering steps as the train set.
# Note that `token`, `token_ungd` and `toks_nostop` are reassigned here and
# from this point on hold the test-set tokens.
test_corpus <- corpus(new_test)

# `split_hyphens` is the current name of the former `remove_hyphens` option;
# the obsolete `remove_twitter` flag is dropped because hashtags and
# @-mentions are removed by the regex patterns below.
token <- tokens(
  test_corpus,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE,
  split_hyphens = TRUE,
  include_docvars = TRUE
)

# Clean tokens
token_ungd <- tokens_select(
  token,
  c("[\\d-]", "(http|https)://([^\\s]+)", "<.*?>","#\\w+","@\\w+","\\d+","\\s+"),
  selection = "remove",
  valuetype = "regex",
  verbose = TRUE
)
toks_nostop <- tokens_select(
  token_ungd,
  pattern = stopwords("en"),
  selection = "remove"
)

# Build the test DFM from the stop-word-filtered tokens rather than through
# the deprecated `remove` / `stem` arguments of dfm().
dfmat_test <- dfm(toks_nostop, tolower = TRUE)

# NOTE(review): tf-idf weights are computed on the test documents themselves,
# not with the training set's document frequencies - confirm this is intended.
dfmat_test <- dfm_tfidf(dfmat_test)
dfmat_test

In [None]:
# Fit the naive Bayes text model.
# x is the training DFM; y is the vector of class codes stored in its
# "labels" document variable.
# No `prior` argument is passed, so textmodel_nb() falls back to its default
# prior distribution over the classes.
tmod_nb <- textmodel_nb(
  x = doc_term_matrix.trim,
  y = unlist(docvars(doc_term_matrix.trim, field = "labels"))
)
summary(tmod_nb)


In [None]:
# Show the summary of the fitted naive Bayes model (the training cell above
# already displays the same summary).
summary(tmod_nb)

In [None]:
# Give the test DFM exactly the feature set of the training DFM, so it can be
# passed to the models trained on doc_term_matrix.trim.
dfmat_matched <- dfm_match(dfmat_test, features = featnames(doc_term_matrix.trim))

In [149]:
# Predict the class of every test tweet with the naive Bayes model and
# cross-tabulate the predictions against the true labels.
actual_class <- unlist(dfmat_matched$labels)
predicted_class <- predict(tmod_nb, newdata = dfmat_matched, force = TRUE)
# caret's table method reads rows as predictions and columns as the true
# classes; with the dimensions the other way round, precision and recall
# (and sensitivity / pos. pred. value) are reported swapped.
tab_class <- table(predicted_class, actual_class)

In [None]:
# Confusion-matrix report for the naive Bayes predictions, with
# mode = "everything" to request the full set of metrics.
# NOTE(review): caret reads table rows as predictions and columns as the
# reference classes - check that tab_class is laid out that way.
confusionMatrix(tab_class, mode = "everything")

Linear SVM classifier

In [None]:
# Fit the linear SVM text model on the training DFM, using the class codes
# stored in its "labels" document variable as the response.
tmod <- textmodel_svm(
  x = doc_term_matrix.trim,
  y = unlist(quanteda::docvars(doc_term_matrix.trim, field = "labels"))
)

In [None]:
# Align the test DFM with the training features, predict with the SVM model
# and cross-tabulate the predictions against the true labels.
dfmat_matched <- dfm_match(dfmat_test, features = featnames(doc_term_matrix.trim))
actual_class <- unlist(dfmat_matched$labels)
predicted_class <- predict(tmod, newdata = dfmat_matched, force = TRUE)
# caret's table method reads rows as predictions and columns as the true
# classes; with the dimensions the other way round, precision and recall
# (and sensitivity / pos. pred. value) are reported swapped.
tab_class2 <- table(predicted_class, actual_class)
tab_class2

In [None]:
# Confusion-matrix report for the SVM predictions, with mode = "everything"
# to request the full set of metrics.
# NOTE(review): caret reads table rows as predictions and columns as the
# reference classes - check that tab_class2 is laid out that way.
confusionMatrix(tab_class2, mode = "everything")

Random forest Classifier

TODO 
- Classificatore Random Forest
- Classificatore K-NN
- Classificatore Rocchio
- Classificatore Multinomial Bayesian (sempre con nb)
- LSTM bidirectional


Provare anche classificazione con bigram