# Importing all the required modules

In [1]:
library(readr)
library(dplyr)
library(stringr)
library(ggplot2)
library(tidyr)
library(tm)
library(textstem) 
library(tidytext)
library(wordcloud2)
library(pROC)
library(ROCR)
library(randomForest)  
library(naivebayes)
library(caret)
library(janeaustenr)
library(igraph)
library(ggraph)


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Loading required package: NLP


Attaching package: 'NLP'


The following object is masked from 'package:ggplot2':

    annotate


Loading required package: koRpus.lang.en

Loading required package: koRpus

Loading required package: sylly

For information on available language packages for 'koRpus', run

  available.koRpus.lang()

and see ?install.koRpus.lang()



Attaching package: 'koRpus'


The following object is masked from 'package:tm':

    readTagged


The following object is masked from 'package:readr':

    tokenize


Type 'citation("pROC")' for a citation.


Attaching package: 'pROC'


The following objects are masked from 'package:stats':

    cov, smooth, var


randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: 'randomForest'


The f

# Importing required datasets

In [None]:
# Read the two raw CSV exports from the working directory:
# one table of fake articles and one table of true articles.
fake <- readr::read_csv(file = "Fake.csv")
true <- readr::read_csv(file = "True.csv")

In [None]:
# Show the first record of each table.
head(fake, n = 1)
head(true, n = 1)

# Data Description and Data Cleaning

### The data required for this project is stored in two CSV files, True.csv and Fake.csv. True.csv contains genuine news articles, while Fake.csv contains fake news articles.

### Number of columns

In [None]:
# dim() returns c(rows, columns), so the labels describe both values
# instead of claiming that only the column count is printed.
print("The dimensions (rows, columns) of true.csv")
dim(true)
print("The dimensions (rows, columns) of fake.csv")
dim(fake)

In [None]:
# Compare the number of rows in the two tables.
# names.arg labels each bar; without it the "Category" axis carries no
# indication of which bar belongs to which dataset.
barplot(
  c(nrow(true), nrow(fake)),
  names.arg = c("True", "Fake"),
  main = "Number of rows in each dataset",
  xlab = "Category",
  ylab = "Count",
  border = "red",
  col = "blue",
  density = 5
)

## Both the datasets are balanced.

### Columns datatypes

In [None]:
# Column names, column types and leading values of each table.
true %>% glimpse()
fake %>% glimpse()

### Are any NA values present?

In [None]:
# Total number of NA cells in each table.
true %>% is.na() %>% sum()
fake %>% is.na() %>% sum()

In [None]:
## NA cell count relative to the number of rows, expressed in percent

(true %>% is.na() %>% sum()) / nrow(true) * 100
(fake %>% is.na() %>% sum()) / nrow(fake) * 100

### NA values are present, but relative to the total number of rows they account for only 0.004% and 2.68% of the respective datasets. Instead of imputing them we can simply drop those rows, because very little data is lost.

In [None]:
# Discard every row that contains at least one NA value.
true <- drop_na(true)
fake <- drop_na(fake)

In [None]:
# Table dimensions (rows, columns) after the NA rows were removed.
true %>% dim()
fake %>% dim()

### Summary of datasets

In [None]:
# Per-column summaries of both tables.
fake %>% summary()
true %>% summary()

# Merging datasets for further preprocessing

In [None]:
# Label the rows (0 = fake, 1 = true) and stack both tables into one.
fake[["y"]] <- 0
true[["y"]] <- 1

## The label y and the subject column are categorical. Both are converted
## to factors so that the models do not treat them as numeric values.
news <- bind_rows(fake, true) %>%
  mutate(
    y = as.factor(y),
    subject = as.factor(subject)
  )

# Preprocessing

In [None]:
# Number of articles per subject, largest count first
news %>%
  group_by(subject) %>%
  count() %>%
  arrange(desc(n))

In [None]:
# Bar chart of article counts per subject, bars ordered from most to least
# frequent, with the count printed on each bar.
news %>%
  group_by(subject) %>%
  count(sort = TRUE) %>%
  rename(freq = n) %>%
  ggplot(aes(x = reorder(subject, -freq), y = freq)) +
  geom_col(fill = 'skyblue') +
  theme_classic() +
  labs(x = 'Subject', y = 'frequency') +
  geom_text(aes(label = freq), vjust = 1.2, fontface = 'bold') +
  theme(
    axis.title = element_text(face = 'bold', size = 15),
    axis.text = element_text(size = 13, angle = 90)
  )

In [None]:
## Subject distribution split by class label y (side-by-side bars)

news %>%
  ggplot(aes(x = subject, fill = y)) +
  geom_bar(position = 'dodge', alpha = 0.6) +
  theme_classic() +
  theme(
    axis.title = element_text(face = 'bold', size = 15),
    axis.text = element_text(size = 13, angle = 90)
  )

## A news article is the combination of its headline and the body text below it, so we merge the title and text columns into one, reducing the dimensionality of the dataset.

## We can also see that the subject column is biased in nature and would lead to decreased accuracy, so we drop that column.

In [None]:
# Join title and body into a single text field (space separated) and keep
# only that field together with the class label.
y <- news$y
news$text <- paste(news$title, news$text)
news <- cbind(news["text"], y)
glimpse(news)

# N-Gram Analysis

## Uni Gram Analysis

In [None]:
# One row per word token, with the tidytext stop words removed.
tokeniztion_df <- news %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)

In [None]:
# Horizontal bar chart of the words that occur more than 25000 times.
tokeniztion_df %>%
  count(word, sort = TRUE) %>%
  filter(n > 25000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)

In [None]:
# Unigram - Fake_News
# fake_news is only created further down in the bi-gram section, so running
# the notebook top to bottom failed here. The fake subset (y == 0) is taken
# directly from `news` instead.
tokeniztion_df_fake <- news %>%
  filter(y == 0) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)

# Words that occur more than 10000 times in fake articles.
tokeniztion_df_fake %>%
  count(word, sort = TRUE) %>%
  filter(n > 10000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = n)) +
  geom_col() +
  labs(y = NULL) +
  theme_minimal()

In [None]:
# Unigram - True_News
# true_news is only created further down in the bi-gram section, so running
# the notebook top to bottom failed here. The true subset (y == 1) is taken
# directly from `news` instead.
tokeniztion_df_true <- news %>%
  filter(y == 1) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)

# Words that occur more than 10000 times in true articles.
tokeniztion_df_true %>%
  count(word, sort = TRUE) %>%
  filter(n > 10000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = n)) +
  geom_col() +
  labs(y = NULL) +
  theme_minimal()

## Bi Gram Analysis

In [None]:
# Split the merged table by label: y == 0 is fake, y == 1 is true.
fake_news <- filter(news, y == 0)
true_news <- filter(news, y == 1)

In [None]:
# Bigrams of the fake articles: tokenize into word pairs, drop pairs in which
# either word is a stop word, count them, and draw the pairs seen more than
# 1000 times as a network.
df_bigrams <- unnest_tokens(fake_news, bigram, text, token = "ngrams", n = 2)
bigrams_separated <- df_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )
bigram_counts <- count(bigrams_separated, word1, word2, sort = TRUE)
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 1000))

# Fixed seed so the "fr" layout is the same on every run.
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

In [None]:
# Table of fake-news bigrams ("word1 word2") seen more than 1000 times.
# The columns are named directly in data.frame(). The previous
# `word <- paste(...)` inside the call assigned a stray global `word` and
# produced an auto-generated column name that had to be patched afterwards.
fak_bigram <- data.frame(
  word = paste(bigram_counts$word1, bigram_counts$word2, sep = " "),
  n = bigram_counts$n
) %>%
  filter(n > 1000)

In [None]:
# Horizontal bar chart of the frequent fake-news bigrams, shaded by count.
fak_bigram %>%
  arrange(desc(n)) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = n)) +
  geom_col() +
  labs(y = NULL) +
  theme_minimal()

In [None]:
# Bigrams of the true articles: tokenize into word pairs, drop pairs in which
# either word is a stop word, count them, and draw the pairs seen more than
# 1000 times as a network.
tdf_bigrams <- unnest_tokens(true_news, bigram, text, token = "ngrams", n = 2)
tbigrams_separated <- tdf_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )
tbigram_counts <- count(tbigrams_separated, word1, word2, sort = TRUE)
tbigram_graph <- graph_from_data_frame(filter(tbigram_counts, n > 1000))

# Fixed seed so the "fr" layout is the same on every run.
set.seed(2017)
ggraph(tbigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

In [None]:
# Table of true-news bigrams ("word1 word2") with their counts. The rows with
# a count above 1200 are kept in ttru_bigram under the names word / n.
tru_bigram <- data.frame(
  paste(tbigram_counts$word1, tbigram_counts$word2, sep = " ")
)
tru_bigram$n <- tbigram_counts$n
ttru_bigram <- setNames(filter(tru_bigram, n > 1200), c("word", "n"))

In [None]:
# Horizontal bar chart of the frequent true-news bigrams, shaded by count.
ttru_bigram %>%
  arrange(desc(n)) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = n)) +
  geom_col() +
  labs(y = NULL) +
  theme_minimal()

## We implement various text preprocessing techniques

* to lowercase
* remove numbers
* remove punctuation
* remove stopwords 
* remove whitespaces
* Lemmatization
* Tokenization

In [None]:
library(readr)
library(dplyr)
library(stringr)
library(ggplot2)
library(tidyr)
library(tm)
library(textstem)
library(tidytext)
library(wordcloud2)
library(pROC)
library(ROCR)
library(randomForest)
library(naivebayes)
library(caret)
library(janeaustenr)
library(igraph)
library(ggraph)

## Importing data
# NOTE(review): these paths differ from the 'Fake.csv' / 'True.csv' paths used
# at the top of the notebook - confirm which location applies to your setup.
fake <- read_csv('../input/fake-and-real-news-dataset/Fake.csv')
true <- read_csv('../input/fake-and-real-news-dataset/True.csv')

## Dropping NA rows
# Start from the freshly read tables. The previous code piped the older
# fake_news / true_news objects instead; those hold only the merged text and
# the y label, so the data read above was ignored and `title` was missing.
true_news <- true %>% drop_na()
fake_news <- fake %>% drop_na()

## Merging datasets (type: 0 = fake, 1 = true)
fake_news$type <- 0
true_news$type <- 1
news <- bind_rows(fake_news, true_news)
news$type <- as.factor(news$type)
type <- news$type

# Title and body become one text field; only text and label are kept.
news$text <- paste(news$title, news$text, sep = ' ')
news <- cbind(news["text"], type)
data <- news

In [None]:
# Put the rows in a random order (no seed is set for this shuffle).
data <- news[sample.int(nrow(data)), ]

In [None]:
  ## Preprocessing
  # Build a corpus from the article text and clean it step by step:
  # punctuation, digits, case, English stop words, extra whitespace, lemmas.
  doc <- VCorpus(VectorSource(data$text)) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(stripWhitespace) %>%
    tm_map(content_transformer(lemmatize_strings))

In [None]:
  ## Data Preparation
  # Document-term matrix with sparse terms removed (sparse = 0.99), widened
  # by the label column and converted to a data frame.
  dtm <- DocumentTermMatrix(doc)
  dtm_clean <- removeSparseTerms(dtm, sparse = 0.99)
  y_prediction <- data$type
  # NOTE(review): cbind() onto a numeric matrix presumably stores the factor
  # as its integer codes; the later `== 2` recode relies on that - confirm.
  dtm_mat <- cbind(as.matrix(dtm_clean), y_prediction)
  dtm_df <- as.data.frame(dtm_mat)

In [None]:
 # Inspect the values held in the label column.
 summary(dtm_df[["y_prediction"]])

In [None]:
 # Map label value 2 to 1 and everything else to 0, then store as a factor.
 dtm_df$y_prediction <- as.factor(
   ifelse(dtm_df$y_prediction == 2, 1, 0)
 )

In [None]:
  ## Train Test Split
  # Seeded 80/20 split: the sampled row indices form the training set and the
  # remaining rows form the test set.
  set.seed(2020)
  index <- sample(nrow(dtm_df), size = nrow(dtm_df) * 0.8)
  train_set <- dtm_df[index, ]
  test_set <- dtm_df[-index, ]

  # Make every column name a syntactically valid R name.
  colnames(train_set) <- make.names(colnames(train_set))
  colnames(test_set) <- make.names(colnames(test_set))

In [None]:
  ## Fitting Model
  # Random Forest
  # mtry is the rounded square root of the number of predictor columns.
  # The former `method = 'class'` argument was dropped: randomForest() has no
  # such parameter, and classification is already selected because
  # y_prediction is a factor.
  k <- round(sqrt(ncol(train_set) - 1))
  clf_rf <- randomForest(
    formula = y_prediction ~ .,
    data = train_set,
    ntree = 5,
    mtry = k
  )

  # Naive Bayes
  clf_nb <- naive_bayes(y_prediction ~ ., data = train_set)

In [None]:
  ## Meta Classifier Stacking
  # Base-model predictions for the training rows (no newdata is supplied).
  train_set[["pred_nb"]] <- as.factor(predict(clf_nb, type = 'class'))
  train_set[["pred_rf"]] <- as.factor(predict(clf_rf, type = 'response'))

  # Base-model predictions for the held-out rows.
  test_set[["pred_nb"]] <- as.factor(predict(clf_nb, newdata = test_set))
  test_set[["pred_rf"]] <- as.factor(
    predict(clf_rf, newdata = test_set, type = 'response')
  )

  # Stacking: keep only the two base predictions and the true label.
  train_set <- train_set[, c("pred_nb", "pred_rf", "y_prediction")]
  test_set <- test_set[, c("pred_nb", "pred_rf", "y_prediction")]

In [None]:
 ## Logistic regression meta-classifier fitted on the stacked predictions
  clf_lr <- glm(
    y_prediction ~ .,
    family = binomial(link = "logit"),
    data = train_set
  )

In [None]:
  # Predicted probabilities from the meta-classifier, cut at 0.5 into the
  # labels 1 / 0 and stored as a factor.
  test_set$pred_lr <- as.factor(
    ifelse(predict(clf_lr, newdata = test_set, type = 'response') > 0.5, 1, 0)
  )

In [None]:
# Confusion matrices for the stacked logistic regression, naive Bayes and
# random forest predictions on the test set.
# Arguments are named in all three calls. The naive Bayes and random forest
# calls used to pass the true labels positionally as `data` and the
# predictions as `reference`, which swapped their roles relative to `conf`.
conf <- caret::confusionMatrix(
  data = test_set$pred_lr,
  reference = test_set$y_prediction
)
conf_nb <- caret::confusionMatrix(
  data = test_set$pred_nb,
  reference = test_set$y_prediction
)
conf_rf <- caret::confusionMatrix(
  data = test_set$pred_rf,
  reference = test_set$y_prediction
)

In [None]:
# Draw the confusion-matrix figure for the stacked logistic regression (conf),
# naive Bayes (conf_nb) and random forest (conf_rf) results, in that order.
# NOTE(review): draw_confusion_matrix() is defined in the cell that follows,
# so that cell has to be run before this one.
draw_confusion_matrix(conf)
draw_confusion_matrix(conf_nb)
draw_confusion_matrix(conf_rf)

In [None]:
# Draw a two-panel figure for a confusion-matrix result.
#
# The top panel shows the four cell counts of `cm$table` as coloured boxes;
# the bottom panel prints elements 1, 2, 5, 6 and 7 of `cm$byClass` and the
# first two elements of `cm$overall`, each with its name.
#
# cm: a list-like object with `table` (four counts), `byClass` (named numeric
#     vector, at least 7 entries) and `overall` (named numeric vector, at
#     least 2 entries), e.g. the result of caret::confusionMatrix().
#
# Returns NULL invisibly. The margin setting and the layout changed for the
# drawing are reset when the function exits.
draw_confusion_matrix <- function(cm) {
  # Text of the expression the caller passed in; it is shown in the title.
  cm_label <- deparse(substitute(cm))

  layout(matrix(c(1, 1, 2)))
  old_par <- par(mar = c(2, 2, 2, 2))
  on.exit({
    par(old_par)
    layout(1)
  }, add = TRUE)

  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = 'n', yaxt = 'n')
  # paste() honours `sep`; the former paste0(..., sep = "  ") appended the
  # separator as a third string and glued the label onto the heading.
  title(paste('CONFUSION MATRIX', cm_label, sep = "  "), cex.main = 2)

  # create the matrix
  rect(150, 430, 240, 370, col = '#3F97D0')
  text(195, 435, 'Class1', cex = 1.2)
  rect(250, 430, 340, 370, col = '#F7AD50')
  text(295, 435, 'Class2', cex = 1.2)
  text(125, 370, 'Predicted', cex = 1.3, srt = 90, font = 2)
  text(245, 450, 'Actual', cex = 1.3, font = 2)
  rect(150, 305, 240, 365, col = '#F7AD50')
  rect(250, 305, 340, 365, col = '#3F97D0')
  text(140, 400, 'Class1', cex = 1.2, srt = 90)
  text(140, 335, 'Class2', cex = 1.2, srt = 90)

  # add in the cm results: counts are read column by column from cm$table
  res <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), res[1:4],
       cex = 1.6, font = 2, col = 'white')

  # add in the specifics
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = 'n', yaxt = 'n')
  metric_idx <- c(1, 2, 5, 6, 7)
  metric_x <- c(10, 30, 50, 70, 90)
  text(metric_x, 85, names(cm$byClass)[metric_idx], cex = 1.2, font = 2)
  text(metric_x, 70, round(as.numeric(cm$byClass[metric_idx]), 3), cex = 1.2)

  # add in the accuracy information
  overall_x <- c(30, 70)
  text(overall_x, 35, names(cm$overall)[1:2], cex = 1.5, font = 2)
  text(overall_x, 20, round(as.numeric(cm$overall[1:2]), 3), cex = 1.4)

  invisible(NULL)
}