In [108]:
import numpy as np
import pandas as pd

In [109]:
#Read the data
df=pd.read_csv('D:\ML_Projects\Fake_News_Detection\\news.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [110]:
df=df.drop(df.columns[0], axis=1)
df=df.dropna()

In [111]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [112]:
#We are taking only 1st 1000 titles, since this dataset contains a lot of records.
##df=df[0:1000]

In [113]:
#We are storing independent variable in x.
#Extracting all records(left of ,)
#Extracting title column and text column (not label column)(right of ,) and storing it in x. 
x = df.iloc[:,:-1].values 

#We are storing dependent variable in y.
#Extracting all records(left of ,)
#Extracting only label column(right of ,) and storing it in y. 
y = df.iloc[:,-1].values

In [114]:
x[0] #title and text(independent variables) of 1st row

array(['You Can Smell Hillary’s Fear',
      dtype=object)

In [115]:
y[0]

'FAKE'

In [116]:
#To build ML model,convert text data into numerical values using CountVectorizer.
#We are importing CountVectorizer from feature_extraction.

from sklearn.feature_extraction.text import CountVectorizer

In [117]:
#We will set maximum no. of features to be 5000.
#Maximum no. of features denote the no. of records which we want to vectorize.
#Since we only have 1000 records and we have given the maximum limit of 5000, it will vectorize all the text data which we have.

cv = CountVectorizer(max_features=5000)

#Inside cv.fit_transform, we are passing all records of text column of independent variable x.
#We are converting this text into numerical data.
#Then, we are converting it to a dense matrix.

mat_text = cv.fit_transform(x[:,1]).todense()

In [118]:
mat_text

#The matrix contains the count of words. The CountVectorizer has a dictionay with it.
#There is a word-builder , which tags a number with all the words. It tells how many times each word occurs.
#For entire dictionary, we have 0's and 1's. 
#Wherever we have 1, it means that we have that particular word existing in the text.

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 1, 0, ..., 1, 0, 0],
        [0, 1, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [119]:
cv_title = CountVectorizer(max_features=5000)

#Inside cv_title.fit_transform, we are passing all records of title column of independent variable x.
#We are converting the text into numerical data.
#Then, we are converting it to a dense matrix.


mat_title = cv_title.fit_transform(x[:,0]).todense()

In [120]:
mat_title

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [121]:
# Now, using np.hstack() we have to stack title on top of text, as this would be our independent variable.
X_mat = np.hstack((mat_title, mat_text))

In [122]:
#We divide our entire dataset into training and testing sets. For this, we'll use train_test_split method.
#We'll import train_test_split from sklearn.model_selection

from sklearn.model_selection import train_test_split

#"X_mat" (mat_title and mat_text) is our independent variable and "y" (label) is our dependent variable.
#X_train is the training set of all the independent variables.
#X_test is the testing set of all the independent variables.
#Y_train is the training set of all the dependent variables.
#Y_test is the testing set of all the dependent variables.
#test_size=0.2 means that 20% of the records would be in the test set and the rest of the 80% of the records would be in the training set.
X_train, X_test, Y_train, Y_test = train_test_split(X_mat,y, test_size=0.2, random_state=0)

In [123]:
#Now,that we have the training set and testing set ready, we can finally go ahead and build the DecisionTreeClassifier.
#We'll import DecisionTreeClassifier from sklearn.tree

from sklearn.tree import DecisionTreeClassifier

#Create an instance/object of DecisionTreeClassifier named dtc
dtc = DecisionTreeClassifier(criterion='entropy')

#Fit the model on top of the training set
dtc.fit(X_train,Y_train)

#Predict the values on top of the test set
y_pred = dtc.predict(X_test)
y_pred

array(['REAL', 'FAKE', 'FAKE', ..., 'FAKE', 'REAL', 'REAL'], dtype=object)

In [124]:
#Build a confusion matrix
#The left diagonal will give all the correctly predicted results
#The right diagonal will give all the incorrectly predicted results
from sklearn.metrics import confusion_matrix,accuracy_score
confusion_matrix(Y_test,y_pred)

array([[505, 110],
       [ 99, 553]], dtype=int64)

In [125]:
score=accuracy_score(Y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 83.5%
