In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from langdetect import detect
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## Data Processing
**Importing Datasets**

In [2]:
#importing the datasets
data1_fake = pd.read_csv('./Data_RealFake/Dataset1/Fake1.csv')
data1_true = pd.read_csv('./Data_RealFake/Dataset1/True1.csv')

dtset1_fake = pd.DataFrame(data1_fake)
dtset1_true = pd.DataFrame(data1_true)

Changing the labels: fake = 0 and real = 1

In [3]:
dtset1_fake['label']=0
dtset1_true['label']=1

pd.DataFrame.head(dtset1_fake)
pd.DataFrame.head(dtset1_true)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [4]:
print(dtset1_fake.shape)
print(dtset1_true.shape)

(23481, 5)
(21417, 5)


Concatinating Datasets

In [5]:
dataset1 = pd.concat([dtset1_fake, dtset1_true])
pd.DataFrame.head(dataset1)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [6]:
dataset1

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


Combining Title and Text

In [7]:
dataset1['content'] = dataset1['title']+" "+dataset1['text']

In [8]:
pd.DataFrame.head(dataset1)

Unnamed: 0,title,text,subject,date,label,content
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0,Pope Francis Just Called Out Donald Trump Dur...


Text Cleaning

In [9]:
#content = dataset1.content
#print(content)

In [10]:
port_stem = PorterStemmer()

def stemming_train(X_train):
    stemmed = re.sub('[^a-zA-Z]',' ', X_train)
    stemmed = stemmed.lower()
    stemmed = stemmed.split()
    stemmed = [port_stem.stem(word) for word in stemmed if not word in stopwords.words('english')]
    stemmed = ' '.join(stemmed)
    return stemmed

def stemming_test(X_test):
    stemmed = re.sub('[^a-zA-Z]',' ', X_test)
    stemmed = stemmed.lower()
    stemmed = stemmed.split()
    stemmed = [port_stem.stem(word) for word in stemmed if not word in stopwords.words('english')]
    stemmed = ' '.join(stemmed)
    return stemmed

dataset1['content'] = dataset1['content'].apply(stemming)
pd.DataFrame.head(dataset1)

In [11]:
#X = dataset1['content'].values

In [12]:
#TfIdf Vectorizer
#vectorizer = TfidfVectorizer()
#vectorizer.fit(X)
#X = vectorizer.transform(X)

#print(X)

**Train Test Split**

In [37]:
X = dataset1['content']
y = dataset1['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=80)

print(X_train.shape)
print(X_test.shape)


(35918,)
(8980,)


In [14]:
X_train.shape

(35918,)

In [15]:
#X_test[:20]
#X_test.value_counts()
#pd.DataFrame.head(X_test)

X_test.shape

(8980,)

In [16]:
#y_train[:20]
y_train.shape

(35918,)

In [17]:
y_test.shape

(8980,)

In [18]:
len(X_train)

35918

In [19]:
len(X_test)

8980

**Port Stemmer**

In [20]:
#Train Dataset
X_train = X_train.apply(stemming_train)

18789    proud moment america presid trump sign bill gi...
21168    l oreal sack transgend model comment white peo...
13454    cnn post truth trump poll immedi regret screen...
23136    mainstream liar want self appoint monarch trut...
4409     ex u attorney bharara take aim trump critic jo...
Name: content, dtype: object

In [38]:
#Test Dataset
X_test = X_test.apply(stemming_test)

KeyboardInterrupt: 

**CountVectorizer**

In [39]:
covector = CountVectorizer()

covector.fit(X_train)
X_train = covector.transform(X_train)

covector.fit(X_test)
X_test = covector.transform(X_test)

In [31]:
print(X_train)

  (0, 265)	1
  (0, 434)	1
  (0, 559)	1
  (0, 1143)	1
  (0, 2294)	1
  (0, 2874)	1
  (0, 3265)	1
  (0, 6689)	1
  (0, 6962)	3
  (0, 7856)	1
  (0, 8193)	1
  (0, 8536)	1
  (0, 8570)	1
  (0, 9561)	1
  (0, 9708)	1
  (0, 10414)	5
  (0, 10548)	1
  (0, 10553)	1
  (0, 12039)	1
  (0, 13349)	1
  (0, 15608)	1
  (0, 16165)	1
  (0, 16928)	1
  (0, 17531)	1
  (0, 18247)	1
  :	:
  (35917, 62653)	1
  (35917, 62787)	3
  (35917, 63270)	1
  (35917, 63280)	1
  (35917, 64121)	1
  (35917, 64129)	1
  (35917, 64589)	2
  (35917, 65040)	1
  (35917, 65242)	1
  (35917, 65334)	1
  (35917, 65839)	1
  (35917, 65883)	1
  (35917, 68744)	1
  (35917, 68812)	1
  (35917, 69471)	1
  (35917, 70222)	1
  (35917, 70598)	2
  (35917, 70981)	1
  (35917, 71133)	1
  (35917, 71761)	1
  (35917, 75547)	1
  (35917, 75975)	1
  (35917, 76587)	1
  (35917, 76707)	1
  (35917, 79805)	1


In [None]:
X_test

<8980x42187 sparse matrix of type '<class 'numpy.int64'>'
	with 1380208 stored elements in Compressed Sparse Row format>

**Logistic Regression**

In [40]:
LR = LogisticRegression()
LR.fit(X_train, y_train)
LRpred = LR.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: X has 61573 features, but LogisticRegression is expecting 111746 features as input.

In [None]:
print(LRpred)
print(y_test)

[0 1 1 ... 1 0 0]
[0 1 1 ... 1 0 0]


In [None]:
LRscore = LR.score(X_test, y_test)
print(LRscore)

0.9878619153674832


In [None]:
from sklearn import metrics
LR_r_score = metrics.confusion_matrix(y_test, LRpred)
print(LR_r_score)

[[4648   55]
 [  54 4223]]


In [None]:
LR_r_score = np.flip(LR_r_score)

#accuracy
LR_acc = (LR_r_score[0][0] + LR_r_score[-1][-1]) / np.sum(LR_r_score)
print('accuracy: '+ str(float(LR_acc)))

#precission
LR_precision = sklearn.metrics.precision_score(y_test, LRpred)
print('precision: '+ str(float(LR_precision)))

#recall
LR_recall = sklearn.metrics.recall_score(y_test, LRpred)
print('recall: '+ str(float(LR_recall)))

accuracy: 0.9878619153674832
precision: 0.9871435250116877
recall: 0.9873743277998597
