## Preparing the data

In [None]:
import pandas as pd
# Load the two labelled corpora (article body only) and tag each with its class.
fake_news = pd.read_csv(
    '/content/drive/MyDrive/archive (1)/Fake.csv', usecols=['text']
).assign(Reality='Fake')
true_news = pd.read_csv(
    '/content/drive/MyDrive/archive (1)/True.csv', usecols=['text']
).assign(Reality='True')

# Stack fake on top of true with a fresh index, then keep the first copy of
# every distinct article body.
news = pd.concat(
    [fake_news, true_news], ignore_index=True
).drop_duplicates(subset=['text'])
news.describe()

Unnamed: 0,text,Reality
count,38646,38646
unique,38646,2
top,Donald Trump just couldn t wish all Americans ...,True
freq,1,21191


In [None]:
# Second "real news" source: take a random 15,000-article subset and align its
# schema with the other frames (`content` -> `text`, plus the class label).
# The shuffle is seeded so the same articles are selected on every run, and the
# whole transformation is one chain so no column is ever assigned on a slice.
true_news_beta = (
    pd.read_csv('/content/drive/MyDrive/articles1.csv', usecols=['content'])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .head(15000)
    .rename(columns={'content': 'text'})
    .assign(Reality='True')
)

# Second "fake news" source; already uses a `text` column.
fake_news_beta = pd.read_csv(
    '/content/drive/MyDrive/fake.csv', usecols=['text']
).assign(Reality='Fake')

# Combine both classes and keep the first copy of every distinct article body.
news_beta = pd.concat(
    [fake_news_beta, true_news_beta], ignore_index=True
).drop_duplicates(subset=['text'])
news_beta.describe()

Unnamed: 0,text,Reality
count,27403,27404
unique,27403,2
top,Print They should pay all the back all the mon...,True
freq,1,14972


In [None]:
# Additional articles; every row loaded from this file is labelled 'True'.
new_testing = pd.read_csv(
    '/content/drive/MyDrive/new_testing.csv', usecols=['text']
).assign(Reality='True')

In [None]:
# Merge the three labelled sources into one frame with a fresh 0..n-1 index.
# NOTE: this rebinds `news`, so running the cell a second time appends
# `news_beta` and `new_testing` again.
news = pd.concat(
    [news, news_beta, new_testing],
    ignore_index=True,
)

## Text Processing

In [None]:
import re
def _normalize_article(text):
    """Return one raw article as cleaned, lower-cased text ready for TF-IDF.

    Steps, in order:
      1. drop everything up to and including the first '(Reuters) -' marker;
      2. turn the ' #39;' entity residue back into an apostrophe;
      3. replace http(s)/www URLs with a single '.';
      4. delete literal backslash-n sequences;
      5. delete whitespace-delimited tokens containing .com/.net/.org/.gov;
      6. put a space after every period so "a.b" becomes "a. b";
      7. strip every character outside the allowed set;
      8. collapse runs of spaces, trim, lower-case.

    Args:
        text: the raw article body (must be a str).

    Returns:
        The cleaned string.
    """
    marker = '(Reuters) -'
    if marker in text:
        # Cut at the marker itself. Cutting at the first '-' in the text stops
        # at any earlier hyphen (e.g. a hyphenated place name) and leaves the
        # "(Reuters)" source tag behind.
        text = text[text.index(marker) + len(marker):]
    text = re.sub(r' #39;', "'", text)
    text = re.sub(r'https?://\S+|www\.\S+', '.', text)
    text = re.sub(r'\\n', '', text)
    for tld in ('com', 'net', 'org', 'gov'):
        text = re.sub(r'\b[^\s]*\.' + tld + r'[^\s]*\b', '', text)
    text = re.sub(r'\.', '. ', text)
    # Raw string: same character set as before, without invalid "\)"-style
    # escapes that newer Python versions warn about.
    text = re.sub(r"[^A-Za-z0-9$,%!)(—.;:'\"&/ =+-]", '', text)
    text = re.sub(r' +', ' ', text)
    return text.strip().lower()


# Anything that is not a string (e.g. the NaN left by an empty CSV cell) cannot
# be cleaned. Report it as "<index> <value>" and drop it explicitly instead of
# relying on a bare `except:` to catch the resulting error.
is_text = news['text'].map(lambda value: isinstance(value, str)).astype(bool)
for row_label, bad_value in news.loc[~is_text, 'text'].items():
    print(row_label, bad_value)
news = news.loc[is_text].copy()

# A single column assignment replaces the per-row chained writes
# (`news['text'][b] = ...`), which pandas does not guarantee will reach the frame.
news['text'] = news['text'].map(_normalize_article)

# Cleaning can make previously different articles identical, so de-duplicate again.
news = news.drop_duplicates(subset=['text'])
# Seeded shuffle so the row order (and therefore the later split) is repeatable.
news = news.sample(frac=1, random_state=42).reset_index(drop=True)
news.describe()

43961 nan


Unnamed: 0,text,Reality
count,73536,73536
unique,73536,2
top,(cnn) after decades of a downward trend in cri...,True
freq,1,43725


## Saving the data used to train the model

In [None]:
# Persist the cleaned, shuffled corpus to Drive (the row index is written too).
news.to_csv(path_or_buf='/content/drive/MyDrive/model_data_3')

## Training the model

In [None]:
from sklearn.model_selection import train_test_split
# Features are the cleaned article bodies; targets are the 'True'/'Fake' labels.
x, y = news['text'], news['Reality']

# Hold out 20% of the rows for testing; the seed fixes which rows are held out.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training texts only, then reuse it for the test texts.
# NOTE: `x_train` / `x_test` are rebound from raw text to TF-IDF matrices here,
# so this cell cannot be re-run without re-running the split first.
TFIDF = TfidfVectorizer()
x_train, x_test = TFIDF.fit_transform(x_train), TFIDF.transform(x_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest (seeded) on the TF-IDF training matrix; `fit` returns
# the estimator itself, which is shown as the cell output.
RFC = RandomForestClassifier(random_state=0).fit(x_train, y_train)
RFC

## Saving the Model and TFIDF Vectorizer

In [None]:
import joblib
# Persist the fitted classifier and its matching vectorizer; both are needed
# together to score new text.
joblib.dump(
    value=RFC,
    filename='/content/drive/MyDrive/RFC_Model_Capstone_3.pkl',
)
joblib.dump(
    value=TFIDF,
    filename='/content/drive/MyDrive/tfidf_vectorizer_model_3.joblib',
)

['/content/drive/MyDrive/tfidf_vectorizer_model_3.joblib']

## Evaluation

*normal score*

In [None]:
# Accuracy of the fitted forest on the held-out split.
RFC.score(X=x_test, y=y_test)


0.9025700299156921

*cross validation score*

In [None]:
#cross validation score
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the forest on the already-vectorised training split;
# one score per fold is printed.
scores = cross_val_score(estimator=RFC, X=x_train, y=y_train, cv=5)
print(scores)

[0.89894612 0.89189189 0.89350671 0.90250744 0.89766256]


*random data test (data from another source)*

In [None]:
  #random data test (data from another source)
import joblib
import pandas as pd
import re

# Reload the persisted artefacts by path so joblib opens and closes the files
# itself (passing an `open(...)` handle left it unclosed).
load_model = joblib.load('/content/drive/MyDrive/RFC_Model_Capstone_3.pkl')
TFIDF = joblib.load('/content/drive/MyDrive/tfidf_vectorizer_model_3.joblib')
testing = pd.read_csv('/content/drive/MyDrive/testing.csv', usecols=['text', 'label'])


def _normalize_eval_article(text):
    """Clean one external article with the same steps used in the Text Processing cell.

    Steps, in order: drop everything up to and including the first
    '(Reuters) -' marker; restore ' #39;' to an apostrophe; replace URLs with
    '.'; delete literal backslash-n sequences; delete tokens containing
    .com/.net/.org/.gov; add a space after every period; strip characters
    outside the allowed set; collapse spaces, trim and lower-case.

    Args:
        text: the raw article body (must be a str).

    Returns:
        The cleaned string.
    """
    marker = '(Reuters) -'
    if marker in text:
        # Cut at the marker itself rather than at the first '-' in the text,
        # which may be an unrelated earlier hyphen.
        text = text[text.index(marker) + len(marker):]
    # This entity fix is part of the training-time cleaning and must be applied
    # here too, otherwise the model is scored on differently prepared text.
    text = re.sub(r' #39;', "'", text)
    text = re.sub(r'https?://\S+|www\.\S+', '.', text)
    text = re.sub(r'\\n', '', text)
    for tld in ('com', 'net', 'org', 'gov'):
        text = re.sub(r'\b[^\s]*\.' + tld + r'[^\s]*\b', '', text)
    text = re.sub(r'\.', '. ', text)
    text = re.sub(r"[^A-Za-z0-9$,%!)(—.;:'\"&/ =+-]", '', text)
    text = re.sub(r' +', ' ', text)
    return text.strip().lower()


# Rows without a usable string body (NaN) are reported as "<index> <value>" and
# removed explicitly instead of being caught by a bare `except:`.
is_text = testing['text'].map(lambda value: isinstance(value, str)).astype(bool)
for row_label, bad_value in testing.loc[~is_text, 'text'].items():
    print(row_label, bad_value)
testing = testing.loc[is_text].reset_index(drop=True)

# Whole-column assignments avoid the chained `testing[col][i] = ...` writes
# that raised SettingWithCopyWarning on every row.
testing['text'] = testing['text'].map(_normalize_eval_article)
print(len(testing))

# Label 1 means fake in this file; every other value is treated as true.
testing['label'] = testing['label'].map(
    lambda value: 'Fake' if str(value) == '1' else 'True'
)

# Score on every remaining row. No train/test split is needed for a pure
# evaluation set, and skipping it leaves the training split variables intact.
external_features = TFIDF.transform(testing['text'])
load_model.score(external_features, testing['label'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


142 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


573 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


1200 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


1911 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


2148 nan
2169 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


2793 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


3329 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


3729 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


4288 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


4358 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


5717 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


6215 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


6680 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


8649 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


8908 nan
8922 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


9350 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


9446 nan
9454 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


9524 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


10466 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


10867 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


11450 nan
11486 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


12056 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


12460 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


12835 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


13020 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


13107 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


13915 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


14499 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


14933 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


16126 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


18479 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


18757 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


19157 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


19227 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


19388 nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['text'][b] = text


20761


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['label'][c] = 'Fake'


20760


0.9452312138728324

## Actual Working of the Model

In [None]:
def clean(listt):
    """Normalise raw article strings into the form the TF-IDF vectorizer expects.

    For each string, in order:
      1. drop everything up to and including the first '(Reuters) -' marker;
      2. turn the ' #39;' entity residue back into an apostrophe;
      3. replace http(s)/www URLs with a single '.';
      4. delete literal backslash-n sequences;
      5. delete whitespace-delimited tokens containing .com/.net/.org/.gov;
      6. put a space after every period so "a.b" becomes "a. b";
      7. strip every character outside the allowed set (this also removes real
         newline characters);
      8. collapse runs of spaces, trim, lower-case.

    Args:
        listt: iterable of str, one raw article per item.

    Returns:
        list of str: the cleaned articles, in input order ([] for empty input).

    Raises:
        TypeError: if an item is not a string (e.g. a float NaN).
    """
    import re

    marker = '(Reuters) -'
    # Raw string: the same allowed-character set as before, but without
    # invalid "\)"-style escapes that newer Python versions warn about.
    disallowed = r"[^A-Za-z0-9$,%!)(—.;:'\"&/ =+-]"

    cleaned = []
    for article in listt:
        if marker in article:
            # Cut at the marker itself. Cutting at the first '-' in the text
            # stops at any earlier hyphen (e.g. a hyphenated place name) and
            # leaves the "(Reuters)" source tag behind.
            article = article[article.index(marker) + len(marker):]
        text = re.sub(r' #39;', "'", article)
        text = re.sub(r'https?://\S+|www\.\S+', '.', text)
        text = re.sub(r'\\n', '', text)
        for tld in ('com', 'net', 'org', 'gov'):
            text = re.sub(r'\b[^\s]*\.' + tld + r'[^\s]*\b', '', text)
        text = re.sub(r'\.', '. ', text)
        text = re.sub(disallowed, '', text)
        text = re.sub(r' +', ' ', text)
        cleaned.append(text.strip().lower())
    return cleaned

In [None]:
import joblib
# Load both artefacts by path so joblib opens and closes the files itself.
# Passing an `open(...)` handle left the file unclosed and was inconsistent
# with how the vectorizer is loaded.
load_model = joblib.load('/content/drive/MyDrive/RFC_Model_Capstone_3.pkl')
TFIDF = joblib.load('/content/drive/MyDrive/tfidf_vectorizer_model_3.joblib')
def reality_check(text, min_length=600):
    """Classify one piece of text with the loaded model and vectorizer.

    The input is converted to `str`, cleaned with `clean`, echoed, and, if the
    cleaned text is long enough, vectorised with `TFIDF` and passed to
    `load_model`.

    Args:
        text: the article to classify; any value is accepted and passed
            through `str()`.
        min_length: cleaned texts with this many characters or fewer are
            rejected as too short. Defaults to 600.

    Returns:
        The predicted label (first element of `load_model.predict`), or None
        when the text is too short or cannot be processed. In both of those
        cases an explanatory message is printed.
    """
    try:
        cleaned = clean([str(text)])
        print(cleaned)
        if len(cleaned[0]) <= min_length:
            # Plain early return; raising KeyboardInterrupt just to leave the
            # function also swallowed genuine user/kernel interrupts.
            print('Please enter a longer text.')
            return None
        # The vectorizer takes an iterable of strings, so the one-item list
        # can be passed as-is.
        vectorized_news = TFIDF.transform(cleaned)
        return load_model.predict(vectorized_news)[0]
    except Exception:
        # Catch ordinary failures only; interrupts and SystemExit propagate.
        print('Not a valid value. Please enter textual data only')
        return None

# Read one article from the input box (no prompt text); the returned label is
# shown as the cell output.
a = input('')
reality_check(text=a)


Israeli army spokesman Daniel Hagari earlier said that during fighting in Shejaiya district of Gaza City, troops "mistakenly identified three Israeli hostages as a threat and as a result, fired toward them and the hostages were killed". The military said that it had started "reviewing the incident" and that "immediate lessons from the event have been learned" and passed on to all troops on the ground. Meanwhile, Israeli prime minister Benjamin Netanyahu described their deaths as an “unbearable tragedy” as hundreds of people gathered outside the defence ministry in Tel Aviv to call on his government to secure the release of 129 hostages still held in the Gaza Strip.
['israeli army spokesman daniel hagari earlier said that during fighting in shejaiya district of gaza city, troops "mistakenly identified three israeli hostages as a threat and as a result, fired toward them and the hostages were killed". the military said that it had started "reviewing the incident" and that "immediate less

'True'

In [None]:
from google.colab import drive
# Every path used in this notebook lives under /content/drive, so this mount
# has to be executed before any of the cells that read or write those files.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import re
# Display the version string reported by the standard-library `re` module.
re.__version__

'2.2.1'