In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv("train.csv", sep='\t', encoding='utf-8')
test = pd.read_csv("test.csv", sep='\t', encoding='utf-8')

In [3]:
train.head()

Unnamed: 0,text,label
0,Get the latest from TODAY Sign up for our news...,1
1,2d Conan On The Funeral Trump Will Be Invited...,1
2,It’s safe to say that Instagram Stories has fa...,0
3,Much like a certain Amazon goddess with a lass...,0
4,At a time when the perfect outfit is just one ...,0


In [4]:
record = "WASHINGTON—In an effort to aid unhoused service members, the Department of Veteran Affairs announced Monday a new initiative to help get homeless veterans into bigger tents. “It’s frankly disgusting that these men and women who nobly served their country are consigned to sleeping in a cramped pop-up where they barely have room to stretch out, and we are committed to doing something about it,” said VA secretary Denis McDonough, telling reporters that the initiative would aim to provide former military personnel a high-quality 3-person tent with a screened-in porch area to host guests. “Our goal is to ensure that within the next five years, every hero who is sleeping on the ground can move into a luxurious REI tent containing multiple vestibules for clothing and personal possessions. And for those who need it, we will also provide a complimentary steel stake for them to fight off potential intruders.” McDonough added that this was only a temporary solution and the agency’s ultimate goal was to place all the homeless veterans in a giant circus tent where we never have to think about them again."
input_df= pd.DataFrame([[record]], columns=['text'])
input_df

Unnamed: 0,text
0,WASHINGTON—In an effort to aid unhoused servic...


In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Derek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Derek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
train['text'].loc[0]

'Get the latest from TODAY Sign up for our newsletter  No one ever truly gets over losing a loved one, and Blake Shelton is no exception. He was just 14 when his older brother Richie died on Nov. 13, 1990. And, as Shelton noted in a tweet Monday, "It changed my life forever."  Richie was 24 when he died in a car accident in the Sheltons\' home state of Oklahoma. Two years ago, Shelton sent out a message for the 25th anniversary of his loss:  Richie, who was Blake\'s half-brother (they shared a mother), was a passenger in a car that collided with a school bus in Ada, south of Oklahoma City.  Richie, driver Redena McManus and a 3-year-old boy, Christopher McManus, all died during or shortly after the collision, while the bus driver and passengers were uninjured, according to police reports.  The accident has clearly remained with Blake, who told 60 Minutes in 2014, "I remember picking up the phone to call him a week after he was dead, to tell him something. I was picking up the phone to 

In [7]:
train.text.loc[ : 5].values.tolist()[0]

'Get the latest from TODAY Sign up for our newsletter  No one ever truly gets over losing a loved one, and Blake Shelton is no exception. He was just 14 when his older brother Richie died on Nov. 13, 1990. And, as Shelton noted in a tweet Monday, "It changed my life forever."  Richie was 24 when he died in a car accident in the Sheltons\' home state of Oklahoma. Two years ago, Shelton sent out a message for the 25th anniversary of his loss:  Richie, who was Blake\'s half-brother (they shared a mother), was a passenger in a car that collided with a school bus in Ada, south of Oklahoma City.  Richie, driver Redena McManus and a 3-year-old boy, Christopher McManus, all died during or shortly after the collision, while the bus driver and passengers were uninjured, according to police reports.  The accident has clearly remained with Blake, who told 60 Minutes in 2014, "I remember picking up the phone to call him a week after he was dead, to tell him something. I was picking up the phone to 

In [8]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
#data pre processing
def preprocess_data(data):
    #tokenization
    token = RegexpTokenizer('\s+', gaps = True)
    text_data = []
    for values in data.text:
        tokenized_data = token.tokenize(values)
        text_data.append(tokenized_data)

    #stopwords removal
    stp = stopwords.words('english')
    clean_data = []
    for data in text_data:
        clean_text = [words.lower() for words in data if words.lower() not in stp]
        clean_data.append(clean_text)

    #stemming
    ps = PorterStemmer()
    stemmed_data = [] 
    for data in clean_data:
        stemmed_text = [ps.stem(words) for words in data] 
        stemmed_data.append(stemmed_text)

    #Flatten data
    updated_data = []
    for data in stemmed_data:
        updated_data.append(" ".join(data))

    #TF-IDF is a statistical measure that evaluates how relevant a word is to a document in a collection of documents.    
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(updated_data)

    return tfidf_matrix

In [10]:
train_len = train.shape[0]
merged_data = pd.concat((train.drop('label', axis = 1), test.drop('id', axis = 1)), axis = 0).reset_index().drop('index', axis = 1)

In [11]:
merged_data.head()

Unnamed: 0,text
0,Get the latest from TODAY Sign up for our news...
1,2d Conan On The Funeral Trump Will Be Invited...
2,It’s safe to say that Instagram Stories has fa...
3,Much like a certain Amazon goddess with a lass...
4,At a time when the perfect outfit is just one ...


In [13]:
preprocessed_data = preprocess_data(train)
preprocessed_input = preprocess_data(input_df)

In [14]:
train_data = preprocessed_data[ : train_len]
test_data = preprocessed_data[train_len : ]

In [17]:
print(preprocessed_data)

  (0, 56077)	0.03624641981386925
  (0, 15059)	0.07397788072487518
  (0, 44346)	0.09522859479450836
  (0, 21464)	0.0312668638828439
  (0, 7560)	0.035086173399058694
  (0, 25275)	0.03163585666694048
  (0, 13467)	0.08822227963722613
  (0, 54130)	0.027890922173694477
  (0, 25711)	0.0602890028021506
  (0, 24497)	0.0880088246813838
  (0, 34004)	0.03693086948774641
  (0, 23514)	0.06651358401424955
  (0, 49810)	0.047949216175958986
  (0, 59904)	0.026595919262643153
  (0, 44642)	0.06209232035847989
  (0, 42390)	0.049801964386416396
  (0, 59086)	0.053432260420311815
  (0, 29767)	0.05541204179182115
  (0, 48406)	0.1043807791866198
  (0, 8154)	0.06220010716958591
  (0, 49992)	0.10732712447203009
  (0, 33482)	0.11112562419507087
  (0, 58763)	0.10732712447203009
  (0, 29606)	0.05547692551469955
  (0, 59411)	0.0602890028021506
  :	:
  (4986, 10823)	0.02254408440314144
  (4986, 49717)	0.010229796304873213
  (4986, 47749)	0.03621645649562389
  (4986, 28449)	0.01839056341042526
  (4986, 21464)	0.0107727

In [12]:
# Model selection
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(train_data, train.label, test_size=0.2, random_state = 42)

In [13]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Accuracy = logreg.score(X_test, y_test)

print(Accuracy*100)

# Naive-Bayes

from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()
NB.fit(X_train, y_train)
Accuracy = NB.score(X_test, y_test)

print(Accuracy*100)

# Decision Tree

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
Accuracy = clf.score(X_test, y_test)

print(Accuracy*100)

# Passive-Aggressive Classifier

from sklearn.metrics import accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier

pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(X_train,y_train)

#Predict on the test set and calculate accuracy
y_pred=pac.predict(X_test)
score=accuracy_score(y_test,y_pred)

print(f'Accuracy: {round(score*100,2)}%')

76.05210420841684
66.33266533066133
67.63527054108216
Accuracy: 76.65%


In [14]:
import joblib

In [15]:
filename = "../Fake-News-Detection/Data_Gathering/Resources/test_model_save.sav"

In [16]:
joblib.dump(pac, open(filename, 'wb'))

In [17]:
loaded_model = joblib.load(filename)

In [18]:
print(loaded_model)

PassiveAggressiveClassifier(max_iter=50)


In [19]:
result = loaded_model.score(X_test, y_test)

In [20]:
print(result)

0.7665330661322646


In [21]:
print(X_test)

  (0, 830)	0.06222595423416798
  (0, 835)	0.05404393514023602
  (0, 1607)	0.08852485923137172
  (0, 3255)	0.16059294089449402
  (0, 4090)	0.1601998333014327
  (0, 4275)	0.06253196214063829
  (0, 4424)	0.0682795610590539
  (0, 4495)	0.058098817414791185
  (0, 5252)	0.09618948153833608
  (0, 5772)	0.07845905458718633
  (0, 6505)	0.0959891045565585
  (0, 6534)	0.06584992079230216
  (0, 8678)	0.10026664919191736
  (0, 10070)	0.0938063766089574
  (0, 14669)	0.08074174435410884
  (0, 15173)	0.07468735175644366
  (0, 16196)	0.0814544060460562
  (0, 16643)	0.16059294089449402
  (0, 16805)	0.07634874372906783
  (0, 16897)	0.09874711074904904
  (0, 17569)	0.07651367190596457
  (0, 18510)	0.12812603080903007
  (0, 18931)	0.12729040313426682
  (0, 20749)	0.07275443824566562
  (0, 22880)	0.06017492455034196
  :	:
  (997, 57661)	0.04232013511302595
  (997, 57788)	0.07935885579440419
  (997, 57808)	0.03587385458180039
  (997, 59291)	0.04021861438515271
  (997, 59574)	0.02939572919824176
  (997, 59915

In [22]:
print(y_test)

3752    0
1536    1
1662    1
1075    1
4200    1
       ..
4289    0
4028    0
4903    0
191     1
4864    0
Name: label, Length: 998, dtype: object


In [None]:
# Load the model
from tensorflow.keras.models import load_model
base_model = load_model("..\Fake-News-Detection\Data_Gathering\Resources\data.h5")

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Derek\anaconda3\envs\PythonAdv\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-f736bb5f2141>", line 3, in <module>
    base_model = load_model("..\Fake-News-Detection\Data_Gathering\Resources\data.h5")
  File "C:\Users\Derek\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow_core\python\keras\saving\save.py", line 146, in load_model
    return hdf5_format.load_model_from_hdf5(filepath, custom_objects, compile)
  File "C:\Users\Derek\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow_core\python\keras\saving\hdf5_format.py", line 165, in load_model_from_hdf5
    raise ValueError('No model found in config file.')
ValueError: No model found in config file.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Derek\anaconda3\envs\PythonAdv\lib\site-packages\IP