### Importing & Loading what we need

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC  
from sklearn.metrics import accuracy_score
import pickle

# Ensure that the stopwords are downloaded
nltk.download('stopwords')

# Load the dataset
df_train = pd.read_csv('fake-news/train.csv')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [2]:
df_train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
df_train.shape

(20800, 5)

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,20800.0,10399.5,6004.587135,0.0,5199.75,10399.5,15599.25,20799.0
label,20800.0,0.500625,0.500012,0.0,0.0,1.0,1.0,1.0


### Checking For Missing Values

In [6]:
df_train.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

### Handling Missing Values 

In [7]:
df_train = df_train.fillna('')

### Droping Unwanted Columns

In [8]:
df_train = df_train.drop(columns=['id','author','title'])  # Drop the 'id','author','title' column from train

### Initialize the PorterStemmer for stemming

In [9]:
port_stem = PorterStemmer()

### Define a function to clean, lowercase, and stem the text content

In [13]:
def stemming(content):
    # Remove non-alphabetic characters and convert to lowercase
    content = re.sub('[^a-zA-Z ]', '', content)
    content = content.lower()
    words = content.split()
    # Stem the words and remove stopwords
    stemmed_words = [port_stem.stem(word) for word in words if word not in stopwords.words('english')]
    return ' '.join(stemmed_words)


### Applying stemming function to the 'Content' column in the datasets

In [14]:
X = df_train['text'].apply(stemming)
Y = df_train['label']  # Separate the labels

### Print our preprocessed data samples

In [15]:
print(X.head())

0    hous dem aid didnt even see comey letter jason...
1    ever get feel life circl roundabout rather hea...
2    truth might get fire octob tension intellig an...
3    video civilian kill singl us airstrik identifi...
4    print iranian woman sentenc six year prison ir...
Name: text, dtype: object


In [16]:
Y.shape

(20800,)

In [17]:
X_train , X_test , Y_train, Y_test = train_test_split(X, Y, test_size=0.20)

### Initialize TF-IDF Vectorizer to convert text data to numerical format

In [18]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)  # Fit and transform the training data
X_test = vectorizer.transform(X_test)        # Transform the testing data

In [19]:
X_test.shape

(4160, 126666)

### Initialize & Train the SVM model

In [25]:
model = SVC()

# Train the SVM model on the training data
model.fit(X_train, Y_train)

In [26]:
# Make predictions on the test set
predictions = model.predict(X_test)

### Checking the Accuracy 

In [27]:
accuracy = model.score(X_test, Y_test)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.9519230769230769


### Saving the vectorizer and model using pickle for later use

In [28]:
pickle.dump(vectorizer, open('vector.pkl', 'wb'))
pickle.dump(model, open('model.pkl', 'wb'))

### Loading the vectorizer and model for predictions

In [29]:
vector_form = pickle.load(open('vector.pkl', 'rb'))
load_model = pickle.load(open('model.pkl', 'rb'))

### Defining a function to make predictions on new text data

In [30]:
def fake_news(news):
    news = stemming(news)  # Preprocess the input text with the stemming function
    input_data = [news]  # Wrap in a list as the model expects an array-like input
    vector_form1 = vector_form.transform(input_data)  # Transform input data to TF-IDF vector
    prediction = load_model.predict(vector_form1)  # Make prediction using the loaded model
    return prediction

### Test the fake_news function with a sample input

In [31]:
val = fake_news("""In these trying times, Jackie Mason is the Voice of Reason. 
    In this week’s exclusive clip for Breitbart News, Jackie discusses the looming threat of North Korea, 
    and explains how President Donald Trump could win the support of the Hollywood left if the U.S. needs to strike first.
    """) 

In [32]:
val_1 = fake_news("""House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) 
With apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server, the ranking Democrats on the relevant committees didn’t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. 
As we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewing emails it had recently discovered in order to see if they contained classified information. Not long after this letter went out, Oversight Committee Chairman Jason Chaffetz set the political world ablaze with this tweet. FBI Dir just informed me, "The FBI has learned of the existence of emails that appear to be pertinent to the investigation." Case reopened 
— Jason Chaffetz (@jasoninthehouse) October 28, 2016 
Of course, we now know that this was not the case . Comey was actually saying that it was reviewing the emails in light of “an unrelated case”–which we now know to be Anthony Weiner’s sexting with a teenager. But apparently such little things as facts didn’t matter to Chaffetz. The Utah Republican had already vowed to initiate a raft of investigations if Hillary wins–at least two years’ worth, and possibly an entire term’s worth of them. Apparently Chaffetz thought the FBI was already doing his work for him–resulting in a tweet that briefly roiled the nation before cooler heads realized it was a dud. 
But according to a senior House Democratic aide, misreading that letter may have been the least of Chaffetz’ sins. That aide told Shareblue that his boss and other Democrats didn’t even know about Comey’s letter at the time–and only found out when they checked Twitter. “Democratic Ranking Members on the relevant committees didn’t receive Comey’s letter until after the Republican Chairmen. In fact, the Democratic Ranking Members didn’ receive it until after the Chairman of the Oversight and Government Reform Committee, Jason Chaffetz, tweeted it out and made it public.” 
So let’s see if we’ve got this right. The FBI director tells Chaffetz and other GOP committee chairmen about a major development in a potentially politically explosive investigation, and neither Chaffetz nor his other colleagues had the courtesy to let their Democratic counterparts know about it. Instead, according to this aide, he made them find out about it on Twitter. 
There has already been talk on Daily Kos that Comey himself provided advance notice of this letter to Chaffetz and other Republicans, giving them time to turn on the spin machine. That may make for good theater, but there is nothing so far that even suggests this is the case. After all, there is nothing so far that suggests that Comey was anything other than grossly incompetent and tone-deaf. 
What it does suggest, however, is that Chaffetz is acting in a way that makes Dan Burton and Darrell Issa look like models of responsibility and bipartisanship. He didn’t even have the decency to notify ranking member Elijah Cummings about something this explosive. If that doesn’t trample on basic standards of fairness, I don’t know what does. 
Granted, it’s not likely that Chaffetz will have to answer for this. He sits in a ridiculously Republican district anchored in Provo and Orem; it has a Cook Partisan Voting Index of R+25, and gave Mitt Romney a punishing 78 percent of the vote in 2012. Moreover, the Republican House leadership has given its full support to Chaffetz’ planned fishing expedition. But that doesn’t mean we can’t turn the hot lights on him. After all, he is a textbook example of what the House has become under Republican control. And he is also the Second Worst Person in the World. About Darrell Lucus 
Darrell is a 30-something graduate of the University of North Carolina who considers himself a journalist of the old school. An attempt to turn him into a member of the religious right in college only succeeded in turning him into the religious right's worst nightmare--a charismatic Christian who is an unapologetic liberal. His desire to stand up for those who have been scared into silence only increased when he survived an abusive three-year marriage. You may know him on Daily Kos as Christian Dem in NC . Follow him on Twitter @DarrellLucus or connect with him on Facebook . Click here to buy Darrell a Mello Yello. Connect
    """) 

### Print the prediction result

In [33]:
if val[0] == 0:
    print('reliable')
else:
    print('unreliable')

reliable


In [34]:
if val_1[0] == 0:
    print('reliable')
else:
    print('unreliable')

unreliable
