### Importing & Loading what we need

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC  
from sklearn.metrics import accuracy_score
import pickle

# Ensure that the stopwords are downloaded
nltk.download('stopwords')

# Load the dataset
df_train = pd.read_csv('fake-news/train.csv')
df_test = pd.read_csv('fake-news/test.csv')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [2]:
df_train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
df_test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [4]:
print(f'Training set shape: {df_train.shape}')
print(f'Test set shape: {df_test.shape}')

Training set shape: (20800, 5)
Test set shape: (5200, 4)


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [7]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,20800.0,10399.5,6004.587135,0.0,5199.75,10399.5,15599.25,20799.0
label,20800.0,0.500625,0.500012,0.0,0.0,1.0,1.0,1.0


In [8]:
df_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,5200.0,23399.5,1501.255031,20800.0,22099.75,23399.5,24699.25,25999.0


### Checking For Missing Values

In [9]:
df_train.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [10]:
df_test.isna().sum()

id          0
title     122
author    503
text        7
dtype: int64

### Handling Missing Values 

In [11]:
df_train.fillna('', inplace=True)
df_test.fillna('', inplace=True)

### Droping Unwanted Columns & Combine 'author' and 'title' into 'Content'

In [12]:
df_train['Content'] = df_train['author'] + ' ' + df_train['title']
df_test['Content'] = df_test['author'] + ' ' + df_test['title']
df_train = df_train.drop(columns=['id'], axis=1)  # Drop the 'id' column from train
df_test = df_test.drop(columns=['id'], axis=1)  # Drop the 'id' column from test

### Initialize the PorterStemmer for stemming

In [13]:
port_stem = PorterStemmer()

### Define a function to clean, lowercase, and stem the text content

In [14]:
def stemming(text):
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub('[^a-zA-Z ]', '', text)
    text = text.lower()
    words = text.split()
    # Stem the words and remove stopwords
    stemmed_words = [port_stem.stem(word) for word in words if word not in stopwords.words('english')]
    return ' '.join(stemmed_words)


### Applying stemming function to the 'Content' column in the datasets

In [15]:
X_train = df_train['Content'].apply(stemming)
Y_train = df_train['label']  # Separate the labels
X_test = df_test['Content'].apply(stemming)


### Print our preprocessed data samples

In [16]:
print(X_train.head())
print(X_test.head())

0    darrel lucu hous dem aid didnt even see comey ...
1    daniel j flynn flynn hillari clinton big woman...
2               consortiumnewscom truth might get fire
3    jessica purkiss civilian kill singl us airstri...
4    howard portnoy iranian woman jail fiction unpu...
Name: Content, dtype: object
0    david streitfeld specter trump loosen tongu pu...
1    russian warship readi strike terrorist near al...
2    common dream nodapl nativ american leader vow ...
3    daniel victor tim tebow attempt anoth comeback...
4     truth broadcast network keiser report meme war e
Name: Content, dtype: object


### Initialize TF-IDF Vectorizer to convert text data to numerical format

In [17]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)  # Fit and transform the training data
X_test = vectorizer.transform(X_test)        # Transform the testing data

### Initialize & Train the SVM model

In [18]:
model = SVC()

# Train the SVM model on the training data
model.fit(X_train, Y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

### Checking the accuracy of the model on the training data

In [19]:
accuracy = model.score(X_train, Y_train)
print("Model Accuracy on training data:", accuracy)

Model Accuracy on training data: 0.9990865384615385


### Saving the vectorizer and model using pickle for later use

In [20]:
pickle.dump(vectorizer, open('vector.pkl', 'wb'))
pickle.dump(model, open('model.pkl', 'wb'))

### Loading the vectorizer and model for predictions

In [21]:
vector_form = pickle.load(open('vector.pkl', 'rb'))
load_model = pickle.load(open('model.pkl', 'rb'))

### Defining a function to make predictions on new text data

In [22]:
def fake_news(news):
    news = stemming(news)  # Preprocess the input text with the stemming function
    input_data = [news]  # Wrap in a list as the model expects an array-like input
    vector_form1 = vector_form.transform(input_data)  # Transform input data to TF-IDF vector
    prediction = load_model.predict(vector_form1)  # Make prediction using the loaded model
    return prediction

### Test the fake_news function with a sample input

In [23]:
val = fake_news("""In these trying times, Jackie Mason is the Voice of Reason. 
    In this week’s exclusive clip for Breitbart News, Jackie discusses the looming threat of North Korea, 
    and explains how President Donald Trump could win the support of the Hollywood left if the U.S. needs to strike first.
    """) 

In [24]:
val_1 = fake_news("""In these trying times, Jackie Mason is the Voice of Reason. 
    In this week’s exclusive clip for Breitbart News, Jackie discusses the looming threat of North Korea, 
    and explains how President Donald Trump could win the support of the Hollywood left if the U.S. needs to strike first.
    """) 

### Print the prediction result

In [25]:
if val[0] == 0:
    print('reliable')
else:
    print('unreliable')

reliable


In [26]:
if val_1[0] == 0:
    print('reliable')
else:
    print('unreliable')

reliable
