In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv('news/news.csv')
df.head()
df = df.drop(df.columns[0],axis=1)
df.head()
df['news'] = df['title'] + ' ' + df['text']
df.head()
convert_to_binary = {'REAL':1,'FAKE':0}
df['label'] = df['label'].map(convert_to_binary)
df.head()
df = df.drop([df.columns[0],df.columns[1]],axis=1)
df.head()
df = df.reindex(columns=['news','label'])
df.head()
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

# Creating a list of stopwords (common words which are not important to the overall meaning of the article).
stop_words = stopwords.words('english')
stop_words.extend(['the','it','in'])

# Lemmatization is the process of reducing words to their basic form, e.g. dogs->dog, running->run.
WNL = WordNetLemmatizer()

# We now iterate through each article in the dataset.
for index, row in df.iterrows():
    filtered_article = ''
    article = row['news']
    # Removing all punctuation using RegEx.
    article = re.sub(r'[^\w\s]', '', article)
    # Tokenization: changing the article from one continuous string to a list of tokens (words).
    words = [word.lower() for word in nltk.word_tokenize(article)]
    # Removing stopwords.
    words = [word for word in words if not word in stop_words]
    # Lemmatization and concatenating the words back into a single string.
    words_lemmatized = []
    for word in words:
        if word == 'us':
            words_lemmatized.append(word)
        else:
            words_lemmatized.append(WNL.lemmatize(word))
    filtered_article = " ".join([word for word in words_lemmatized])
    
    # Replace the article with the new filtered version.
    df.loc[index, 'news'] = filtered_article
    
df.head()
df_input = df['news']
df_output = df['label']
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(df_input)
tf_idf_matrix
tf_idf_matrix.toarray()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [2]:
label_column = df.loc[:,'label']
labels = label_column.values
print(labels)

[0 0 1 ... 0 1 1]


In [3]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
#Separating our features and targets 
x = tf_idf_matrix
y = df_output.values

#Splitting our dataset into training and test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

#Defining our model with a regularisation parameter
model = PassiveAggressiveClassifier(C=0.5, random_state=0)

In [5]:
#Fitting our model with our training data
model.fit(x_train, y_train)
model.score(x_train, y_train)

1.0

In [6]:
#Making Predictions
y_predict = model.predict(x_test)
y_predict

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

In [7]:
#Implementing our model on our test data
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.9389794844818516

In [8]:
#Measuring the performance of our model
print("accuracy: %0.3f" % (accuracy_score(y_test, y_predict)))

accuracy: 0.939
