# Using Gaussian Processes to classify fake/real news
https://www.kaggle.com/rtatman/fake-news-fake-news

https://www.kaggle.com/rtatman/fake-news-fake-news/data

https://www.kaggle.com/rtatman/fake-news-fake-news/kernels




In [2]:
## 1. Importing libraries and data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [23]:
## 2. Importing the fake news dataset
# Read the training CSV, discard every row that has a missing value in any
# column, then renumber the index so the remaining rows are labelled 0..n-1.
dataset = (
    pd.read_csv('Datasets/fake-news/train.csv')
    .dropna()
    .reset_index(drop=True)
)
dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1


In [24]:
## 3. Cleaning the text
# Build the stop-word lookup and the stemmer once, outside the loop. The
# stop-word list was previously rebuilt for every single word and scanned
# linearly; a set gives constant-time membership tests with the same result.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
for text in dataset['text']:
    # Replace every non-letter with a space, lower-case, and tokenise on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words, stem what is left, and re-join into one cleaned document.
    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)
    corpus.append(review)

In [27]:
# Wrap the cleaned documents in a single-column frame and persist it, so the
# slow cleaning step does not have to be repeated in a later session.
corpus = pd.DataFrame(data=corpus, columns=['text'])
corpus.to_csv(path_or_buf='Datasets/fake-news/corpus.csv', index=False)

In [17]:
# import the corpus from a csv file
# keep_default_na=False keeps every document as a string. With the default NA
# parsing, a document that is empty after cleaning (or one that consists only
# of a token such as "na" or "null") is read back as NaN, which
# CountVectorizer rejects as an invalid document.
corpus = pd.read_csv('Datasets/fake-news/corpus.csv', keep_default_na=False)

In [28]:
## 4. Creating the Bag of Words model for the corpus
# Limit the vocabulary to 1500 terms and materialise the document-term counts
# as a dense array, which the Gaussian process classifier below works on.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus['text']).toarray()
# Select the target by name instead of by position, so an extra or reordered
# column in the CSV cannot silently turn a different column into the label.
y = dataset['label'].values



In [29]:
## 5. Splitting the dataset into the Training set and Test set
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [30]:
## 6. Fitting the classifier to the Training set
# Prior covariance: a constant factor of 1.0 multiplied by an RBF kernel whose
# initial length scale is 1.0.
gp_kernel = 1.0 * RBF(1.0)
classifier = GaussianProcessClassifier(kernel=gp_kernel)
classifier.fit(X_train, y_train)

In [31]:
## 7. Predicting the Test set results
# Predicted class for every held-out document.
y_pred = classifier.predict(X=X_test)

In [32]:
## 8. Making the Confusion Matrix
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Overall fraction of test documents that were classified correctly.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

[[1985  123]
 [  99 1450]]
0.9392945036915504


In [33]:
## 9. Applying k-Fold Cross Validation
# cross_val_score is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.
# NOTE: this refits the Gaussian process classifier once per fold (10 fits on
# the training set), which is slow; the recorded run of this cell was
# interrupted before it finished.
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print(accuracies.mean())
print(accuracies.std())

  - np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum()


KeyboardInterrupt: 

In [None]:
# Repeating this Gaussian process, but with extra synthetic features

In [None]:
## 10. Applying Grid Search to find the best model and the best parameters
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.model_selection import GridSearchCV

# GaussianProcessClassifier has no `C` or `gamma` hyper-parameters, and its
# `kernel` argument must be a kernel object rather than a string such as
# 'rbf', so an SVC-style grid is rejected as soon as the search is fitted.
# Search over Gaussian process kernels instead: RBF kernels with different
# initial length scales, and DotProduct (linear) kernels with different
# initial sigma_0 values.
parameters = [{'kernel': [1.0 * RBF(length_scale) for length_scale in (0.1, 1.0, 10.0)]},
              {'kernel': [1.0 * DotProduct(sigma_0) for sigma_0 in (0.1, 1.0, 10.0)]}]
# NOTE: every candidate is refitted once per fold, so this cell is expensive.
grid_search = GridSearchCV(estimator = classifier, param_grid = parameters, scoring = 'accuracy', cv = 10)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy)
print(best_parameters)

In [None]:
## 11. Applying Random Search to find the best model and the best parameters
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.model_selection import RandomizedSearchCV

# GaussianProcessClassifier has no `C` or `gamma` hyper-parameters, and its
# `kernel` argument must be a kernel object rather than a string such as
# 'rbf', so an SVC-style search space is rejected when the search is fitted.
# Sample from Gaussian process kernels instead: RBF and DotProduct (linear)
# kernels over a range of initial length_scale / sigma_0 values.
kernel_scales = (0.01, 0.1, 1.0, 10.0, 100.0)
parameters = [{'kernel': [1.0 * RBF(length_scale) for length_scale in kernel_scales]},
              {'kernel': [1.0 * DotProduct(sigma_0) for sigma_0 in kernel_scales]}]
# n_iter is set below the number of candidates (10) so that this really is a
# random subset; random_state makes the sampled candidates reproducible.
# NOTE: every sampled candidate is refitted once per fold, so this is expensive.
random_search = RandomizedSearchCV(estimator = classifier, param_distributions = parameters,
                                   n_iter = 5, scoring = 'accuracy', cv = 10, random_state = 0)
random_search = random_search.fit(X_train, y_train)
best_accuracy = random_search.best_score_
best_parameters = random_search.best_params_
print(best_accuracy)
print(best_parameters)