In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import psycopg2 as psycopg2
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import *
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix 
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import time

In [3]:
startTid = time.time()

In [4]:
# Connect to an existing database
conn = None
try:
    conn = psycopg2.connect(#database="public",
                        port = "5432",
                        host="localhost", 
                        user="postgres", 
                        password="1234")
except psycopg2.DatabaseError as e: 
        print (f'Error {e}')
        sys.exit(1)
            
    # Open a cursor to perform database operations
cur = conn.cursor()

    # Execute a command: this creates a new table

cur.execute(("""SELECT type, content FROM article WHERE type IN ('fake','satire','bias','conspiracy','junksci','clickbait','political','reliable','state') """)) #ORDER by random() LIMIT {};""".format(totalRow)))
df = (cur.fetchmany(10000))

    # Make the changes to the database persistent
conn.commit()

    # Close communication with the database
cur.close()
conn.close()

In [5]:
df_articles = pd.DataFrame(df, columns =["type", "article"])

In [6]:
# Sorting our y variables into FAKE or REAL
df_articles["type"] = np.where(df_articles["type"] == "hate", "FAKE", df_articles["type"])
df_articles["type"] = np.where(df_articles["type"] == "fake", "FAKE", df_articles["type"])
df_articles["type"] = np.where(df_articles["type"] == "unreliable", "FAKE", df_articles["type"])
df_articles["type"] = np.where(df_articles["type"] == "conspiracy", "FAKE", df_articles["type"])

df_articles["type"] = np.where(df_articles["type"] == "satire", "FAKE", df_articles["type"])
df_articles["type"] = np.where(df_articles["type"] == "junksci", "FAKE", df_articles["type"])
df_articles["type"] = np.where(df_articles["type"] == "bias", "FAKE", df_articles["type"])

df_articles["type"] = np.where(df_articles["type"] == "clickbait", "REAL", df_articles["type"])
df_articles["type"] = np.where(df_articles["type"] == "political", "REAL", df_articles["type"])

df_articles["type"] = np.where(df_articles["type"] == "reliable", "REAL", df_articles["type"])
df_articles["type"] = np.where(df_articles["type"] == "state", "REAL", df_articles["type"])

df_articles["type"] = np.where(df_articles["type"] != "FAKE", "REAL", df_articles["type"])

In [7]:
print(df_articles.shape)

(10000, 2)


In [8]:
stemmer = PorterStemmer()
words = stopwords.words("english")
Data_transformed = df_articles['article'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

In [9]:
# Creating a vectorized array with integers indicating frequency of words.
vectorize = CountVectorizer(max_features=200,ngram_range=(1,3))
x = vectorize.fit_transform(Data_transformed).toarray()
y = np.array(df_articles['type'])

In [10]:
# Creating a test split of 80/20.
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=34)
print('\nTotal number of samples in the Train Dataset :',xtrain.shape[0])
print('Total number of samples in the Test Dataset :',xtest.shape[0])


Total number of samples in the Train Dataset : 8000
Total number of samples in the Test Dataset : 2000


In [11]:
# Second baseline model using multinomial naive bayes.
mulNB = MultinomialNB().fit(xtrain, ytrain)
print('\nAccuracy score for Multinomial Naive Bayes :',mulNB.score(xtest,ytest))

ypred_mul = mulNB.predict(xtest)

print('Confusion Metrics for Multinomial Naive Bayes : \n\n',confusion_matrix(ytest, ypred_mul),'\n\n')
print('Classification Report for Multinomial Naive Bayes :\n\n',classification_report(ytest,ypred_mul))


Accuracy score for Multinomial Naive Bayes : 0.713
Confusion Metrics for Multinomial Naive Bayes : 

 [[465 197]
 [377 961]] 


Classification Report for Multinomial Naive Bayes :

               precision    recall  f1-score   support

        FAKE       0.55      0.70      0.62       662
        REAL       0.83      0.72      0.77      1338

    accuracy                           0.71      2000
   macro avg       0.69      0.71      0.69      2000
weighted avg       0.74      0.71      0.72      2000



In [12]:
# Third baseline model using Logistic regression.
scaler = StandardScaler()
lr = LogisticRegression(max_iter = 600)
log_model = Pipeline([('standardize', scaler),
                    ('log_reg', lr)])

log_model.fit(xtrain, ytrain)

ypred_log = log_model.predict(xtest)
test_accuracy = accuracy_score(ytest, ypred_log)*100

print('Testing accuracy for Logistic regression: %.4f %%' % test_accuracy) 
print('Confusion Metrics for Logistic regression:\n\n', confusion_matrix(ytest, ypred_log))
print('Classification Report for Multinomial Naive Bayes :\n\n',classification_report(ytest,ypred_log))

Testing accuracy for Logistic regression: 78.0000 %
Confusion Metrics for Logistic regression:

 [[ 347  315]
 [ 125 1213]]
Classification Report for Multinomial Naive Bayes :

               precision    recall  f1-score   support

        FAKE       0.74      0.52      0.61       662
        REAL       0.79      0.91      0.85      1338

    accuracy                           0.78      2000
   macro avg       0.76      0.72      0.73      2000
weighted avg       0.77      0.78      0.77      2000



In [13]:
# from sklearn.svm import SVC
# from sklearn.pipeline import make_pipeline
# from sklearn import preprocessing
# from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# clf2 = make_pipeline(StandardScaler(), SVC(C=1, gamma='auto', kernel='linear'))
# clf2.fit(xtrain, ytrain)
# predict2 = clf2.predict(xtest)

# test_accuracy2 = accuracy_score(ytest, predict2)*100
# print('Testing accuracy for SVM: %.4f %%' % test_accuracy2) 
# print('Testing accuracy for SVM: %.4f %%' % test_accuracy2) 

In [14]:
# Har erstattet den ovenstående SVC model med denne, efter råd fra sklearns hjemmeside. 
from sklearn import svm

clfsvm = svm.LinearSVC(max_iter=80000)
clfsvm.fit(xtrain, ytrain)
clfsvm.score(xtrain, ytrain)

predicSVM = clfsvm.predict(xtest)
clfsvm_acc = accuracy_score(ytest, predicSVM)*100

print('Testing accuracy for svm: %.4f %%' % clfsvm_acc) 

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf3 = RandomForestClassifier(max_depth=50, random_state=1)
clf3.fit(xtrain, ytrain)

predictForest = clf3.predict(xtest)
test_accuracy3 = accuracy_score(ytest, predictForest)*100

print('Testing accuracy for Forest: %.4f %%' % test_accuracy3) 

Testing accuracy for Forest: 84.1000 %


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid = {'C': [0.1, 1],
              'gamma': [1, 0.1],
              'kernel': ['rbf', 'linear']}
 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
# fitting the model for grid search
grid.fit(xtrain, ytrain)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.676 total time=   3.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.680 total time=   3.1s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.678 total time=   3.1s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.679 total time=   3.3s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.674 total time=   3.1s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.781 total time=   5.8s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.761 total time=   4.7s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.759 total time=   5.1s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.758 total time=   4.2s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.768 total time=   5.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.675 total time=   2.6s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;,

In [None]:
print (grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

{'C': 1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=1, gamma=1, kernel='linear')
0.8150000000000001


In [None]:
slutTid = time.time()
kage2 = slutTid-startTid
print(kage2)
print("Tid i min:" ,kage2/60)

18.104356050491333
Tid i min: 0.30173926750818886
