<a href="https://colab.research.google.com/github/BeanieBeta/ds-content-interactive-jupyterlab-tutorial/blob/master/Fake_News_TfidfVectorizer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing relevant libraries
import numpy as np
import pandas as pd
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from google.colab import drive

In [None]:
# Mount Google Drive, then load the fake-news and true-news CSV files stored on it
drive.mount('/content/gdrive')
df_fake, df_true = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/My Drive/Fake.csv',
        '/content/gdrive/My Drive/True.csv',
    )
)

Mounted at /content/gdrive


In [None]:
# Tag every row with its class before merging: False marks fake news, True marks true news
df_fake['label'], df_true['label'] = False, True
# Stack both frames and shuffle the rows (frac=1 keeps every row; random_state=1 fixes the order)
combine_set = (
    pd.concat([df_fake, df_true])
    .sample(frac=1, random_state=1)
)

In [None]:
# Pull the label column out of the combined frame to use as the classification target
labels = combine_set['label']

In [None]:
# Hold out 20% of the articles as a test set and train on the remaining 80%.
# The features are the article text; the targets are the labels.
# random_state=7 seeds the shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    combine_set['text'],
    labels,
    test_size=0.2,
    random_state=7,
)
# Bag-of-words counts: English stop words and terms found in more than 85% of the
# documents are ignored, and the vocabulary is capped at 10,000 terms.
# Note that this vectorizer is fitted on the test split.
cv = CountVectorizer(max_features=10000, stop_words='english', max_df=0.85)
word_count_vector = cv.fit_transform(x_test)
# Peek at the first ten vocabulary terms
list(itertools.islice(cv.vocabulary_, 10))

['time',
 'cracking',
 'jokes',
 'kids',
 'middle',
 'address',
 'world',
 'horrible',
 'terrorist',
 'attack']

In [None]:
# TfidfVectorizer turns raw text documents into a matrix of TF-IDF features,
# i.e. words that have been given a numerical weight representing how significant they are.
# stop_words='english' removes English stop words from the text, e.g. "a", "an", "the", "be".
# max_df=0.7 leaves out every word that appears in more than 70% of the given documents;
# past that level of commonality a word is just noise for the model, much like a stop word,
# and does not help distinguish False articles from True articles.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
# Learn the vocabulary and weights from the training text and vectorize it ...
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
# ... then vectorize the test text with that same fitted vectorizer (no re-fitting)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [None]:
# Initializes a Passive Aggressive Classifier and assigns it to "pac".
# Passive-aggressive algorithms are a family of machine learning algorithms for
# large-scale learning (a good fit for our datasets with 20,000+ articles).
# max_iter=50 is an upper bound on the number of passes over the training data;
# more passes do not automatically mean a better model, so the cap is kept moderate.
# random_state pins the shuffling of the training data between passes so that the
# metrics printed below are the same on every run.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)
# predict on the test set and calculate accuracy, recall, precision, and f1
y_pred = pac.predict(tfidf_test)
print(f'Accuracy: {round(accuracy_score(y_test,y_pred)*100,2)}%')
print(f'Recall: {round(recall_score(y_test,y_pred)*100,2)}%')
print(f'Precision: {round(precision_score(y_test,y_pred)*100,2)}%')
print(f'F1: {round(f1_score(y_test,y_pred)*100,2)}%')

Accuracy: 99.4%
Recall: 99.58%
Precision: 99.16%
F1: 99.37%


In [None]:
def sort_coo(coo_matrix):
    """Return the matrix entries as (column index, value) pairs, highest value first.

    Pairs with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n distinct features.

    Args:
        feature_names: sequence mapping a column index to its feature name.
        sorted_items: iterable of (column index, score) pairs, expected to be
            ordered best-first (as produced by ``sort_coo``).
        topn: maximum number of distinct features to return.

    Returns:
        dict mapping feature name -> score rounded to 3 decimals, in the order
        the features are first encountered in ``sorted_items``.
    """
    results = {}
    for idx, score in sorted_items:
        # stop as soon as enough distinct features have been collected
        if len(results) >= topn:
            break
        name = feature_names[idx]
        # The same feature can occur more than once (e.g. when the pairs come from a
        # matrix with several rows). Keep only its first, best-ranked score instead of
        # letting a later, lower score overwrite it.
        if name not in results:
            results[name] = round(score, 3)
    return results

In [None]:
# Fit IDF weights on the bag-of-words counts computed earlier
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on older releases that lack it
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
tf_idf_vector = tfidf_transformer.transform(cv.transform(x_test))

# sort the tf-idf entries by descending order of scores;
# the matrix holds every transformed test document, so the ranking spans all of them
sorted_items = sort_coo(tf_idf_vector.tocoo())

# extract only the top n; n here is 100
keywords = extract_topn_from_vector(feature_names, sorted_items, 100)
print("\n===Keywords===")
for word, score in keywords.items():
    print(word, score)


===Keywords===
watch 1.0
spread 1.0
losers 1.0
great 1.0
divide 1.0
appear 1.0
popick 0.885
fundraiser 0.878
guo 0.861
barrett 0.856
watters 0.852
00pm 0.85
penn 0.847
youtu 0.846
bannon 0.838
electors 0.836
update 0.835
ailes 0.771
veterans 0.824
race 0.823
memes 0.821
ramaphosa 0.82
hannity 0.817
bharara 0.817
tapper 0.815
band 0.815
tyson 0.813
progressives 0.812
edt 0.797
shorter 0.807
words 0.807
senecal 0.803
menendez 0.802
miss 0.801
burns 0.8
dolly 0.799
riina 0.797
flake 0.793
macy 0.775
dayton 0.792
keurig 0.792
shumpert 0.792
franken 0.792
taiwan 0.789
clyburn 0.787
conyers 0.787
rohrabacher 0.787
pakistan 0.786
limit 0.785
saakashvili 0.785
jackson 0.784
mccain 0.783
predictions 0.782
kerry 0.78
cher 0.779
stewart 0.779
johnston 0.778
outrageous 0.778
cuomo 0.778
grayson 0.777
moore 0.777
garland 0.776
boehner 0.775
sessions 0.772
kosovo 0.772
soros 0.769
sabo 0.772
duterte 0.772
lance 0.771
nye 0.771
sarsour 0.77
brotherhood 0.77
gorka 0.769
cruz 0.768
becky 0.767
warmbie

In [None]:
# sorted_items holds one pair per nonzero tf-idf entry, so printing all of it exceeds the
# notebook's IOPub data rate limit; show only the first few pairs instead
print(sorted_items[:20])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

