In [1]:
# Import the necessary modules
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("..data/WELFake_Dataset.csv")
print(df.head())

   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


In [4]:
df = df.dropna()

#### CountVectorizer for text classification 

In [5]:
# Create a series to store the labels: y
y = df['label']
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'],y,
test_size=0.33, random_state=53)

# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words='english')
# Transform the training data using only the 'text' column values: count_train 
count_train = count_vectorizer.fit_transform(X_train.values)
# Transform the test data using only the 'text' column values: count_test 
count_test = count_vectorizer.transform(X_test.values)
# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])


['00', '000', '0000', '000000031', '00000031', '000035', '00004', '000048', '000063', '00007']




#### TfidfVectorizer for text classification
Similar to the sparse CountVectorizer created in the previous exercise, you'll work on creating tf-idf vectors for your documents. You'll set up a TfidfVectorizer and investigate some of its features.

In [6]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english',max_df=0.7)
# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
# Transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test.values)
# Print the first 10 features
print(tfidf_vectorizer.get_feature_names()[:10])
# Print the first 5 vectors of the tfidf training data
print(tfidf_train[:5])

['00', '000', '0000', '000000031', '00000031', '000035', '00004', '000048', '000063', '00007']
  (0, 79763)	0.0448134380402158
  (0, 67655)	0.0477897152422986
  (0, 105801)	0.06791117493695357
  (0, 57445)	0.09116414058408855
  (0, 79756)	0.035968349036515995
  (0, 60550)	0.03838432835123006
  (0, 114531)	0.07086505813835578
  (0, 151555)	0.056930102839383226
  (0, 61514)	0.07551120228367868
  (0, 126009)	0.04606423926784623
  (0, 158182)	0.043644892131251924
  (0, 169706)	0.04082775987417706
  (0, 82134)	0.040079685308456875
  (0, 159926)	0.0446290201996305
  (0, 22561)	0.051713015274712094
  (0, 19349)	0.03799905356638148
  (0, 5403)	0.05576420433807418
  (0, 60643)	0.06643576263992591
  (0, 39642)	0.05114711087344502
  (0, 40953)	0.040224337764138786
  (0, 41927)	0.08655782644474604
  (0, 20577)	0.0665522893065148
  (0, 128656)	0.07157295982801841
  (0, 175755)	0.03043408330642221
  (0, 55895)	0.06601753930461074
  :	:
  (4, 55029)	0.04621917150706667
  (4, 81779)	0.0529988808458487

#### Inspecting the vectors
To get a better idea of how the vectors work, you'll investigate them by converting them into pandas DataFrames.

In [None]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())
# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df =  pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
# Print the head of count_df
print(count_df.head())
# Print the head of tfidf_df
print(tfidf_df.head())
# calculate the difference in columns: difference
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)
# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))


#### raining and testing the "fake news" model with CountVectorizer
Now it's your turn to train the "fake news" model using the features you identified and extracted. In this first exercise you'll train and test a Naive Bayes model using the CountVectorizer data.

In [8]:
# Import the necessary module
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)
# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0,1])
print(cm)


0.8900796340223653
[[10493  1050]
 [ 1545 10520]]


#### Training and testing the "fake news" model with TfidfVectorizer
Now that you have evaluated the model using the CountVectorizer, you'll do the same using the TfidfVectorizer with a Naive Bayes model.

In [10]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)
# Create the predicted tags: pred
pred =  nb_classifier.predict(tfidf_test)
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[0,1])
print(cm)


0.866867163673331
[[10100  1443]
 [ 1700 10365]]


#### Improving your model
Your job in this exercise is to test a few different alpha levels using the Tfidf vectors to determine if there is a better performing combination.

In [11]:
# Create the list of alphas: alphas
alphas = np.arange(0,1,0.1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()


Alpha:  0.0
Score:  0.891562182311081

Alpha:  0.1
Score:  0.8805913249745849

Alpha:  0.2


  % _ALPHA_MIN


Score:  0.8770755676042019

Alpha:  0.30000000000000004
Score:  0.874364622161979

Alpha:  0.4
Score:  0.8722466960352423

Alpha:  0.5
Score:  0.8711453744493393

Alpha:  0.6000000000000001
Score:  0.8703405625211793

Alpha:  0.7000000000000001
Score:  0.8691968824127414

Alpha:  0.8
Score:  0.8686462216197899

Alpha:  0.9
Score:  0.8675872585564216



#### Inspecting your model
Now that you have built a "fake news" classifier, you'll investigate what it has learned. You can map the important vector weights back to actual words using some simple inspection techniques.

In [12]:
# Get the class labels: class_labels
class_labels = nb_classifier.classes_
# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names()
# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))
# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])
# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])


0 [(-12.973138524212128, '000000031'), (-12.973138524212128, '00000031'), (-12.973138524212128, '00004'), (-12.973138524212128, '000063'), (-12.973138524212128, '00042'), (-12.973138524212128, '0009'), (-12.973138524212128, '000cases'), (-12.973138524212128, '000ft'), (-12.973138524212128, '000m'), (-12.973138524212128, '000x'), (-12.973138524212128, '00106'), (-12.973138524212128, '00155'), (-12.973138524212128, '0019'), (-12.973138524212128, '00193'), (-12.973138524212128, '001st'), (-12.973138524212128, '0020'), (-12.973138524212128, '0024'), (-12.973138524212128, '00458'), (-12.973138524212128, '0050'), (-12.973138524212128, '005380')]
1 [(-7.414572460259855, 'state'), (-7.3929470644789514, 'white'), (-7.385637235987291, 'media'), (-7.384880760087839, 'time'), (-7.376661963703968, 'campaign'), (-7.374418879355726, '2016'), (-7.371352750938148, 'america'), (-7.3435049592447434, 'new'), (-7.322418475677631, 'news'), (-7.279459070867028, 'election'), (-7.1575736818489535, 'donald'), (

