<a href="https://colab.research.google.com/github/Ameen2488/NLP_Projects/blob/main/Fake_News_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# Import the necessary modules
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [5]:
# Try using the 'python' engine and skipping bad lines
df = pd.read_csv('/content/fake_or_real_news.csv.crdownload', engine='python', on_bad_lines='skip')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [8]:
# Create a series to store the labels: y
y = df['label']

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size = 0.33, random_state = 53)

# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words = "english")

# Transform the training data using only the 'text' column values: count_train
count_train = count_vectorizer.fit_transform(X_train)

# Transform the test data using only the 'text' column values: count_test
count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer
# Use get_feature_names_out() for newer scikit-learn versions
print(count_vectorizer.get_feature_names_out()[:10])

['00' '000' '0000' '0002' '000km' '000x' '001' '003' '004' '004s']


In [11]:
# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words ='english', max_df = 0.7)

# Transform the training data: tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data: tfidf_test
tfidf_test = tfidf_vectorizer.transform(X_test)

# Print the first 10 features
print(tfidf_vectorizer.get_feature_names_out()[:10])

# Print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:5])


['00' '000' '0000' '0002' '000km' '000x' '001' '003' '004' '004s']
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.01270823 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [17]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names_out())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names_out())

# Print the head of count_df
print(count_df.head())

# Print the head of tfidf_df
print(tfidf_df.head())

# Calculate the difference in columns: difference
difference = set(tfidf_df.columns) - set(count_df.columns)
print(difference)

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))


   00  000  0000  0002  000km  000x  001  003  004  004s  ...  ťđ  ελληνικά  \
0   0    0     0     0      0     0    0    0    0     0  ...   0         0   
1   0    0     0     0      0     0    0    0    0     0  ...   0         0   
2   0    1     0     0      0     0    0    0    0     0  ...   0         0   
3   0    0     0     0      0     0    0    0    0     0  ...   0         0   
4   0    0     0     0      0     0    0    0    0     0  ...   0         0   

   октября  русский  эa  эin  эthe  яркий  عربي  ยงade  
0        0        0   0    0     0      0     0      0  
1        0        0   0    0     0      0     0      0  
2        0        0   0    0     0      0     0      0  
3        0        0   0    0     0      0     0      0  
4        0        0   0    0     0      0     0      0  

[5 rows x 43795 columns]
    00       000  0000  0002  000km  000x  001  003  004  004s  ...   ťđ  \
0  0.0  0.000000   0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0   
1  0.

###Training and testing the "fake news" model with CountVectorizer


In [16]:
# Import the necessary modules
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)


0.8848594741613781
[[479  89]
 [ 38 497]]


###Training and testing the "fake news" model with TfidfVectorizer

In [18]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)


0.8068902991840435
[[368 200]
 [ 13 522]]


### Improving The Model

In [22]:
# Create the list of alphas: alphas
alphas = np.arange(0, 1, .1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()


Alpha:  0.0
Score:  0.8676337262012692

Alpha:  0.1
Score:  0.8830462375339981

Alpha:  0.2
Score:  0.8785131459655485

Alpha:  0.30000000000000004
Score:  0.8694469628286491

Alpha:  0.4
Score:  0.8513145965548504

Alpha:  0.5
Score:  0.8368087035358114

Alpha:  0.6000000000000001
Score:  0.828649138712602

Alpha:  0.7000000000000001
Score:  0.8232094288304623

Alpha:  0.8




Score:  0.8213961922030825

Alpha:  0.9
Score:  0.814143245693563



###Inspecting your Model

In [26]:
# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names_out()

# Get the probabilities of features given a class: feature_probs
feature_probs = nb_classifier.feature_log_prob_

# Zip the feature names together with the probabilities and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(feature_probs[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])


FAKE [(-10.925871984197721, '000x'), (-10.925871984197721, '001'), (-10.925871984197721, '003'), (-10.925871984197721, '009'), (-10.925871984197721, '011'), (-10.925871984197721, '012'), (-10.925871984197721, '022'), (-10.925871984197721, '024'), (-10.925871984197721, '027'), (-10.925871984197721, '042'), (-10.925871984197721, '044'), (-10.925871984197721, '047'), (-10.925871984197721, '075'), (-10.925871984197721, '0843'), (-10.925871984197721, '091'), (-10.925871984197721, '093'), (-10.925871984197721, '1001'), (-10.925871984197721, '101st'), (-10.925871984197721, '102m'), (-10.925871984197721, '103rd')]
REAL [(-8.277162792028813, 'american'), (-8.266941593057535, 'donald'), (-8.24214687051742, 'time'), (-8.233549816528546, 'october'), (-8.206806285567117, 'war'), (-8.165159452062984, 'emails'), (-8.16336276236144, 'world'), (-8.161371261080067, 'new'), (-8.140642098946163, 'government'), (-8.132888142109032, 'like'), (-8.098779003578837, 'russia'), (-8.061346502933967, 'just'), (-8.