**Abhina Premachandran Bindu**
**April 28 2024**
# Using SHAP to understand the classification criteria
## Loading and initial cleaning

In [None]:
#pip install gensim numpy scikit-learn shap

In [None]:
# importing the libraries
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
shap.initjs()
import transformers
import keras
from wordcloud import WordCloud
from tensorflow.python.keras.engine import data_adapter
from tensorflow.python.keras.engine.keras_tensor import KerasTensor
# import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')

In [None]:
# Ask for the CSV location, load it into a DataFrame and show its (rows, columns)
csv_path = input('Enter the file path for the csv file:')
data = pd.read_csv(csv_path)
data.shape

In [None]:
data.head()

In [None]:
data.info()

In [None]:
# dropping the na values
# Rows containing any missing value are removed from `data` in place;
# the index is not reset here, so index labels may have gaps afterwards.
data.dropna(inplace=True)

In [None]:
# dropping the redundant 'Unnamed: 0' column
# errors='ignore' keeps this cell re-runnable: without it a second execution
# (or a CSV that was saved without the extra index column) raises KeyError.
data.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

In [None]:
# checking the value counts of 'target' to check for data imbalance
data.target.value_counts()

 Since the numbers of Fake and True samples are almost the same, there is no significant class imbalance.

In [None]:
data.subject.value_counts()

## Data Preprocessing

In [None]:
# Encode the class labels as integers following their sorted order.
# NOTE(review): the intended encoding is Real:1 and Fake:0, which holds only if
# the fake label sorts before the real one — confirm against value_counts().
target_labels = np.unique(data['target'])
class_mapping = dict(zip(target_labels, range(len(target_labels))))
data['target'] = data['target'].map(class_mapping)
data.head()

In [None]:
# Tokenise every article with gensim's simple_preprocess; each row of
# 'cleaned_text' then holds the list of tokens for that article.
tokenize = gensim.utils.simple_preprocess
data['cleaned_text'] = data['text'].map(tokenize)


In [None]:
data.head()

## Visualizing the data

In [None]:
# Split the articles by class (1 = real, 0 = fake); both frames are reused by later cells
data_real = data[data['target']==1]
data_fake = data[data['target']==0]

# Visualize how often each subject occurs in the real and in the fake articles.
# 'subject' is categorical, so the labels are counted and drawn as one bar per
# label. A histogram with bins=<number of labels> spreads its bins over the
# category positions unevenly, which leaves the bars shifted away from their
# tick labels.
fig, axs = plt.subplots(2, 1, figsize=(12, 12))

# (axis, class frame, class label, bar colour): real news on top, fake news below
subject_panels = (
    (axs[0], data_real, 'Real News', 'tab:blue'),
    (axs[1], data_fake, 'Fake News', 'red'),
)
for ax, class_df, class_label, bar_colour in subject_panels:
    subject_counts = class_df['subject'].value_counts()
    ax.bar(subject_counts.index.astype(str), subject_counts.values,
           edgecolor='black', color=bar_colour)
    ax.set_xlabel('subjects')
    ax.set_ylabel('Frequency')
    ax.set_title('Subject distribution in ' + class_label)
    ax.legend([class_label])

plt.tight_layout()
plt.show()

 From the subject distribution plots above, it is clear that most of the real news is concentrated in only two subject areas: 'politicsNews' and 'worldnews'. The fake news, in contrast, spans a wide variety of subject areas: 'News', 'politics', 'left-news', 'Government News', 'US_News' and 'Middle-east'. Most of these are not regular news subject areas, which hints at the fakeness of the articles.

In [None]:
# Collect every token of the real and of the fake articles into two flat lists;
# Counter is imported here because the next cell uses it to find the most frequent words.
from collections import Counter
all_real_words = []
all_fake_words = []
# The class label is read from the 'target' column by name, once per article.
# Looking it up by column position (iloc[i, 2]) depends on the column order of
# the CSV and was repeated for every single word.
for words, label in zip(data['cleaned_text'], data['target']):
    if label == 1:
        all_real_words.extend(words)
    else:
        all_fake_words.extend(words)

In [None]:
# Tally token frequencies per class, then keep the 20 most frequent tokens of each
word_counts_real, word_counts_fake = map(Counter, (all_real_words, all_fake_words))

most_common_words_real = word_counts_real.most_common(20)
most_common_words_fake = word_counts_fake.most_common(20)

In [None]:
# One small DataFrame per class holding its most common words and their counts
common_words_realdf = pd.DataFrame(most_common_words_real, columns=['Word', 'Frequency'])
common_words_fakedf = pd.DataFrame(most_common_words_fake, columns=['Word', 'Frequency'])

fig, axs = plt.subplots(2, 1, figsize=(12, 12))

# (axis, frequency frame, bar colour, class label): real news on top, fake news below
panel_specs = (
    (axs[0], common_words_realdf, 'blue', 'Real News'),
    (axs[1], common_words_fakedf, 'red', 'Fake News'),
)
for ax, freq_df, bar_colour, class_label in panel_specs:
    ax.bar(freq_df['Word'], freq_df['Frequency'], color=bar_colour)
    ax.set_xlabel('Word')
    ax.set_ylabel('Frequency')
    ax.set_title('Most Frequent Words in ' + class_label)
    ax.tick_params(axis='x', rotation=45)  # rotated so the word labels stay readable
    ax.legend([class_label])

plt.tight_layout()
plt.show()

In [None]:
# Concatenate the raw article texts of each class into one space-separated
# string (these strings feed the word clouds below)
text_real = ' '.join(data_real['text'])
text_fake = ' '.join(data_fake['text'])

In [None]:
# Build one word cloud per class from the concatenated article text
wordcloud1 = WordCloud().generate(text_real)
wordcloud2 = WordCloud().generate(text_fake)

# Show the two clouds stacked vertically: real news on top, fake news below
fig, axs = plt.subplots(2, 1, figsize=(12, 12))
cloud_titles = ('Wordcloud of real news data', 'Wordcloud of fake news data')
for ax, cloud, cloud_title in zip(axs, (wordcloud1, wordcloud2), cloud_titles):
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(cloud_title)
plt.show()

<p>Both the bar charts and the word clouds above indicate that the most frequent words in real and fake news are largely the same. Therefore, one cannot conclude whether a news item is fake or real from the distribution of words alone.</p><p>The only words that differ between the two classes are 'Government' and 'reuters'. 'reuters' is the name of a credible news source, while 'Government' indicates authority. The fake news therefore fails to establish the credibility of what it conveys, either by citing an authority or by having a credible identity.</p>

## Building, training and using the gensim Word2Vec model to get the word vectors

In [None]:
# Configure the Word2Vec model: window=6, min_count=1, workers=4.
# NOTE(review): no seed is passed here, so the learned vectors may differ from run to run.
model = gensim.models.Word2Vec(window=6, min_count=1, workers=4)

# Build the vocabulary from the tokenised articles before training
model.build_vocab(data['cleaned_text'])

In [None]:
import os

# training the model
model.train(data['cleaned_text'], total_examples=model.corpus_count, epochs=5)

# saving the model
# The target directory is created first: on a fresh checkout "word2vec/" does
# not exist and saving into a missing directory fails.
os.makedirs("word2vec", exist_ok=True)
model.save("word2vec/word2vec_model")

In [None]:
model.wv.index_to_key[:5]

In [None]:
len(model.wv.index_to_key)

In [None]:
# a function for finding the average of the word vectors 
def get_average_word2vec_vector(text, model, word_dim):
    """Return the mean word vector of the tokens in `text`.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    model : object exposing `wv`
        `model.wv` must support `token in model.wv` and `model.wv[token]`.
    word_dim : int
        Length of the vectors stored in `model.wv`.

    Returns
    -------
    numpy.ndarray
        Array of shape (word_dim,). Tokens missing from `model.wv` are skipped;
        if no token is found, the all-zero vector is returned.
    """
    keyed_vectors = model.wv
    total = np.zeros((word_dim,))
    n_known = 0
    for token in text:
        if token not in keyed_vectors:
            continue
        total += keyed_vectors[token]
        n_known += 1
    # Nothing to average: keep the zero vector and avoid dividing by zero
    if n_known == 0:
        return total
    total /= n_known
    return total

# Vector length of the trained Word2Vec model
word_dim = model.vector_size

# One averaged vector per article, kept in the row order of 'cleaned_text'
word_vectors = []
for doc_tokens in data['cleaned_text']:
    word_vectors.append(get_average_word2vec_vector(doc_tokens, model, word_dim))


In [None]:
# adding the word vectors to the data
# `word_vectors` is a list with one entry per article, so it is assigned row by
# row in order (presumably one NumPy array per cell — confirm the column dtype).
data['word_vectors'] = word_vectors

In [None]:
data.head()

## Classifying the data

In [None]:
# importing necessary libraries for model building
from sklearn.model_selection import train_test_split

In [None]:
# Features: the averaged word vectors; labels: the encoded 'target' column
X, y = word_vectors, data['target'].values

In [None]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.33, random_state=44)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Turn each list of 1-D vectors into a 2-D (samples, features) array for the classifier
X_train_2d, X_test_2d = np.vstack(X_train), np.vstack(X_test)
X_train_2d.shape , X_test_2d.shape

In [None]:
# importing the model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Gradient-boosted trees on the averaged Word2Vec features.
# random_state pins the estimator's internal randomness so the scores printed
# below can be reproduced; 44 matches the seed used for the train/test split.
clf = GradientBoostingClassifier(random_state=44)

clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)
# printing the classification report for validation of the model
print(classification_report(y_test, y_pred))

 The Gradient Boosting classifier classifies the text data as real or fake with an accuracy score of 96%.
 ## Understanding the performance of the classifier - using SHAP

In [None]:
# Build a SHAP explainer for the fitted classifier; the training matrix is passed
# as the second argument (presumably used as background data — confirm against the shap docs).
explainer = shap.Explainer(clf, X_train_2d)
# Compute SHAP values for every row of the test matrix
shap_values = explainer(X_test_2d)
# The waterfall plot explains a single prediction: the first test sample only
shap.plots.waterfall(shap_values[0])

 The SHAP waterfall plot above (drawn for the first test sample) indicates that features 94, 93, and 82 contributed most to deciding whether that text is real or fake. Further analysis needs to be done to find the words corresponding to these features.