# Problem Statement
Develop a deep learning (DL) model that identifies when a news article is likely to be fake.

In [None]:
!pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.9.4-cp310-cp310-win_amd64.whl.metadata (3.5 kB)
Downloading wordcloud-1.9.4-cp310-cp310-win_amd64.whl (299 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.4


In [None]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 480.1 kB/s eta 0:00:03
   -------------------- ------------------- 0.8/1.5 MB 578.7 kB/s eta 0:00:02
   -----

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import nltk # natural lang toolkit
import warnings
%matplotlib inline


warnings.filterwarnings('ignore')

In [None]:
# Ask for an upload only when an input file read later in this notebook is
# missing from the working directory, and only when the Colab helper can be
# imported. This keeps "Restart & Run All" working both on Colab and locally.
import os

uploaded = {}
missing_inputs = [
    name for name in ("train.csv", "glove.6B.100d.txt") if not os.path.exists(name)
]
if missing_inputs:
    try:
        from google.colab import files
    except ImportError:
        # Not on Colab: the files have to be placed next to the notebook by hand.
        print("Missing input files (copy them next to this notebook):", missing_inputs)
    else:
        uploaded = files.upload()

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Read the training data from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)

In [None]:
# Headline of the row labelled 0.
df.loc[0, 'title']

In [None]:
# Article body of the row labelled 0.
df.loc[0, 'text']

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
df.info()

# Preprocessing

In [None]:
# Only the article text and its label are used from here on.
df = df.drop(['id', 'title', 'author'], axis='columns')

In [None]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna(axis='index', how='any')

In [None]:
# Number of rows left after dropping incomplete records.
df.shape[0]

In [None]:
# Start the cleaned column as a lower-cased copy of the article text.
df.loc[:, 'clean_news'] = df['text'].str.lower()
df.loc[:, 'clean_news']

In [None]:
# 1. Replace every character that is neither a word character nor whitespace
#    with a space, so punctuation never glues two words together.
# 2. Collapse every run of whitespace (including newlines) into ONE space.
#    Substituting the empty string would concatenate neighbouring words, and
#    without regex=True pandas >= 2.0 treats the pattern as a literal string,
#    which makes the call a silent no-op.
# 3. Trim the leading/trailing space that steps 1-2 can leave behind.
df['clean_news'] = (
    df['clean_news']
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
df['clean_news']

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus, which the stop-word filtering cell reads.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# `stop` stays a list (as before); the frozenset copy gives O(1) membership
# tests instead of scanning the whole list once per token of every article.
stop = stopwords.words('english')
stop_lookup = frozenset(stop)

# For each article: split into words, drop the stop words, and join the
# remaining words back into a single space-separated string.
df['clean_news'] = df['clean_news'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)
df.head()

# Exploratory Data Analysis

In [None]:
# Most frequent words across the whole cleaned corpus.
all_words = ' '.join(df['clean_news'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
)
wordcloud.generate(all_words)

plt.figure(figsize=(15, 9))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Most frequent words in the articles labelled 0 (genuine news).
all_words = ' '.join(df.loc[df['label'] == 0, 'clean_news'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
)
wordcloud.generate(all_words)

plt.figure(figsize=(15, 9))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Most frequent words in the articles labelled 1 (fake news).
all_words = ' '.join(df.loc[df['label'] == 1, 'clean_news'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
)
wordcloud.generate(all_words)

plt.figure(figsize=(15, 9))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Word Embeddings

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Learn the vocabulary from the cleaned articles.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_news'])

# word -> integer id mapping built by the tokenizer; its length is the
# number of distinct words seen.
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index)
vocab_size

In [None]:
# Encode each article as a list of word ids, then force every list to exactly
# 500 ids: shorter ones are padded at the end, longer ones are cut at the end.
sequences = tokenizer.texts_to_sequences(df['clean_news'])
padded_seq = pad_sequences(
    sequences,
    maxlen=500,
    padding='post',
    truncating='post',
)
padded_seq

In [None]:
#from google.colab import files
#uploaded = files.upload()

In [None]:
# Build a word -> vector lookup from the GloVe text file. Each line holds a
# token followed by its vector components, all separated by whitespace.
embedding_index = {}
embeddings_file_ColabV = "glove.6B.100d.txt"
with open(embeddings_file_ColabV, encoding='utf-8') as glove_file:
    for row in glove_file:
        parts = row.split()
        embedding_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

In [None]:
# One 100-wide row per tokenizer id (plus row 0, which no word uses). Words
# that have no entry in `embedding_index` keep their all-zero row.
embedding_matrix = np.zeros((vocab_size + 1, 100))
for token, row_id in word_index.items():
    vector = embedding_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row_id] = vector

In [None]:
# Row 1 of the matrix: the vector stored for the word whose tokenizer id is 1
# (all zeros if that word was not found in the embedding index).
embedding_matrix[1]

# Input split

In [None]:
# Padded id sequence at position 1.
padded_seq[1]

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label so both parts keep the same class
# balance; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    padded_seq,
    df['label'],
    test_size=0.20,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# First sample (position 0) of the training inputs.
x_train[0]

In [None]:
# y_train is a pandas Series that still carries the original (shuffled,
# gap-containing) row labels, so `y_train[0]` would look up the row LABELLED 0,
# which may not be in the training split at all. `.iloc[0]` returns the first
# element by position, i.e. the label that belongs to `x_train[0]`.
y_train.iloc[0]

# Model

In [None]:
from keras.layers import LSTM,Dense, Dropout, Embedding
from keras import Sequential

In [None]:
# Two stacked LSTMs on top of frozen pre-trained word vectors, followed by a
# small fully connected head that outputs one probability.
model = Sequential([
    # Rows are initialised from `embedding_matrix`; trainable=False keeps them fixed.
    Embedding(input_dim=vocab_size + 1, output_dim=100,
              weights=[embedding_matrix], input_shape=(500,), trainable=False),
    Dropout(0.2),
    # return_sequences=True hands every timestep to the second LSTM.
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dropout(0.2),
    # Dense layers default to a linear activation; without a non-linearity the
    # 512 -> 256 -> 1 stack collapses into a single linear map, so ReLU is set
    # explicitly on the hidden layers.
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [None]:
# Binary classification set-up: Adam optimiser, cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# `validation_data` is documented as a tuple (x_val, y_val), so pass a tuple
# rather than a list.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the reported validation metrics are not an independent test estimate.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=(x_test, y_test),
)

In [None]:
# One figure per tracked metric: the training curve and the validation curve
# (legend entry 'Test') plotted against the epoch number.
for train_key, val_key in (('accuracy', 'val_accuracy'), ('loss', 'val_loss')):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.xlabel('epochs')
    plt.ylabel(train_key)
    plt.legend(['Train', 'Test'])
    plt.show()