In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot

In [None]:
# Sanity check: confirm the Keras text-preprocessing helpers can be imported
# in this environment, and report the failure instead of raising.
try:
    from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
except ImportError as err:
    print("Import error:", err)
else:
    print("Imports successful!")


### Importing dataset

In [None]:
# Folder holding train.csv and test.csv. Set the FAKE_NEWS_DATA_DIR environment
# variable to point at your own copy; the fallback is the original hard-coded
# location, so existing setups keep working unchanged.
DATA_DIR = Path(
    os.environ.get(
        "FAKE_NEWS_DATA_DIR",
        r"C:\Users\aryan\Desktop\Projects\Fake-News-Classification",
    )
)

df = pd.read_csv(DATA_DIR / "train.csv")    # labelled training rows
test = pd.read_csv(DATA_DIR / "test.csv")   # rows to predict for the submission

In [None]:
# Peek at the first five training rows.
df.head(n=5)

### Data preprocessing and cleaning

In [None]:
# Replace every missing value with an empty string so the text columns can be
# concatenated later without producing NaN.
df = df.fillna(value='')
test = test.fillna(value='')

In [None]:
# Only the title and the author name are used as model input.
# Build a new column, `total`, holding "<title> <author>" for every row.
df['total'] = df['title'].add(' ').add(df['author'])
test['total'] = test['title'].add(' ').add(test['author'])

In [None]:
# Features: every column except the target.
X = df.drop(columns='label')
# Target as a plain array (one entry per training row).
y = df['label'].values

assert len(X) == len(y), "Mismatch in number of rows"

print(X.shape, y.shape, sep='\n')

In [None]:
# Encoding / model hyper-parameters.
voc_size = 5000       # vocabulary size used when hashing words to integers
embedding_dim = 40    # size of each embedding vector
lstm_units = 100      # LSTM hidden units
dense_units = 64      # units in the fully connected layer

# Work on copies so the text cleaning below never touches X / test directly.
msg, msg_test = X.copy(), test.copy()

In [None]:
# Stop words are very common words that add little meaning to a sentence,
# so they can be dropped without changing what the sentence says.
# Fetch NLTK's stop-word lists for use in the cleaning step.
import re

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

In [None]:
# Stemming maps each word to its root form; the Porter stemmer is used here.
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Containers for the cleaned text: one string per train / test row.
corpus, corpus_test = [], []

In [None]:
# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Clean every training document: keep letters only, lower-case, drop stop
# words, stem what is left, and re-join into one space-separated string.
# The list is rebuilt from scratch (rather than appended to) so re-running
# this cell cannot duplicate entries, and the column is iterated by position
# so a non-default DataFrame index cannot raise KeyError.
corpus = [
    ' '.join(
        ps.stem(word)
        for word in re.sub('[^a-zA-Z]', ' ', text).lower().split()
        if word not in stop_words
    )
    for text in msg['total']
]


In [None]:
# Clean every test document exactly like the training text: keep letters
# only, lower-case, drop stop words, stem, and re-join with single spaces.
# Rebuilding the list (instead of appending) keeps this cell safe to re-run,
# and iterating the column by position avoids label-based lookups that fail
# on a non-default index.
corpus_test = [
    ' '.join(
        ps.stem(word)
        for word in re.sub('[^a-zA-Z]', ' ', text).lower().split()
        if word not in stop_words
    )
    for text in msg_test['total']
]


In [None]:
# Encode each cleaned document as a list of integer word indices bounded by
# voc_size. one_hot() hashes words with Python's built-in hash(), which is
# salted per process, so the same word would map to a different index after a
# kernel restart. md5 is a stable hash, so the encoding (and any model trained
# on it) stays consistent across runs.
onehot_rep = [
    hashing_trick(words, voc_size, hash_function='md5') for words in corpus
]
onehot_rep_test = [
    hashing_trick(words, voc_size, hash_function='md5') for words in corpus_test
]

In [None]:
# Bring every encoded document to the same length (50 tokens), padding at the
# front, so the inputs form a rectangular array.
embedded_docs = pad_sequences(sequences=onehot_rep, maxlen=50, padding='pre')
embedded_docs_test = pad_sequences(sequences=onehot_rep_test, maxlen=50, padding='pre')

### Creating and training the model

In [None]:
# Architecture: word embedding -> LSTM -> dense ReLU layer -> sigmoid output,
# with 30% dropout after each of the first three stages. Layer sizes come from
# the hyper-parameter cell (embedding_dim, lstm_units, dense_units) instead of
# being repeated here as literals.
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=embedding_dim))  # Embedding layer
model.add(Dropout(0.3))
model.add(LSTM(lstm_units))                                         # LSTM layer
model.add(Dropout(0.3))
model.add(Dense(dense_units, activation='relu'))                    # Dense layer
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))                           # Output: one probability per row

# Binary target, so use binary cross-entropy and track accuracy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line.
model.summary()

In [None]:
# Materialise the model inputs and targets as NumPy arrays.
X_final = np.array(embedded_docs)
y_final = np.array(y)
test_final = np.array(embedded_docs_test)

# Expected: (num_samples, max_len), (num_samples,), then the test array shape.
print(X_final.shape, y_final.shape, test_final.shape, sep='\n')

In [None]:
# Fit on the full training set: 20 epochs, mini-batches of 64 rows.
model.fit(x=X_final, y=y_final, batch_size=64, epochs=20)

### Creating the submission file

In [None]:
# The output layer is a single sigmoid unit, so predict() yields one
# probability per test row in an (n_rows, 1) array.
y_pred = model.predict(test_final)

# argmax over a single-column array is always 0, so threshold instead:
# probability > 0.5 -> class 1, otherwise class 0.
y_pred_classes = (y_pred.ravel() > 0.5).astype(int)


In [None]:
# y_pred holds sigmoid probabilities in an (n_rows, 1) array. The `label`
# column is meant to carry 0/1 classes like the training labels, so flatten
# the predictions and threshold them at 0.5 before writing.
final_sub = pd.DataFrame({
    'id': test['id'],
    'label': (np.asarray(y_pred).ravel() > 0.5).astype(int),
})
final_sub.to_csv('submit.csv', index=False)

In [None]:
# Peek at the first five rows of the submission.
final_sub.head(n=5)