In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

In [22]:
try:
    from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
    print("Imports successful!")
except ImportError as e:
    print("Import error:", e)


Imports successful!


### Importing dataset

In [23]:
df = pd.read_csv(r"C:\Users\..file path..\train.csv")
test = pd.read_csv(r"C:\Users\..file path..\test.csv")

In [24]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


### Data preprocessing and cleaning

In [25]:
#filling NULL values with empty string
df=df.fillna('')
test=test.fillna('')

In [26]:
# We will be only using title and author name for prediction
# Creating new coolumn total concatenating title and author
df['total'] = df['title']+' '+df['author']
test['total']=test['title']+' '+test['author']

In [27]:
X = df.drop('label',axis=1)
y=df['label']

assert X.shape[0] == y.shape[0], "Mismatch in number of rows"

y = df['label'].values  # or y = df['label'].squeeze()

print(X.shape)
print(y.shape)

(20800, 5)
(20800,)


In [28]:
#Choosing vocabulary size to be 5000 and copying data to msg for further cleaning
voc_size = 5000
embedding_dim = 40
lstm_units = 100
dense_units = 64
msg = X.copy()
msg_test = test.copy()

In [29]:
#Downloading stopwords 
#Stopwords are the words in any language which does not add much meaning to a sentence.
#They can safely be ignored without sacrificing the meaning of the sentence.
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
#We will be using Stemming here
#Stemming map words to their root forms
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
corpus_test = []

In [31]:
# Initialize the stemmer and stopwords
stop_words = set(stopwords.words('english'))

# Process each review
for i in range(len(msg)):
    # Text cleaning and preprocessing
    review = re.sub('[^a-zA-Z]', ' ', msg['total'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)


In [32]:
# Process each review in the test data
for i in range(len(msg_test)):
    # Text cleaning and preprocessing
    review = re.sub('[^a-zA-Z]', ' ', msg_test['total'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus_test.append(review)


In [33]:
# Converting to one hot representation
onehot_rep = [one_hot(words,voc_size)for words in corpus]
onehot_rep_test = [one_hot(words,voc_size)for words in corpus_test]

In [34]:
#Padding Sentences to make them of same size
embedded_docs = pad_sequences(onehot_rep,padding='pre',maxlen=50)
embedded_docs_test = pad_sequences(onehot_rep_test,padding='pre',maxlen=50)

# Creating and training model

In [35]:
# Initialize the model
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=40))  # Embedding layer
model.add(Dropout(0.3))                                # Dropout layer
model.add(LSTM(100))                                  # LSTM layer
model.add(Dropout(0.3))                                # Dropout layer
model.add(Dense(64, activation='relu'))                # Dense layer
model.add(Dropout(0.3))                                # Dropout layer
model.add(Dense(1, activation='sigmoid'))              # Output layer

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
print(model.summary())

None


In [36]:
#Converting into numpy array
X_final = np.array(embedded_docs)
y_final = np.array(y)
test_final = np.array(embedded_docs_test)
X_final.shape,y_final.shape,test_final.shape

print(X_final.shape)  # Should be (num_samples, max_len)
print(y_final.shape)  # Should be (num_samples,)
print(test_final.shape)

(20800, 50)
(20800,)
(5200, 50)


In [37]:
#training model
model.fit(X_final,y_final,epochs=20,batch_size=64)

Epoch 1/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.8763 - loss: 0.2796
Epoch 2/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9950 - loss: 0.0185
Epoch 3/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9975 - loss: 0.0105
Epoch 4/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9989 - loss: 0.0045
Epoch 5/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9992 - loss: 0.0027
Epoch 6/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.9993 - loss: 0.0023
Epoch 7/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.9995 - loss: 0.0016
Epoch 8/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.9993 - loss: 0.0018
Epoch 9/20
[1m325/325[0m [32m

<keras.src.callbacks.history.History at 0x1c9acffe600>

## Creating Submission file 

In [38]:
y_pred = model.predict(test_final)

y_pred_classes = np.argmax(y_pred, axis=1)


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step


In [39]:
final_sub = pd.DataFrame()
final_sub['id']=test['id']
final_sub['label'] = y_pred
final_sub.to_csv('submit.csv',index=False)

In [40]:
final_sub.head()

Unnamed: 0,id,label
0,20800,2.988461e-12
1,20801,1.0
2,20802,1.0
3,20803,2.851351e-14
4,20804,1.0
