In [359]:
import pandas as pd 
import numpy as np 
import re 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
from gensim.models import Word2Vec
stemmer = PorterStemmer()


In [360]:
df = pd.read_csv('news_dataset.csv')

In [361]:
df.head()

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...


In [362]:
df['text'][0]

'Payal has accused filmmaker Anurag Kashyap of behaving inappropriately with her in a video that went viral. She maintained her stance while speaking to ETimes and said, “I have wanted to speak about this for a long time. But today, finally I thought I must get it off my head. I had tweeted about my incident sometime ago when the #MeToo movement had happened, but many people told me to delete the tweet else I would stop getting work. My manager too advised me to remove the tweet. I complied. Post that, Anurag blocked me on WhatsApp.”'

In [363]:
df.isnull().sum()

label    0
text     8
dtype: int64

In [364]:
df.dropna(inplace=True)

In [365]:
df[df['label']=='REAL']['label'].count()

1850

In [366]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3721 entries, 0 to 3728
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3721 non-null   object
 1   text    3721 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [367]:
len(df)

3721

In [368]:
# df['label'] = df['label'].map({'REAL':0 , 'FAKE':1})
def assign_label_text(text):
    if 'fake' in text.lower():
        return 1 
    else:
        return 0 

In [369]:
df['label'] = df['text'].apply(assign_label_text)

In [370]:
df['label']


0       0
1       1
2       1
3       0
4       0
       ..
3724    0
3725    0
3726    0
3727    0
3728    0
Name: label, Length: 3721, dtype: int64

In [371]:
corpus = []
for i in range(len(df)):
    text = df.iloc[i]['text']
    text = re.sub('a-zA-z',' ',text)
    text = text.lower()
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in stopwords.words('english')]
    text = ' '.join(text)
    corpus.append(text)

    

    
    

In [372]:
word_tokenise = [sentence.lower().split() for sentence in corpus]

In [373]:
word_tokenise[0]

['payal',
 'accus',
 'filmmak',
 'anurag',
 'kashyap',
 'behav',
 'inappropri',
 'video',
 'went',
 'viral.',
 'maintain',
 'stanc',
 'speak',
 'etim',
 'said,',
 '“i',
 'want',
 'speak',
 'long',
 'time.',
 'today,',
 'final',
 'thought',
 'must',
 'get',
 'head.',
 'tweet',
 'incid',
 'sometim',
 'ago',
 '#metoo',
 'movement',
 'happened,',
 'mani',
 'peopl',
 'told',
 'delet',
 'tweet',
 'els',
 'would',
 'stop',
 'get',
 'work.',
 'manag',
 'advis',
 'remov',
 'tweet.',
 'complied.',
 'post',
 'that,',
 'anurag',
 'block',
 'whatsapp.”']

In [374]:
corpus[0]

'payal accus filmmak anurag kashyap behav inappropri video went viral. maintain stanc speak etim said, “i want speak long time. today, final thought must get head. tweet incid sometim ago #metoo movement happened, mani peopl told delet tweet els would stop get work. manag advis remov tweet. complied. post that, anurag block whatsapp.”'

In [375]:
wordmodel = Word2Vec(sentences=word_tokenise, vector_size=20, window=5, min_count=1, sg=0)
# model.build_vocab(word_tokenise)
wordmodel.train(word_tokenise, total_examples=wordmodel.corpus_count, epochs=20)



(19567036, 19893080)

In [376]:
wordmodel.save("word2vec_model.model")

In [377]:
X = np.array([np.mean([wordmodel.wv[word] for word in words if word in wordmodel.wv] or [np.zeros(wordmodel.vector_size)], axis=0) for words in word_tokenise])

In [378]:

# vocab = model.wv.index_to_key

# word_vectors = [model.wv[word] for word in vocab]
X = np.expand_dims(X, axis=1)

In [379]:
y = np.array(df['label'])

In [380]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.33, random_state = 42 )

In [381]:
y_train= y_train.reshape(-1, 1)

In [382]:
y_train

array([[1],
       [0],
       [1],
       ...,
       [1],
       [0],
       [0]])

In [383]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense

model = Sequential()
model.add(SimpleRNN(units=128, input_shape=(1, 20), activation='relu', kernel_initializer='glorot_uniform',return_sequences=True))
model.add(SimpleRNN(units=64, activation='relu', kernel_initializer='glorot_uniform', return_sequences=True))  # Add return_sequences=True
model.add(SimpleRNN(units=32, activation='relu', kernel_initializer='glorot_uniform'))
model.add(Dense(units=1, activation='sigmoid', kernel_initializer='glorot_uniform'))


  super().__init__(**kwargs)


In [384]:
print(model.summary())


None


In [385]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [386]:
# model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))

In [387]:
y_pred= model.predict(X_test)

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [388]:
loss, accuracy = model.evaluate(X_test, y_test)

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 411us/step - accuracy: 0.4951 - loss: 0.7034


In [389]:
accuracy

0.5032573342323303

In [390]:
model.save('model.h5')




In [391]:
# from keras.models import load_model
# model = load_model('model.h5')


In [392]:
y_pred

array([[0.64590216],
       [0.48441592],
       [0.4296357 ],
       ...,
       [0.51210445],
       [0.5165614 ],
       [0.59349465]], dtype=float32)

In [393]:
y_test

array([0, 0, 0, ..., 0, 0, 1])