In [54]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
import re
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tqdm import tqdm
from tensorflow import keras

In [2]:
stemmer = WordNetLemmatizer()
sw = stopwords.words('english')

In [3]:
data = pd.read_csv('train.csv').drop(columns=['id','title','author'])

In [4]:
data.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
data.shape

(20800, 2)

In [6]:
data.isnull().sum()

text     39
label     0
dtype: int64

In [7]:
data.dropna(inplace=True)

In [8]:
def preprocess(x):
    
    # x = '' if pd.isna(x) else x # apply this only you have missing values in data
    # Replacing Spacial Charactors
    text = re.sub(r'\W',' ',x)
    
    # Replacing Other words like .,__ etc...
    text = re.sub(r'[^a-zA-Z0-9]',' ',x)
    # Replacing all single Charactors
    text = re.sub(r'\s+[a-zA-Z]\s+',' ',text)
    
    # Replacing single Charactors from start
    text = re.sub(r'\^[a-zA-Z]\s+',' ',text)
    
    # Replacing Multiple space into single space
    text = re.sub(r'\s+',' ',text,flags=re.I) 
    
    text = text.lower().split()
    
    text = [stemmer.lemmatize(i) for i in text if i not in sw]
    
    text = ' '.join(text)
    
    return text

In [9]:
data.text = data.text.apply(preprocess)

In [10]:
data.head(3)

Unnamed: 0,text,label
0,house dem aide even see comey letter jason cha...,1
1,ever get feeling life circle roundabout rather...,0
2,truth might get fired october 29 2016 tension ...,1


In [11]:
x_train,x_test,y_train,y_test = train_test_split(data.text,data.label,random_state=42,test_size=0.2)

In [12]:
x_train

8252     burlingame calif populous state nation preside...
11551    larry colburn became american hero intervened ...
10003    graphic clinton v trump donkeyhotey paul jay i...
11305    chart day bond vigilante back ust yield 65bps ...
17747    istanbul lawmaker president recep tayyip erdog...
                               ...                        
11307    islamic state back another video even depraved...
11989    post originally published site first appeared ...
5401     sent thursday october 1 2015 7 20 pm robby moo...
862      aboard wavertree kill van kull 131 year since ...
15828    archive michael television barack obama delay ...
Name: text, Length: 16608, dtype: object

## Using CountVector

In [13]:
cv = CountVectorizer(max_features=10000,min_df=5,max_df=0.7,stop_words=stopwords.words('english'))

In [14]:
cv_x_train = cv.fit_transform(x_train)
cv_x_test = cv.transform(x_test)

In [15]:
cv_x_train = cv_x_train.toarray()
cv_x_test = cv_x_test.toarray()

In [16]:
cv_x_train.shape

(16608, 10000)

In [17]:
cv.get_feature_names_out()[:100]

array(['00', '000', '01', '02', '03', '04', '05', '07', '08', '09', '10',
       '100', '1000', '10th', '11', '110', '11th', '12', '120', '125',
       '13', '130', '14', '140', '14th', '15', '150', '15th', '16', '160',
       '16th', '17', '170', '18', '180', '18th', '19', '1930s', '1938',
       '1941', '1945', '1947', '1948', '1949', '1950', '1950s', '1956',
       '1960', '1960s', '1962', '1963', '1964', '1965', '1966', '1967',
       '1968', '1969', '1970', '1970s', '1971', '1972', '1973', '1974',
       '1975', '1976', '1977', '1978', '1979', '1980', '1980s', '1981',
       '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989',
       '1990', '1990s', '1991', '1992', '1993', '1994', '1995', '1996',
       '1997', '1998', '1999', '19th', '1st', '20', '200', '2000',
       '2000s', '2001', '2002', '2003', '2004'], dtype=object)

In [18]:
pd.DataFrame(cv_x_train,columns=cv.get_feature_names_out())

Unnamed: 0,00,000,01,02,03,04,05,07,08,09,...,zealand,zero,zika,zionist,zombie,zone,zoo,zu,zuckerberg,zulu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16603,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16604,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,1,0,0,0,0
16605,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Using TF-IDF

In [19]:
tf = TfidfVectorizer(max_features=10000,min_df=5,max_df=0.7,stop_words=stopwords.words('english'))

In [20]:
tf_x_train = tf.fit_transform(x_train)
tf_x_test = tf.transform(x_test)

In [21]:
tf_x_train = tf_x_train.toarray()
tf_x_test = tf_x_test.toarray()

In [22]:
tf_x_train.shape

(16608, 10000)

In [23]:
tf.get_feature_names_out()[:100]

array(['00', '000', '01', '02', '03', '04', '05', '07', '08', '09', '10',
       '100', '1000', '10th', '11', '110', '11th', '12', '120', '125',
       '13', '130', '14', '140', '14th', '15', '150', '15th', '16', '160',
       '16th', '17', '170', '18', '180', '18th', '19', '1930s', '1938',
       '1941', '1945', '1947', '1948', '1949', '1950', '1950s', '1956',
       '1960', '1960s', '1962', '1963', '1964', '1965', '1966', '1967',
       '1968', '1969', '1970', '1970s', '1971', '1972', '1973', '1974',
       '1975', '1976', '1977', '1978', '1979', '1980', '1980s', '1981',
       '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989',
       '1990', '1990s', '1991', '1992', '1993', '1994', '1995', '1996',
       '1997', '1998', '1999', '19th', '1st', '20', '200', '2000',
       '2000s', '2001', '2002', '2003', '2004'], dtype=object)

In [24]:
pd.DataFrame(tf_x_train,columns=tf.get_feature_names_out())

Unnamed: 0,00,000,01,02,03,04,05,07,08,09,...,zealand,zero,zika,zionist,zombie,zone,zoo,zu,zuckerberg,zulu
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.014817,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.032196,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16603,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
16604,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.094398,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.022195,0.0,0.0,0.0,0.0
16605,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
16606,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0


## Models

In [35]:
nv_model = MultinomialNB()

In [36]:
nv_model.fit(cv_x_train,y_train)

In [37]:
nv_pred = nv_model.predict(cv_x_test)
nv_pred

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [38]:
accuracy_score(y_test,nv_pred)

0.8930893330122803

In [39]:
confusion_matrix(y_test,nv_pred)

array([[1923,  156],
       [ 288, 1786]], dtype=int64)

## Word to Vect

In [25]:
text = data.text
text

0        house dem aide even see comey letter jason cha...
1        ever get feeling life circle roundabout rather...
2        truth might get fired october 29 2016 tension ...
3        video 15 civilian killed single u airstrike id...
4        print iranian woman sentenced six year prison ...
                               ...                        
20795    rapper unloaded black celebrity met donald tru...
20796    green bay packer lost washington redskin week ...
20797    macy today grew union several great name ameri...
20798    nato russia hold parallel exercise balkan 11 0...
20799    david swanson author activist journalist radio...
Name: text, Length: 20761, dtype: object

In [26]:
story = []
for doc in text:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [27]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [28]:
model.build_vocab(story)

In [29]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(41735558, 42953530)

In [31]:
def document_vector(doc):
    words = doc.split()
    
    # Filter out-of-vocabulary words
    valid_words = [word for word in words if word in model.wv.index_to_key]
    
    if valid_words:
        # Calculate the mean vector of valid words
        word_vectors = model.wv[valid_words]
        doc_vector = np.mean(word_vectors, axis=0)
    else:
        # If no valid words found, return a vector of zeros or None
        doc_vector = np.zeros(model.vector_size)  # You can replace this with None if needed
    
    return doc_vector

In [33]:
q1 = []
for doc in tqdm(text):
    q1.append(document_vector(doc))

100%|████████████████████████████████████████████████████████████████████████████| 20761/20761 [48:23<00:00,  7.15it/s]


In [34]:
q1 = np.array(q1)

In [35]:
q1.shape

(20761, 100)

In [38]:
que_df

0        house dem aide even see comey letter jason cha...
1        ever get feeling life circle roundabout rather...
2        truth might get fired october 29 2016 tension ...
3        video 15 civilian killed single u airstrike id...
4        print iranian woman sentenced six year prison ...
                               ...                        
20795    rapper unloaded black celebrity met donald tru...
20796    green bay packer lost washington redskin week ...
20797    macy today grew union several great name ameri...
20798    nato russia hold parallel exercise balkan 11 0...
20799    david swanson author activist journalist radio...
Name: text, Length: 20761, dtype: object

In [39]:
temp_df1 = pd.DataFrame(q1, index= data.index)

In [41]:
document_vector(text[0])

array([ 0.49478862, -0.302423  ,  0.429623  ,  0.01865009, -0.96686417,
       -0.4138304 , -0.3887836 , -0.4908528 , -0.23039016, -0.7656233 ,
        0.9079615 , -0.3180682 ,  0.29101834,  0.26120093, -0.250335  ,
        0.47371292, -0.16211098,  0.5432039 , -0.7683077 , -0.18642822,
       -0.05595333, -0.74448234, -0.6935724 ,  0.27768853, -0.5332352 ,
       -0.13691632,  0.63814956, -0.16445614,  0.2934434 ,  0.7987343 ,
       -0.11401265,  0.32049844, -0.5431226 , -0.25932986, -0.3700221 ,
        0.45988414, -0.4277896 ,  0.46125045,  0.02800129, -0.67607284,
       -0.5130268 ,  0.35209924,  0.05720324,  0.2692343 , -0.37331724,
       -0.07484289,  0.25749704,  0.3386472 , -0.687827  ,  0.44564804,
       -0.40468723,  1.0144126 , -0.39090338, -0.68119866, -0.15328717,
       -0.40388885,  0.46214375, -0.45711645, -0.09933129, -0.2336541 ,
       -0.7312563 , -0.967833  , -0.07792701, -0.4222369 , -0.4036763 ,
        0.49332437,  0.01245863,  0.5718859 , -0.16172634, -0.99

In [40]:
temp_df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.494789,-0.302423,0.429623,0.018650,-0.966864,-0.413830,-0.388784,-0.490853,-0.230390,-0.765623,...,0.230736,0.470250,0.333165,-0.163840,0.389802,-0.562486,-0.457644,0.156623,-0.031166,-0.350906
1,0.513003,-0.219893,0.404607,0.324979,-0.565379,-0.051649,-0.391769,-0.302650,-0.310227,-0.158779,...,0.001334,0.238065,0.474460,0.305389,-0.029297,-0.608560,-0.811727,0.361936,-0.299398,-0.123171
2,0.819297,-0.168946,0.087356,-0.316947,-0.488442,-0.326802,-0.254053,-0.468052,-0.024118,0.023371,...,-0.182482,0.320576,0.483572,-0.160432,0.059734,-0.217576,-0.785027,0.597715,-0.264076,0.068109
3,0.773014,0.932948,1.106285,0.797277,-0.264345,-0.015394,0.305031,-1.086351,0.362705,0.231656,...,0.053455,0.630204,-0.051507,-0.194856,0.132259,0.595012,-0.351950,1.082815,-0.989952,0.454484
4,0.585783,0.079164,0.003329,0.319184,-0.290193,0.007421,-0.411164,-0.694017,-0.234597,-0.095254,...,0.672603,0.210838,-0.431312,0.374656,0.123497,0.780169,-0.425646,0.336385,0.230866,-0.080855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20795,0.257110,0.342894,-0.078488,0.228383,0.001806,-0.032652,-0.172270,-0.242417,-0.152333,-0.160008,...,-0.288738,0.157427,0.403813,0.204415,-0.135519,-0.331106,-0.628368,0.471706,-0.309800,0.189154
20796,-0.281050,0.330436,0.955970,1.321458,-0.417251,0.174004,0.393378,-0.737300,-0.052090,0.036783,...,0.356897,0.775484,0.319839,0.029641,0.591327,-0.199048,-0.739630,0.617135,-0.411584,0.306718
20797,0.314279,-0.528054,0.012097,1.160629,0.334867,-0.180665,0.003384,-0.906662,-0.848989,0.724456,...,0.034567,0.601732,0.355051,0.378449,0.588359,-0.016890,-0.538906,0.296015,-0.610386,0.264423
20798,0.624757,0.287054,0.712965,0.034499,0.104829,-0.424703,-0.393076,-0.772834,0.879641,0.139461,...,-0.757814,0.715026,0.121065,-0.262944,0.473064,-0.558971,-0.526690,1.128407,-0.383120,-0.023099


In [43]:
final_df = pd.concat([data, temp_df1], axis=1)

In [44]:
final_df.shape

(20761, 102)

In [82]:
final_df.head()

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,1,0.494789,-0.302423,0.429623,0.01865,-0.966864,-0.41383,-0.388784,-0.490853,-0.23039,...,0.230736,0.47025,0.333165,-0.16384,0.389802,-0.562486,-0.457644,0.156623,-0.031166,-0.350906
1,0,0.513003,-0.219893,0.404607,0.324979,-0.565379,-0.051649,-0.391769,-0.30265,-0.310227,...,0.001334,0.238065,0.47446,0.305389,-0.029297,-0.60856,-0.811727,0.361936,-0.299398,-0.123171
2,1,0.819297,-0.168946,0.087356,-0.316947,-0.488442,-0.326802,-0.254053,-0.468052,-0.024118,...,-0.182482,0.320576,0.483572,-0.160432,0.059734,-0.217576,-0.785027,0.597715,-0.264076,0.068109
3,1,0.773014,0.932948,1.106285,0.797277,-0.264345,-0.015394,0.305031,-1.086351,0.362705,...,0.053455,0.630204,-0.051507,-0.194856,0.132259,0.595012,-0.35195,1.082815,-0.989952,0.454484
4,1,0.585783,0.079164,0.003329,0.319184,-0.290193,0.007421,-0.411164,-0.694017,-0.234597,...,0.672603,0.210838,-0.431312,0.374656,0.123497,0.780169,-0.425646,0.336385,0.230866,-0.080855


In [46]:
final_df.drop('text',axis=1,inplace=True)

In [62]:
x_train,x_test,y_train,y_test = train_test_split(final_df.drop('label',axis=1),final_df['label'],test_size=0.2,random_state=1)

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.893570912593306

In [52]:
confusion_matrix(y_test,y_pred)

array([[1906,  209],
       [ 233, 1805]], dtype=int64)

In [63]:
x_train.shape

(16608, 100)

In [80]:
x_test.shape

(4153, 100)

In [58]:
len(model.wv.key_to_index)

81328

In [89]:
model = Sequential()
model.add(LSTM(512,input_shape=(100,1)))
model.add(Dense(256,activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [90]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_10 (LSTM)              (None, 512)               1052672   
                                                                 
 dense_8 (Dense)             (None, 256)               131328    
                                                                 
 dense_9 (Dense)             (None, 1)                 257       
                                                                 
Total params: 1184257 (4.52 MB)
Trainable params: 1184257 (4.52 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [91]:
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

In [92]:
model.fit(x_train,y_train,epochs=100,validation_data=(x_test,y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
 41/519 [=>............................] - ETA: 8:24 - loss: 0.3226 - accuracy: 0.8590

KeyboardInterrupt: 

In [83]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
5432,0.319345,0.216051,0.488394,0.769706,-0.074444,0.322269,0.023629,-0.751429,-0.325741,-0.158208,...,0.153516,0.49729,-0.12305,0.100872,0.656913,-0.169846,-0.576972,0.452521,-0.105086,-0.046068
14615,0.311759,0.917897,0.088555,0.587045,-0.184994,-0.305074,-0.327234,-0.058998,0.093634,0.173844,...,0.284934,-0.104547,0.006587,0.291495,-0.238838,-0.148068,-0.434172,0.459087,-0.49419,0.251719
12355,0.735314,-0.529947,0.450254,0.556916,-1.293685,-0.731561,0.191941,-0.383599,-0.639428,-0.870939,...,-0.03942,0.914096,0.99261,-0.124573,0.121101,-1.44606,-0.466249,0.680397,-0.100907,-0.763873
2686,0.331402,-0.031913,0.126572,0.181878,0.122649,-0.144408,0.082146,-0.184459,-0.284526,-0.253955,...,-0.217537,0.164581,-0.105804,0.418111,0.619561,0.056181,-0.46944,0.371183,0.112078,-0.102247
730,0.879149,0.252238,-0.047569,-0.289922,-0.280536,-0.076051,-0.229166,-0.730744,0.109512,0.269428,...,-0.974169,0.913256,0.707616,-0.654427,-0.014254,-0.776108,-0.994772,1.529997,-0.53282,0.012829


## Deep Learning Models

In [71]:
model_1 = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(10000,)),  # Adjust the number of units as needed
    keras.layers.Dropout(0.5),  # Optional dropout layer for regularization
    keras.layers.Dense(64, activation='relu'),  # Additional layers can be added as needed
    keras.layers.Dropout(0.5),  # Optional dropout layer for regularization
    keras.layers.Dense(1, activation='sigmoid')  # Binary classification output (0 or 1)
])


In [72]:
model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [73]:
model_1.fit(cv_x_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x214e2c71310>

In [77]:
model_2 = keras.Sequential([
    keras.layers.Embedding(input_dim=10000, output_dim=256, input_length=10000),
    keras.layers.GlobalAveragePooling1D(),  # Use GlobalAveragePooling1D instead of Flatten for variable-length input
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid')
])

In [78]:
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [79]:
model_2.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 10000, 256)        2560000   
                                                                 
 global_average_pooling1d_4  (None, 256)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_24 (Dense)            (None, 128)               32896     
                                                                 
 dropout_16 (Dropout)        (None, 128)               0         
                                                                 
 dense_25 (Dense)            (None, 64)                8256      
                                                                 
 dropout_17 (Dropout)        (None, 64)                0         
                                                      

In [81]:
model_2.fit(cv_x_train, y_train, epochs=5, batch_size=64, validation_split=0.2)