In [5]:
import pandas as pd


df_ = pd.read_csv("FakeNews.csv")

df = pd.DataFrame()

df['text'] = df_['text']
df['label'] = df_['subject']
df.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,News
1,House Intelligence Committee Chairman Devin Nu...,News
2,"On Friday, it was revealed that former Milwauk...",News
3,"On Christmas day, Donald Trump announced that ...",News
4,Pope Francis used his annual Christmas Day mes...,News


In [6]:
df.drop_duplicates(inplace=True)

In [11]:
import re
def remove_tags(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower() 

In [16]:
df['text'] = df['text'].map(remove_tags)

In [17]:
df['text']

0        donald trump just couldn t wish all americans ...
1        house intelligence committee chairman devin nu...
2        on friday it was revealed that former milwauke...
3        on christmas day donald trump announced that h...
4        pope francis used his annual christmas day mes...
                               ...                        
23476    21st century wire says as 21wire reported earl...
23477    21st century wire says it s a familiar theme w...
23478    patrick henningsen 21st century wireremember w...
23479    21st century wire says al jazeera america will...
23480    21st century wire says as 21wire predicted in ...
Name: text, Length: 22840, dtype: object

In [None]:
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]



df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in stopwords]).apply(lambda x:" ".join(x))

In [18]:
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [25]:
story = []
for doc in df['text']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [27]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)


In [28]:
model.build_vocab(story)
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(38440041, 48097300)

In [29]:
len(model.wv.index_to_key)

61610

In [31]:
import numpy as np
def document_vector(doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

In [36]:
document_vector(df['text'].values[1])

array([-0.43606296, -1.0358564 ,  0.10183568, -0.07360886, -0.7903454 ,
        0.00634428, -0.62952036, -0.05360719, -0.02226407, -0.41120178,
        0.32399017,  0.03181547, -0.01434001,  0.17220597,  0.13205795,
        0.03065369, -0.93876016,  0.54615337, -0.4942777 , -0.03296059,
       -0.5831057 ,  0.07864317,  0.3320933 , -0.818315  ,  0.2462386 ,
        0.26864332, -0.68359816, -0.7881865 , -0.11453448, -0.5474589 ,
       -0.43625957, -0.5642778 ,  0.20903462,  0.02808282,  0.7275144 ,
        0.20721221, -0.31667146,  0.2538388 , -0.23930463,  1.1477997 ,
       -0.61605746,  0.07699516,  0.18690178,  1.2064406 ,  0.44960484,
        0.3153007 ,  0.06894444,  0.05744451, -0.42010784,  0.01566507,
       -0.73902285,  0.17750819, -0.7501287 ,  0.69363075,  0.9897044 ,
        0.5872572 ,  0.07073455, -0.15474907,  0.59777945,  0.72335845,
        0.19930789,  0.41706693, -0.07126419,  0.34723312,  0.07780281,
       -0.10166198,  0.00390629, -0.26659542, -0.21302088, -0.16

In [39]:
from tqdm import tqdm
X = []
for doc in tqdm(df['text'].values[:100]):
    X.append(document_vector(doc))

100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


In [42]:
X[1]

array([-0.43606296, -1.0358564 ,  0.10183568, -0.07360886, -0.7903454 ,
        0.00634428, -0.62952036, -0.05360719, -0.02226407, -0.41120178,
        0.32399017,  0.03181547, -0.01434001,  0.17220597,  0.13205795,
        0.03065369, -0.93876016,  0.54615337, -0.4942777 , -0.03296059,
       -0.5831057 ,  0.07864317,  0.3320933 , -0.818315  ,  0.2462386 ,
        0.26864332, -0.68359816, -0.7881865 , -0.11453448, -0.5474589 ,
       -0.43625957, -0.5642778 ,  0.20903462,  0.02808282,  0.7275144 ,
        0.20721221, -0.31667146,  0.2538388 , -0.23930463,  1.1477997 ,
       -0.61605746,  0.07699516,  0.18690178,  1.2064406 ,  0.44960484,
        0.3153007 ,  0.06894444,  0.05744451, -0.42010784,  0.01566507,
       -0.73902285,  0.17750819, -0.7501287 ,  0.69363075,  0.9897044 ,
        0.5872572 ,  0.07073455, -0.15474907,  0.59777945,  0.72335845,
        0.19930789,  0.41706693, -0.07126419,  0.34723312,  0.07780281,
       -0.10166198,  0.00390629, -0.26659542, -0.21302088, -0.16

In [46]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['label'][:100])
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [47]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [49]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

1.0