In [1]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train_texts = pd.read_csv('./fake_news/train.csv').dropna(subset='text').text
test_texts = pd.read_csv('./fake_news/test.csv').dropna(subset='text').text

In [3]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Retrieve stopwords from all of the available languages into one set
stop_words = set(sum([stopwords.words(language) for language in stopwords.fileids()], []))

def clean(text):
    words = word_tokenize(text)
    words = [stemmer.stem(w.lower()) for w in words if  # Stem and lower
            not w in stop_words  # Not a stop word
            and w.isalpha()]  # Only contains letters
    
    return " ".join(words)

In [4]:
from tqdm import tqdm
tqdm.pandas()

train_texts = train_texts.progress_apply(clean)
test_texts = test_texts.progress_apply(clean)

100%|████████████████████████████████████| 20761/20761 [02:17<00:00, 150.96it/s]
100%|██████████████████████████████████████| 5193/5193 [00:34<00:00, 152.11it/s]


In [5]:
train_vectorizer = TfidfVectorizer(min_df=0.005, max_df=0.6)
X_train = train_vectorizer.fit_transform(train_texts)

X_test = train_vectorizer.transform(test_texts)

In [6]:
X_train.shape, X_test.shape

((20761, 5551), (5193, 5551))

In [7]:
len_train = X_train.shape[0]
len_test = X_test.shape[0]

combined = np.concatenate([X_train.A, X_test.A])

combined_embedded = TSNE(n_components=2, perplexity=10, init='pca').fit_transform(combined)
train_embedded = combined_embedded[:len_train]
test_embedded = combined_embedded[len_train:]

assert train_embedded.shape[0] == len_train and test_embedded.shape[0] == len_test



In [8]:
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default='notebook'

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_embedded[:, 0], y=train_embedded[:, 1], mode='markers'))
fig.add_trace(go.Scatter(x=test_embedded[:, 0], y=test_embedded[:, 1], mode='markers'))