# Comparing Texts

## Imports

In [12]:
# packages
import pandas as pd
import nltk
import spacy

In [2]:
# read the fake/real news CSV from the project-relative Data folder
df = pd.read_csv(filepath_or_buffer='Data/Fake_Real_News_Data.csv')

In [3]:
# look at the first five rows
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,A whirlwind day in D.C. showcases Trump’s unor...,Donald Trump endorsed an unabashedly noninterv...,REAL
1,1,"In Baltimore's call for federal police probe, ...",While some Justice Department investigations a...,REAL
2,2,Trump Proudly Declares: Most Of The People I’v...,Trump Proudly Declares: Most Of The People I’v...,FAKE
3,3,Inside the Trump-Bush melodrama: Decades of te...,Donald Trump spent a day in January 2014 hobno...,REAL
4,4,Shutdown clash to return in force by December,Notable names include Ray Washburne (Commerce)...,REAL


## EDA

In [4]:
# summarize column dtypes, non-null counts, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [5]:
# count null entries in each column
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [6]:
# check for duplicate articles
# NOTE: 'Unnamed: 0' looks like a saved row index (0, 1, 2, ... in the preview),
# so including it would make every row unique and hide repeated articles;
# compare only the content columns instead.
df.duplicated(subset=['title', 'text', 'label']).sum()

0

## 1. Clean the Data

In [7]:
# drop the leftover index column
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [8]:
# confirm the dropped column no longer appears
df.head(n=5)

Unnamed: 0,title,text,label
0,A whirlwind day in D.C. showcases Trump’s unor...,Donald Trump endorsed an unabashedly noninterv...,REAL
1,"In Baltimore's call for federal police probe, ...",While some Justice Department investigations a...,REAL
2,Trump Proudly Declares: Most Of The People I’v...,Trump Proudly Declares: Most Of The People I’v...,FAKE
3,Inside the Trump-Bush melodrama: Decades of te...,Donald Trump spent a day in January 2014 hobno...,REAL
4,Shutdown clash to return in force by December,Notable names include Ray Washburne (Commerce)...,REAL


In [9]:
# add a lower-cased copy of the article text
lowercase_text = df['text'].str.lower()
df['lower_text'] = lowercase_text
df.head()

Unnamed: 0,title,text,label,lower_text
0,A whirlwind day in D.C. showcases Trump’s unor...,Donald Trump endorsed an unabashedly noninterv...,REAL,donald trump endorsed an unabashedly noninterv...
1,"In Baltimore's call for federal police probe, ...",While some Justice Department investigations a...,REAL,while some justice department investigations a...
2,Trump Proudly Declares: Most Of The People I’v...,Trump Proudly Declares: Most Of The People I’v...,FAKE,trump proudly declares: most of the people i’v...
3,Inside the Trump-Bush melodrama: Decades of te...,Donald Trump spent a day in January 2014 hobno...,REAL,donald trump spent a day in january 2014 hobno...
4,Shutdown clash to return in force by December,Notable names include Ray Washburne (Commerce)...,REAL,notable names include ray washburne (commerce)...


## 2. Prepare the Data

### Create Tokens

In [10]:
# split the lower-cased text on whitespace (punctuation stays attached to words)
whitespace_tokens = df['lower_text'].str.split()
df['tokens'] = whitespace_tokens
df.head()

Unnamed: 0,title,text,label,lower_text,tokens
0,A whirlwind day in D.C. showcases Trump’s unor...,Donald Trump endorsed an unabashedly noninterv...,REAL,donald trump endorsed an unabashedly noninterv...,"[donald, trump, endorsed, an, unabashedly, non..."
1,"In Baltimore's call for federal police probe, ...",While some Justice Department investigations a...,REAL,while some justice department investigations a...,"[while, some, justice, department, investigati..."
2,Trump Proudly Declares: Most Of The People I’v...,Trump Proudly Declares: Most Of The People I’v...,FAKE,trump proudly declares: most of the people i’v...,"[trump, proudly, declares:, most, of, the, peo..."
3,Inside the Trump-Bush melodrama: Decades of te...,Donald Trump spent a day in January 2014 hobno...,REAL,donald trump spent a day in january 2014 hobno...,"[donald, trump, spent, a, day, in, january, 20..."
4,Shutdown clash to return in force by December,Notable names include Ray Washburne (Commerce)...,REAL,notable names include ray washburne (commerce)...,"[notable, names, include, ray, washburne, (com..."


### Create Lemmas

In [15]:
# load the small English spaCy model without the parser and NER components
skipped_components = ['parser', 'ner']
nlp_model = spacy.load('en_core_web_sm', disable=skipped_components)

# confirm the skipped components are absent from the pipeline
nlp_model.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']

In [29]:
## [V1 List Comp] Define a function to use spacy to process our text
def spacy_process(text, nlp=None):
    """Lemmatize text with spaCy and return the cleaned, lower-cased lemmas.

    A token is dropped when it is punctuation, whitespace, or a stop word, or
    when its lower-cased lemma contains 'http' or 'www'.

    Parameters
    ----------
    text : str
        Raw text to process.
    nlp : callable, optional
        Pipeline that turns `text` into an iterable of tokens exposing
        `lemma_`, `is_punct`, `is_space` and `is_stop`. Defaults to the
        notebook-level `nlp_model`, so existing one-argument calls are
        unchanged.

    Returns
    -------
    list of str
        Lower-cased lemmas of the kept tokens, in document order.
    """
    if nlp is None:
        # fall back to the pipeline loaded earlier in the notebook
        nlp = nlp_model

    doc = nlp(text)
    # lower-case each surviving lemma once, then reuse it for the URL check
    # and for the returned value
    lemmas = (token.lemma_.lower() for token in doc
              if not (token.is_punct or token.is_space or token.is_stop))
    processed_doc = [lemma for lemma in lemmas
                     if 'http' not in lemma and 'www' not in lemma]
    return processed_doc

In [30]:
## run every article through the spacy cleaning function and keep the lemmas
lemmatized_articles = df['text'].apply(spacy_process)
df['spacy_lemmas'] = lemmatized_articles
df.head()

Unnamed: 0,title,text,label,lower_text,tokens,spacy_lemmas
0,A whirlwind day in D.C. showcases Trump’s unor...,Donald Trump endorsed an unabashedly noninterv...,REAL,donald trump endorsed an unabashedly noninterv...,"[donald, trump, endorsed, an, unabashedly, non...","[donald, trump, endorse, unabashedly, noninter..."
1,"In Baltimore's call for federal police probe, ...",While some Justice Department investigations a...,REAL,while some justice department investigations a...,"[while, some, justice, department, investigati...","[justice, department, investigation, adversari..."
2,Trump Proudly Declares: Most Of The People I’v...,Trump Proudly Declares: Most Of The People I’v...,FAKE,trump proudly declares: most of the people i’v...,"[trump, proudly, declares:, most, of, the, peo...","[trump, proudly, declare, people, insult, dese..."
3,Inside the Trump-Bush melodrama: Decades of te...,Donald Trump spent a day in January 2014 hobno...,REAL,donald trump spent a day in january 2014 hobno...,"[donald, trump, spent, a, day, in, january, 20...","[donald, trump, spend, day, january, 2014, hob..."
4,Shutdown clash to return in force by December,Notable names include Ray Washburne (Commerce)...,REAL,notable names include ray washburne (commerce)...,"[notable, names, include, ray, washburne, (com...","[notable, name, include, ray, washburne, comme..."
