In [1]:
import pandas as pd

In [2]:
# Path to the input CSV, relative to the notebook's working directory.
file_path = 'Input/fnn_dev.csv'
# Load the CSV into `df`, the DataFrame the later cells read and modify.
df = pd.read_csv(file_path)

In [3]:
# Show the first rows as plain text.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
df.info()

      id                       date                                   speaker  \
0   1636  2010-03-28T17:45:34-04:00                             Charlie Crist   
1   4352  2011-08-29T06:00:00-04:00                              Bobby  Scott   
2  16471  2019-02-12T17:35:38-05:00  Wisconsin Republican Legislative leaders   
3   1557  2010-03-05T18:24:02-05:00                             Dave Aronberg   
4  12826  2016-07-29T18:09:31-04:00                          Jeannette Vaught   

                                           statement  \
0  Rubio's tax swap proposal "would have been a m...   
1  "The estimated savings of this (debt ceiling) ...   
2  Foxconn has already "made a positive impact ac...   
3  Says Gov. Charlie Crist has called him "a rock...   
4  "Only five Texas counties account for almost 9...   

                                             sources  \
0  ['http://blogs.tampabay.com/buzz/files/040307l...   
1  ['http://www.bobbyscott.house.gov/index.php?op...   
2  ['htt

In [4]:
# Treat empty strings as missing values, then keep only the rows in which
# every column is populated. The result is rebound to `df`.
df = df.replace('', pd.NA).dropna()

In [5]:
import re
import string
import ast

def clean_text(text):
    """Normalise a string for downstream text processing.

    The steps run in this order: strip ASCII punctuation, lowercase, remove
    digit runs, remove any remaining character outside the allowed set, and
    finally collapse whitespace and trim both ends.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned text, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove any leftover symbol that is not in the allowed character set
    text = re.sub(r'[^a-zA-Z0-9.,?!:;\'"’%$€£()@\s]+', '', text)
    # Remove extra spaces. This runs last so that gaps opened by the removals
    # above (e.g. a dropped dash between two spaces) are squeezed as well.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalise the statement column with clean_text.
statement_cleaned = df['statement'].apply(clean_text)
df['statement'] = statement_cleaned

# Each cell is parsed as a Python literal and only its first element is kept
# (presumably a list of paragraphs stored as a string -- confirm against the data).
paragraph_literals = df['paragraph_based_content'].apply(ast.literal_eval)
df['paragraph_based_content'] = paragraph_literals.apply(lambda parsed: parsed[0])

# Normalise the full-text column with the same cleaning function.
fulltext_cleaned = df['fullText_based_content'].apply(clean_text)
df['fullText_based_content'] = fulltext_cleaned

Gov. Charlie Crist launched what amounts to a nuclear attack in Republican politics during his FOX News Sunday showdown with Marco Rubio.


In [7]:
# Write the preprocessed frame to disk without the index column.
# to_csv raises OSError when the target directory is missing, so make sure
# 'output' exists first (a no-op when it is already there).
import os

os.makedirs('output', exist_ok=True)
df.to_csv('output/preprocessed_data_dev.csv', index=False)

In [8]:
# Display the first ten rows of the frame as the cell's output.
df.head(10)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label_fnn
0,1636,2010-03-28T17:45:34-04:00,Charlie Crist,rubios tax swap proposal would have been a mas...,['http://blogs.tampabay.com/buzz/files/040307l...,Gov. Charlie Crist launched what amounts to a ...,gov charlie crist launched what amounts to a n...,fake
1,4352,2011-08-29T06:00:00-04:00,Bobby Scott,the estimated savings of this debt ceiling dea...,['http://www.bobbyscott.house.gov/index.php?op...,"U.S. Rep. Bobby Scott, D-3rd, was not pleased ...",us rep bobby scott drd was not pleased with th...,real
2,16471,2019-02-12T17:35:38-05:00,Wisconsin Republican Legislative leaders,foxconn has already made a positive impact acr...,['https://www.wispolitics.com/2019/sen-fitzger...,Amid reports questioning Foxconn Technology Gr...,amid reports questioning foxconn technology gr...,fake
3,1557,2010-03-05T18:24:02-05:00,Dave Aronberg,says gov charlie crist has called him a rock star,"['http://www.davearonberg.com/about', 'http://...","State Sen. Dave Aronberg, a Democratic candida...",state sen dave aronberg a democratic candidate...,real
4,12826,2016-07-29T18:09:31-04:00,Jeannette Vaught,only five texas counties account for almost pe...,['http://www.mystatesman.com/news/news/opinion...,From the citrus of the Rio Grande Valley to th...,from the citrus of the rio grande valley to th...,fake
5,5073,2012-01-06T13:29:58-05:00,Barack Obama,president barack obama put in place historic f...,['http://www.youtube.com/watch?v=f86IferxohM&f...,With all eyes on Iowa for the Republican caucu...,with all eyes on iowa for the republican caucu...,real
6,8954,2014-02-21T15:28:54-05:00,Dan Branch,worked with ronald reagan and helped reelect him,['https://docs.google.com/document/d/1UbrA4mT-...,Texas attorney general candidate Dan Branch su...,texas attorney general candidate dan branch su...,real
7,6766,2012-09-25T11:52:41-04:00,Secure America Now,says obama adviser david plouffe took large pi...,"['http://www.youtube.com/watch?v=5MvGtLbUSFQ',...",There’s the guilt-by-association attack ad -- ...,there’s the guiltbyassociation attack ad and t...,real
8,11892,2016-02-11T14:21:04-05:00,Seminole Tribe of Florida,the seminole gambling compact offers billion i...,"['https://youtu.be/6trsrsDedQY', 'http://www.f...",The Seminole Tribe of Florida is trying to con...,the seminole tribe of florida is trying to con...,real
9,11287,2015-09-21T18:00:00-04:00,Viral image,says bernie sanders opposes requiring all chil...,['http://www.motherjones.com/politics/2015/05/...,As the Democratic primary race between former ...,as the democratic primary race between former ...,fake


In [9]:

# !python -m spacy download en_core_web_sm

In [10]:
# !python -m pip install spacy==2.1.0
# !python -m pip install neuralcoref

In [11]:
import spacy
import neuralcoref

# Load the spaCy model named 'en_core_web_sm' into the shared `nlp` pipeline.
nlp = spacy.load('en_core_web_sm')
# Register neuralcoref on that pipeline; replace_pronouns below reads the
# `doc._.coref_resolved` extension from documents produced by `nlp`.
neuralcoref.add_to_pipe(nlp)


def replace_pronouns(text):
    """Return the coreference-resolved form of `text`.

    Runs `text` through the module-level `nlp` pipeline and returns the
    document's `coref_resolved` extension attribute.

    Args:
        text (str): Text to process.

    Returns:
        The value of `doc._.coref_resolved` for the processed document.
    """
    return nlp(text)._.coref_resolved

# Smoke test: resolve pronouns in the first row's paragraph text and print it.
test_1 = df['paragraph_based_content'].iloc[0]
resolved_sample = replace_pronouns(test_1)
print(resolved_sample)

# df['resolved_text'] = df['fullText_based_content'].apply(replace_pronouns)

Gov. Charlie Crist launched what amounts to a nuclear attack in Republican politics during Gov. Charlie Crist FOX News Sunday showdown with Marco Rubio.


In [12]:
# Write the frame to a second CSV without the index column.
# NOTE(review): in the cells visible here, the line that would add a
# 'resolved_text' column is commented out and nothing else modifies `df`
# after the earlier save, so this file appears to hold the same contents as
# 'output/preprocessed_data_dev.csv' -- confirm whether that is intended.
df.to_csv('output/replaced_data_dev.csv', index=False)
# Display the first ten rows of the frame as the cell's output.
df.head(10)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label_fnn
0,1636,2010-03-28T17:45:34-04:00,Charlie Crist,rubios tax swap proposal would have been a mas...,['http://blogs.tampabay.com/buzz/files/040307l...,Gov. Charlie Crist launched what amounts to a ...,gov charlie crist launched what amounts to a n...,fake
1,4352,2011-08-29T06:00:00-04:00,Bobby Scott,the estimated savings of this debt ceiling dea...,['http://www.bobbyscott.house.gov/index.php?op...,"U.S. Rep. Bobby Scott, D-3rd, was not pleased ...",us rep bobby scott drd was not pleased with th...,real
2,16471,2019-02-12T17:35:38-05:00,Wisconsin Republican Legislative leaders,foxconn has already made a positive impact acr...,['https://www.wispolitics.com/2019/sen-fitzger...,Amid reports questioning Foxconn Technology Gr...,amid reports questioning foxconn technology gr...,fake
3,1557,2010-03-05T18:24:02-05:00,Dave Aronberg,says gov charlie crist has called him a rock star,"['http://www.davearonberg.com/about', 'http://...","State Sen. Dave Aronberg, a Democratic candida...",state sen dave aronberg a democratic candidate...,real
4,12826,2016-07-29T18:09:31-04:00,Jeannette Vaught,only five texas counties account for almost pe...,['http://www.mystatesman.com/news/news/opinion...,From the citrus of the Rio Grande Valley to th...,from the citrus of the rio grande valley to th...,fake
5,5073,2012-01-06T13:29:58-05:00,Barack Obama,president barack obama put in place historic f...,['http://www.youtube.com/watch?v=f86IferxohM&f...,With all eyes on Iowa for the Republican caucu...,with all eyes on iowa for the republican caucu...,real
6,8954,2014-02-21T15:28:54-05:00,Dan Branch,worked with ronald reagan and helped reelect him,['https://docs.google.com/document/d/1UbrA4mT-...,Texas attorney general candidate Dan Branch su...,texas attorney general candidate dan branch su...,real
7,6766,2012-09-25T11:52:41-04:00,Secure America Now,says obama adviser david plouffe took large pi...,"['http://www.youtube.com/watch?v=5MvGtLbUSFQ',...",There’s the guilt-by-association attack ad -- ...,there’s the guiltbyassociation attack ad and t...,real
8,11892,2016-02-11T14:21:04-05:00,Seminole Tribe of Florida,the seminole gambling compact offers billion i...,"['https://youtu.be/6trsrsDedQY', 'http://www.f...",The Seminole Tribe of Florida is trying to con...,the seminole tribe of florida is trying to con...,real
9,11287,2015-09-21T18:00:00-04:00,Viral image,says bernie sanders opposes requiring all chil...,['http://www.motherjones.com/politics/2015/05/...,As the Democratic primary race between former ...,as the democratic primary race between former ...,fake


In [13]:
# df['resolved_text'] = df['fullText_based_content'].apply(replace_pronouns)

# def extract_triples(doc):
#     triples = []
#     for token in doc:
#         if "subj" in token.dep_:
#             subj = token
#             pred = token.head
#             for child in pred.children:
#                 if "obj" in child.dep_:
#                     obj = child
#                     triples.append((subj, pred, obj))
#     return triples

# df['triples'] = df['fullText_based_content'].apply(lambda text: extract_triples(nlp(text)))