In [1]:
import os
import ssl

# Set the NLTK_DATA environment variable to your provided path
os.environ["NLTK_DATA"] = "/Users/craigroberts/Documents/Coding/NLP/MediScan_NLP_Proj/nltk_data"

# Add this directory to nltk's search path
import nltk
nltk.data.path.append("/Users/craigroberts/Documents/Coding/NLP/MediScan_NLP_Proj/nltk_data")

# Optionally, disable SSL verification to avoid certificate errors when downloading (if needed)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Import additional libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import html
import re

from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag
from langdetect import detect

sns.set(style="whitegrid")

# Download necessary NLTK resources (this should use your provided nltk_data path)
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

print("Setup complete. NLTK data path set to:", os.environ["NLTK_DATA"])

Setup complete. NLTK data path set to: /Users/craigroberts/Documents/Coding/NLP/MediScan_NLP_Proj/nltk_data


[nltk_data] Downloading package averaged_perceptron_tagger_eng to /Use
[nltk_data]     rs/craigroberts/Documents/Coding/NLP/MediScan_NLP_Proj
[nltk_data]     /nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/craigroberts/Docum
[nltk_data]     ents/Coding/NLP/MediScan_NLP_Proj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load train and test TSV files from the PUBHEALTH folder
df_1 = pd.read_csv("PUBHEALTH/train.tsv", sep="\t")
df_2 = pd.read_csv("PUBHEALTH/test.tsv", sep="\t")
df_3 = pd.read_csv("PUBHEALTH/dev.tsv", sep="\t")

# Combine the DataFrames
frames = [df_1, df_2, df_3]
result = pd.concat(frames)

In [3]:
print("Combined DataFrame shape:", result.shape)

Combined DataFrame shape: (12288, 10)


In [4]:
print("First 5 rows:")
result.head()

First 5 rows:


Unnamed: 0.1,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects,Unnamed: 0
0,15661,"""The money the Clinton Foundation took from fr...","April 26, 2015","""Gingrich said the Clinton Foundation """"took m...",Katie Sanders,"""Hillary Clinton is in the political crosshair...",https://www.wsj.com/articles/clinton-foundatio...,false,"Foreign Policy, PunditFact, Newt Gingrich,",
1,9893,Annual Mammograms May Have More False-Positives,"October 18, 2011",This article reports on the results of a study...,,While the financial costs of screening mammogr...,,mixture,"Screening,WebMD,women's health",
2,11358,SBRT Offers Prostate Cancer Patients High Canc...,"September 28, 2016",This news release describes five-year outcomes...,"Mary Chris Jaklevic,Steven J. Atlas, MD, MPH,K...",The news release quotes lead researcher Robert...,https://www.healthnewsreview.org/wp-content/up...,mixture,"Association/Society news release,Cancer",
3,10166,"Study: Vaccine for Breast, Ovarian Cancer Has ...","November 8, 2011","While the story does many things well, the ove...",,"The story does discuss costs, but the framing ...",http://clinicaltrials.gov/ct2/results?term=can...,true,"Cancer,WebMD,women's health",
4,11276,Some appendicitis cases may not require ’emerg...,"September 20, 2010",We really don’t understand why only a handful ...,,"""Although the story didn’t cite the cost of ap...",,true,,


In [5]:
print("Summary statistics:")
result.describe()

Summary statistics:


Unnamed: 0.1,Unnamed: 0
count,1235.0
mean,617.0
std,356.6581
min,0.0
25%,308.5
50%,617.0
75%,925.5
max,1234.0


In [6]:
print("Missing values per column:")
result.isna().sum()

Missing values per column:


claim_id              0
claim                10
date_published     2408
explanation          10
fact_checkers        15
main_text            30
sources              33
label                35
subjects             37
Unnamed: 0        11053
dtype: int64

In [7]:
# Identify indices for labels to drop: 'unproven', 'snopes', and 'mixture'
index_unproven = result[result["label"] == 'unproven'].index
index_snopes   = result[result["label"] == 'snopes'].index
index_mixture  = result[result["label"] == 'mixture'].index

# Combine all indices into one index object
indices_to_drop = index_unproven.union(index_snopes).union(index_mixture)

# Drop these rows from the DataFrame
result.drop(indices_to_drop, inplace=True)
print("Shape after dropping unwanted labels:", result.shape)

Shape after dropping unwanted labels: (9146, 10)


In [8]:
# Create a DataFrame with selected columns: claim_id, claim, main_text, label
dframe = result[["claim_id", "claim", "main_text", "label"]].copy()

# Print missing values before dropping
print("Missing values before dropna:")
dframe.isna().sum()

Missing values before dropna:


claim_id      0
claim         9
main_text    28
label        32
dtype: int64

In [9]:
# Drop rows with missing values
dframe = dframe.dropna()

# Replace 'false' with 0 and 'true' with 1 in the 'label' column
dframe['label'].replace(to_replace=['false', 'true'], value=[0, 1], inplace=True)

# Reset the DataFrame index
dframe = dframe.reset_index(drop=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dframe['label'].replace(to_replace=['false', 'true'], value=[0, 1], inplace=True)
  dframe['label'].replace(to_replace=['false', 'true'], value=[0, 1], inplace=True)


In [10]:
# Optionally inspect a sample value
print("Sample value (row 15, second-to-last column):", dframe.iloc[15, -2])

# Save the processed DataFrame to CSV
dframe.to_csv("initial_file.csv", index=False)

Sample value (row 15, second-to-last column): On Thursday, the United States laid out its objectives for a post-Brexit trade deal with Britain, seeking to entirely eliminate or reduce barriers for U.S. agricultural products and streamline regulatory differences. Opponents of Brexit have argued that such a U.S. accord would open up Britain’s markets to the likes of chlorine-treated chicken and genetically modified crops, while ministers have said the government would not lower food standards to win trade deals. “You have been presented with a false choice: either stick to EU directives, or find yourselves flooded with American food of the lowest quality,” Ambassador Woody Johnson wrote in the Daily Telegraph newspaper. “Inflammatory and misleading terms like ‘chlorinated chicken’ and ‘hormone beef’ are deployed to cast American farming in the worst possible light. It is time the myths are called out for what they really are: a smear campaign from people with their own protectionist agen

In [11]:
# Read the CSV back to verify
df = pd.read_csv("initial_file.csv")
print("Missing values in loaded CSV:")
df.isna().sum()

Missing values in loaded CSV:


claim_id     0
claim        0
main_text    0
label        0
dtype: int64

In [12]:
# Define text cleaning functions
def simplify_text(text):
    return html.unescape(" ".join(text.split()))

def lower_sentences(text):
    return text.lower()

In [13]:
# Apply the cleaning functions to create a new processed column 'claim-p'
df["claim-p"] = df["claim"].map(lower_sentences)
df["claim-p"] = df["claim-p"].map(simplify_text)

print("Sample cleaned text:")
df["claim-p"].head()

Sample cleaned text:


0    study: vaccine for breast, ovarian cancer has ...
1    angioplasty through the wrist backed by new study
2    u.s. says results encouraging for healthcare d...
3    opossums kill thousands of ticks each week, in...
4    democrats hoping to flip house not just trash-...
Name: claim-p, dtype: object

In [14]:
# Define a function to detect language
def detect_comment_lang(comment):
    # Remove mentions, URLs, etc.
    comment = ' '.join(re.sub("(@[A-Za-z0-9_\-]+ )|(https?:\/\/.* )|(www\..* )", " ", comment).split())
    try:
        lang = detect(comment)
    except Exception as e:
        lang = ""
    return lang

  comment = ' '.join(re.sub("(@[A-Za-z0-9_\-]+ )|(https?:\/\/.* )|(www\..* )", " ", comment).split())


In [15]:
# Apply language detection on cleaned claim text
df["lang"] = df["claim-p"].map(detect_comment_lang)

# Print language counts
print("Detected language counts:")
df["lang"].value_counts()

Detected language counts:


lang
en    8897
fr      47
ca      36
it      26
af      22
da      17
ro      12
nl      12
es       9
no       6
tl       4
id       4
sv       4
cy       3
sl       2
lt       2
lv       2
hr       2
de       2
sq       2
et       2
fi       1
Name: count, dtype: int64

In [16]:
# Filter to keep only English entries
df = df[df["lang"] == "en"]
df.index = pd.RangeIndex(0, len(df.index))
df = df.drop(["lang"], axis=1)

# Drop "Unnamed: 0" column if it exists
if "Unnamed: 0" in df.columns:
    df = df.drop(["Unnamed: 0"], axis=1)

print("Shape after filtering for English:", df.shape)
df.to_csv("cleaned_file.csv", index=False)


Shape after filtering for English: (8897, 5)


In [17]:
# Initialize a RegexpTokenizer to tokenize words
tokenizer = RegexpTokenizer(r'\w+')

# Define a function to tokenize and POS tag text
def tokenize_postag(text):
    tokenized = tokenizer.tokenize(text)
    postagged = pos_tag(tokenized)
    return postagged

In [18]:
# Apply tokenization and POS tagging to the processed text
df["postagged"] = df["claim-p"].apply(tokenize_postag)

print("Sample of POS-tagged text:")
df["postagged"].head()

Sample of POS-tagged text:


0    [(study, NN), (vaccine, NN), (for, IN), (breas...
1    [(angioplasty, NN), (through, IN), (the, DT), ...
2    [(u, JJ), (s, NN), (says, VBZ), (results, NNS)...
3    [(opossums, NNS), (kill, VB), (thousands, NNS)...
4    [(democrats, NNS), (hoping, VBG), (to, TO), (f...
Name: postagged, dtype: object

In [19]:
df.to_csv("final_cleaned_file.csv", index=False)
print("Final cleaned file saved as 'final_cleaned_file.csv'")

Final cleaned file saved as 'final_cleaned_file.csv'
