In [111]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Dataset Overview

In [112]:
df = pd.read_csv("/Users/enrique/code/EFRdev/08-Final-Project/SolarSoundBytes/raw_data/guardian_environment_news.csv")
df.head()

Unnamed: 0,Title,Intro Text,Authors,Article Text,Date Published
0,Liz Truss ‘will approve more oil drilling if ...,Tory leadership candidate criticised by campai...,"['Rob Davies', '@ByRobDavies']",Liz Truss will sign off on a push for more oil...,2022-08-30
1,Renewed Highland golf course plan has environm...,Scottish government rejected a new links at Co...,"['Ewan Murray', '@mrewanmurray']",It is an area so tranquil that the notion of b...,2021-03-22
2,Visiting green spaces deters mental health dr...,Positive effects were stronger among those rep...,"['Damien Gayle', '@damiengayle']","Visits to parks, community gardens and other u...",2023-01-17
3,Bought too much red cabbage? Turn it into a fe...,This fantastic vegan centrepiece makes full us...,['Tom Hunt'],"I devised today’s nut roast for Oddbox, a veg ...",2023-12-22
4,‘This year has been very good’: readers’ UK bu...,Readers share their favourite sightings over t...,['Guardian readers'],‘Constant companions to our gardening’A peacoc...,2023-12-19


In [113]:
df.shape

(30059, 5)

# Delete or Impute Nulls Values

In [114]:
print(df.isnull().sum())

Title              948
Intro Text          82
Authors           4570
Article Text       368
Date Published    2441
dtype: int64


In [115]:
df = df.dropna(subset =['Article Text', 'Date Published', 'Authors'])
df.shape

(23404, 5)

In [116]:
df['Title'] = df['Title'].fillna('No Title')
df['Intro Text'] = df['Intro Text'].fillna('No Intro Text')

In [117]:
print(df.isnull().sum())

df.shape

Title             0
Intro Text        0
Authors           0
Article Text      0
Date Published    0
dtype: int64


(23404, 5)

In [118]:
df.head()

Unnamed: 0,Title,Intro Text,Authors,Article Text,Date Published
0,Liz Truss ‘will approve more oil drilling if ...,Tory leadership candidate criticised by campai...,"['Rob Davies', '@ByRobDavies']",Liz Truss will sign off on a push for more oil...,2022-08-30
1,Renewed Highland golf course plan has environm...,Scottish government rejected a new links at Co...,"['Ewan Murray', '@mrewanmurray']",It is an area so tranquil that the notion of b...,2021-03-22
2,Visiting green spaces deters mental health dr...,Positive effects were stronger among those rep...,"['Damien Gayle', '@damiengayle']","Visits to parks, community gardens and other u...",2023-01-17
3,Bought too much red cabbage? Turn it into a fe...,This fantastic vegan centrepiece makes full us...,['Tom Hunt'],"I devised today’s nut roast for Oddbox, a veg ...",2023-12-22
4,‘This year has been very good’: readers’ UK bu...,Readers share their favourite sightings over t...,['Guardian readers'],‘Constant companions to our gardening’A peacoc...,2023-12-19


# Text Cleaning:
**Preprocessing:** lowercase, delete numbers, punctuation and symbols (#"*!&%), splitting, tokenizing?, removing stopwords, lemmatizing

In [123]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("","", string.punctuation))
    text = ''.join(char for char in text if not char.isdigit())

    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

df['Clean Title'] = df['Title'].apply(preprocess_text)
df['Clean Intro Text'] = df['Intro Text'].apply(preprocess_text)
df['Clean Article Text'] = df['Article Text'].apply(preprocess_text)
df['Author_Domain'] = df['Authors'].apply(preprocess_text)
df_clean = df.copy()

df_clean = df.drop(columns=['Title','Intro Text','Article Text', 'Authors'])
df_clean.head()

Unnamed: 0,Date Published,Clean Title,Clean Intro Text,Clean Article Text,Author_Domain
0,2022-08-30,liz truss ‘will approve oil drilling becomes pm’,tory leadership candidate criticised campaigne...,liz truss sign push oil drilling north sea win...,rob davy byrobdavies
1,2021-03-22,renewed highland golf course plan environmenta...,scottish government rejected new link coul pro...,area tranquil notion bitter dispute hugely ano...,ewan murray mrewanmurray
2,2023-01-17,visiting green space deters mental health drug...,positive effect stronger among reporting lowes...,visit park community garden urban green space ...,damien gayle damiengayle
3,2023-12-22,bought much red cabbage turn festive nut roast...,fantastic vegan centrepiece make full use oute...,devised today’s nut roast oddbox veg box outfi...,tom hunt
4,2023-12-19,‘this year good’ readers’ uk butterfly sighting,reader share favourite sighting summer news nu...,‘constant companion gardening’a peacock butter...,guardian reader


In [None]:
df_clean['Date Published'] = pd.to_datetime(df_clean['Date Published'], dayfirst=True, errors='coerce')
print(df_clean['Date Published'].dtype)

datetime64[ns]
datetime64[ns]


  df_clean['Date Published'] = pd.to_datetime(df_clean['Date Published'], dayfirst=True, errors='coerce')


In [124]:
df_clean

Unnamed: 0,Date Published,Clean Title,Clean Intro Text,Clean Article Text,Author_Domain
0,2022-08-30,liz truss ‘will approve oil drilling becomes pm’,tory leadership candidate criticised campaigne...,liz truss sign push oil drilling north sea win...,rob davy byrobdavies
1,2021-03-22,renewed highland golf course plan environmenta...,scottish government rejected new link coul pro...,area tranquil notion bitter dispute hugely ano...,ewan murray mrewanmurray
2,2023-01-17,visiting green space deters mental health drug...,positive effect stronger among reporting lowes...,visit park community garden urban green space ...,damien gayle damiengayle
3,2023-12-22,bought much red cabbage turn festive nut roast...,fantastic vegan centrepiece make full use oute...,devised today’s nut roast oddbox veg box outfi...,tom hunt
4,2023-12-19,‘this year good’ readers’ uk butterfly sighting,reader share favourite sighting summer news nu...,‘constant companion gardening’a peacock butter...,guardian reader
...,...,...,...,...,...
30053,2021-08-23,pacific ocean’s rising acidity cause dungeness...,acidity making shell crab larva vulnerable pre...,pacific ocean becoming acidic starting dissolv...,lauren aratani
30054,2020-11-12,climate change could make insurance expensive ...,munich world’s largest reinsurance firm warns ...,insurer warned climate change could make cover...,arthur neslen
30055,2021-06-05,republican lawmaker pitch carbon tax defiance ...,representative carlos curbelo proposed tax car...,republican lawmaker proposed u introduce tax c...,oliver milman new york olliemilman
30056,2018-04-25,mp call urgent investigation water buyback,crossparty group asks auditor general seek inf...,crossparty group federal mp asked auditor gene...,anne davy
