In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re

In [13]:
# Code to download nltk packages.
# Currently disabled: the whole snippet is wrapped in a triple-quoted string, so
# none of it is executed. Because that string is the cell's last expression, the
# notebook simply echoes it back as the cell output.
# To fetch the NLTK data, remove the surrounding triple quotes and run the cell once.
# The ssl lines replace the default HTTPS context with an unverified one (when the
# running Python exposes it) before nltk.download() is called.
"""
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()
"""

'\nimport nltk\nimport ssl\n\ntry:\n    _create_unverified_https_context = ssl._create_unverified_context\nexcept AttributeError:\n    pass\nelse:\n    ssl._create_default_https_context = _create_unverified_https_context\n\nnltk.download()\n'

## Text Cleaning (preprocessing)

In [14]:
# Name of the column that holds the raw text; change this depending on the dataset.
textColumn = 'Headline'

# Read the CSV file that the rest of the notebook cleans.
dataset = pd.read_csv('Datasets/SentimentAnalysis.csv')

# Show the first five rows as a quick sanity check of what was loaded.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Headline,Sentiment
0,0,Australia news LIVE: Census 2021 results relea...,-0.210526
1,1,ASX opens up BWX dives 38pc Collins Foods Tass...,0.0
2,2,Movie star unrecognisable as he is jailed - ne...,-0.4
3,3,Australia's aged-care sector losing 65000 work...,-0.3
4,4,Innovative treatment reduces post-traumatic he...,-0.1


## Remove Special Characters
Removes HTML tags, which can sneak in when text is copied and pasted from an online article. Also removes commas, quotation marks and any other character that is not a letter, digit or whitespace.

In [15]:
#Remove tags and Special characters
#Using Regular Expressions
# The vectorised .str accessor is used instead of a per-row lambda around re.sub
# so that missing headlines (NaN) pass through untouched rather than raising a
# TypeError. Results for ordinary strings are the same as before.

# Removing HTML Tags: non-greedy match of everything between '<' and '>'.
dataset[textColumn] = dataset[textColumn].str.replace(r'<.*?>', '', regex=True)

# Removing Special Characters: every character that is not a letter, digit or
# whitespace is deleted (not replaced by a space), so hyphenated words and
# possessives end up fused, e.g. "aged-care" -> "agedcare".
dataset[textColumn] = dataset[textColumn].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

dataset.head()

Unnamed: 0.1,Unnamed: 0,Headline,Sentiment
0,0,Australia news LIVE Census 2021 results releas...,-0.210526
1,1,ASX opens up BWX dives 38pc Collins Foods Tass...,0.0
2,2,Movie star unrecognisable as he is jailed new...,-0.4
3,3,Australias agedcare sector losing 65000 worker...,-0.3
4,4,Innovative treatment reduces posttraumatic hea...,-0.1


## Convert to Lower Case

In [16]:
#Convert to lower case
# .str.lower() lower-cases every headline in one vectorised call and leaves
# missing values (NaN) as they are, whereas calling x.lower() inside apply()
# would raise on the first non-string entry.
dataset[textColumn] = dataset[textColumn].str.lower()

dataset.head()

Unnamed: 0.1,Unnamed: 0,Headline,Sentiment
0,0,australia news live census 2021 results releas...,-0.210526
1,1,asx opens up bwx dives 38pc collins foods tass...,0.0
2,2,movie star unrecognisable as he is jailed new...,-0.4
3,3,australias agedcare sector losing 65000 worker...,-0.3
4,4,innovative treatment reduces posttraumatic hea...,-0.1


## Stop Word Removal
Removes stop words that don't add much meaning to the text, e.g. 'a', 'in', 'as', 'is'.

In [17]:
#Remove Stop words
from nltk.corpus import stopwords

# Load the English stop-word list. On a fresh environment the corpus may not be
# installed yet, in which case NLTK raises LookupError; fetch just that corpus
# and retry so the notebook still runs top to bottom after a kernel restart.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Split each cleaned headline on whitespace and keep only the tokens that are
# not stop words. From here on the column holds lists of tokens instead of
# strings, so this cell cannot be re-run without re-running the cells above it.
# NOTE(review): apostrophes were already stripped by the special-character step,
# so contractions now look like "dont"; confirm whether the stop list still
# catches them.
dataset[textColumn] = dataset[textColumn].apply(
    lambda text: [token for token in text.split() if token not in stop_words]
)

In [18]:
# Preview the first five rows: each headline should now be a list of tokens with the stop words gone.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Headline,Sentiment
0,0,"[australia, news, live, census, 2021, results,...",-0.210526
1,1,"[asx, opens, bwx, dives, 38pc, collins, foods,...",0.0
2,2,"[movie, star, unrecognisable, jailed, newscomau]",-0.4
3,3,"[australias, agedcare, sector, losing, 65000, ...",-0.3
4,4,"[innovative, treatment, reduces, posttraumatic...",-0.1


## Lemmatization
Dictionary-based stemming, which results in more correct meanings and spellings.

In [19]:
#Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# WordNet is only loaded when the lemmatizer is first used. If the corpus is not
# installed that first call raises LookupError, so probe once here and fetch the
# data when needed instead of failing part-way through the column.
try:
    lemmatizer.lemmatize('words')
except LookupError:
    # 'omw-1.4' is fetched as well because some NLTK releases need it to load WordNet.
    for resource in ('wordnet', 'omw-1.4'):
        nltk.download(resource, quiet=True)

# Lemmatize every token of every headline. lemmatize() is called without a
# part-of-speech argument, which NLTK treats as a noun lookup, so verb forms
# such as "losing" come back unchanged.
dataset[textColumn] = dataset[textColumn].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

In [20]:
# Preview the first five rows to inspect the lemmatized token lists.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Headline,Sentiment
0,0,"[australia, news, live, census, 2021, result, ...",-0.210526
1,1,"[asx, open, bwx, dive, 38pc, collins, food, ta...",0.0
2,2,"[movie, star, unrecognisable, jailed, newscomau]",-0.4
3,3,"[australia, agedcare, sector, losing, 65000, w...",-0.3
4,4,"[innovative, treatment, reduces, posttraumatic...",-0.1


## Vectorizing
Join the words back into sentences, because the vectorizer works with strings rather than lists of tokens; it tokenizes the text itself.

*Might not necessarily be done as a part of cleaning our data, but this is the next step before feeding the data to our model*

In [21]:
#Vectorizing
# TfidfVectorizer is already imported in the notebook's first cell, so it is not
# imported a second time here.
vectorizer = TfidfVectorizer()

# Glue each token list back into a single space-separated string, then learn the
# vocabulary and build the TF-IDF matrix in one pass.
vectorizedData = vectorizer.fit_transform(
    dataset[textColumn].map(' '.join)
)