In [1]:
""" Import Packages """

import pandas as pd
import re 

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [2]:
""" Load scraped data file """

data = pd.read_csv('total_aussieindustries.csv', encoding='utf-8')
data.head()

Unnamed: 0,Job Title,Company,Location,Job Description,Salary,Rating,Reviews,Search Occupation,Search Location,Occupation,Industry
0,Barista - The Coffee Club Sydney Olympic Park,The Coffee Club,Sydney Olympic Park NSW,Why work for us as a Barista at The Coffee Clu...,,3.6,442.0,barista,sydney,Barista,Accommodation and Food Services
1,Barista Position at Sorry Thanks I Love You,Sorry Thanks I Love You,Sydney NSW,Sorry Thanks I Love YouWe've just unveiled our...,,,,barista,sydney,Barista,Accommodation and Food Services
2,Retail & Cafe Attendant,SEA LIFE Sydney,Sydney NSW,"Retail & Cafe AttendantSEA LIFE Sydney, 1-5 Wh...",,,,barista,sydney,Barista,Accommodation and Food Services
3,Boardroom Waiter/Barista,The Clifford Wallace Agency,Sydney NSW,Boardroom Waiter/BaristaCasual SydneyClifford ...,,4.7,3.0,barista,sydney,Barista,Accommodation and Food Services
4,Baristas,The Grounds of Alexandria,Sydney NSW 2000,"As a Grounds Barista, you will have extensive ...",,4.0,15.0,barista,sydney,Barista,Accommodation and Food Services


In [3]:
""" Checking type in column """

type(data['Job Description'])

pandas.core.series.Series

In [4]:
""" Converting Job Description column to string """

data['Job Description'] = data['Job Description'].astype(str)
print(data['Job Description'][0])

Why work for us as a Barista at The Coffee Club?Because here at The Coffee Club we are committed to making every member of our team feel part of our extended family — proud of who they are and empowered in their role to deliver on our brand vision.Thrive in a fast paced and dynamic team environment where you will be essential to consistently create the 'Happy Place' for our customers. In this role, coffee is the hero and your barista skills will shine!Here’s the story...From the very beginning The Coffee Club was founded on family, friends and customer needs. The idea was to create a place that was more than just somewhere for people to meet for coffee; it would be more casual, comfortable – yet sophisticated, stylish – yet affordable. Now serving millions of loyal customers as the world's largest Australian café chain.You are:Someone who understands that we’re The Coffee Club and ‘coffee’ is our middle name. We are serious about coffee - it is central to everything we do. So if you’re

In [5]:
""" Creating column for cleaned data and removing all strings starting with http """

data['clean'] = data['Job Description'].str.replace(r'http\S+', '', regex = True)
print(data['clean'][0])


Why work for us as a Barista at The Coffee Club?Because here at The Coffee Club we are committed to making every member of our team feel part of our extended family — proud of who they are and empowered in their role to deliver on our brand vision.Thrive in a fast paced and dynamic team environment where you will be essential to consistently create the 'Happy Place' for our customers. In this role, coffee is the hero and your barista skills will shine!Here’s the story...From the very beginning The Coffee Club was founded on family, friends and customer needs. The idea was to create a place that was more than just somewhere for people to meet for coffee; it would be more casual, comfortable – yet sophisticated, stylish – yet affordable. Now serving millions of loyal customers as the world's largest Australian café chain.You are:Someone who understands that we’re The Coffee Club and ‘coffee’ is our middle name. We are serious about coffee - it is central to everything we do. So if you’re

In [6]:
""" Inserting whitespace in between all punctuation """

data['clean'] = data['clean'].str.replace(r'[^A-Za-z0-9]+', ' ', regex = True)
print(data['clean'][0])


Why work for us as a Barista at The Coffee Club Because here at The Coffee Club we are committed to making every member of our team feel part of our extended family proud of who they are and empowered in their role to deliver on our brand vision Thrive in a fast paced and dynamic team environment where you will be essential to consistently create the Happy Place for our customers In this role coffee is the hero and your barista skills will shine Here s the story From the very beginning The Coffee Club was founded on family friends and customer needs The idea was to create a place that was more than just somewhere for people to meet for coffee it would be more casual comfortable yet sophisticated stylish yet affordable Now serving millions of loyal customers as the world s largest Australian caf chain You are Someone who understands that we re The Coffee Club and coffee is our middle name We are serious about coffee it is central to everything we do So if you re a Barista at The Coffee 

In [7]:
""" Quite a lot of words are merged, but have an upper case letter (e.g) LetterIF. 
Replacing lower/upper case pattern with whitespace. """

data['clean'] = data['clean'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ')
print(data['clean'][0])


  data['clean'] = data['clean'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ')


Why work for us as a Barista at The Coffee Club Because here at The Coffee Club we are committed to making every member of our team feel part of our extended family proud of who they are and empowered in their role to deliver on our brand vision Thrive in a fast paced and dynamic team environment where you will be essential to consistently create the Happy Place for our customers In this role coffee is the hero and your barista skills will shine Here s the story From the very beginning The Coffee Club was founded on family friends and customer needs The idea was to create a place that was more than just somewhere for people to meet for coffee it would be more casual comfortable yet sophisticated stylish yet affordable Now serving millions of loyal customers as the world s largest Australian caf chain You are Someone who understands that we re The Coffee Club and coffee is our middle name We are serious about coffee it is central to everything we do So if you re a Barista at The Coffee 

In [8]:
""" Now we can lower case all text """

data['clean']= data['clean'].apply(lambda x: x.lower())
print(data['clean'][0])

why work for us as a barista at the coffee club because here at the coffee club we are committed to making every member of our team feel part of our extended family proud of who they are and empowered in their role to deliver on our brand vision thrive in a fast paced and dynamic team environment where you will be essential to consistently create the happy place for our customers in this role coffee is the hero and your barista skills will shine here s the story from the very beginning the coffee club was founded on family friends and customer needs the idea was to create a place that was more than just somewhere for people to meet for coffee it would be more casual comfortable yet sophisticated stylish yet affordable now serving millions of loyal customers as the world s largest australian caf chain you are someone who understands that we re the coffee club and coffee is our middle name we are serious about coffee it is central to everything we do so if you re a barista at the coffee 

In [9]:
""" Selecting tokenizer from nltk """
tokenizer = nltk.tokenize.TreebankWordTokenizer()

In [10]:
""" Tokenize text """

data['clean']= data['clean'].apply(lambda x: tokenizer.tokenize(x))
print(data['clean'][0])

['why', 'work', 'for', 'us', 'as', 'a', 'barista', 'at', 'the', 'coffee', 'club', 'because', 'here', 'at', 'the', 'coffee', 'club', 'we', 'are', 'committed', 'to', 'making', 'every', 'member', 'of', 'our', 'team', 'feel', 'part', 'of', 'our', 'extended', 'family', 'proud', 'of', 'who', 'they', 'are', 'and', 'empowered', 'in', 'their', 'role', 'to', 'deliver', 'on', 'our', 'brand', 'vision', 'thrive', 'in', 'a', 'fast', 'paced', 'and', 'dynamic', 'team', 'environment', 'where', 'you', 'will', 'be', 'essential', 'to', 'consistently', 'create', 'the', 'happy', 'place', 'for', 'our', 'customers', 'in', 'this', 'role', 'coffee', 'is', 'the', 'hero', 'and', 'your', 'barista', 'skills', 'will', 'shine', 'here', 's', 'the', 'story', 'from', 'the', 'very', 'beginning', 'the', 'coffee', 'club', 'was', 'founded', 'on', 'family', 'friends', 'and', 'customer', 'needs', 'the', 'idea', 'was', 'to', 'create', 'a', 'place', 'that', 'was', 'more', 'than', 'just', 'somewhere', 'for', 'people', 'to', 'mee

In [11]:
""" Loading and inspecting stopwords """

stops = list(stopwords.words('english'))
print(stops)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [12]:
""" Removing stopwords """

data['clean']= data['clean'].apply(lambda x: [word for word in x if word not in stops])
print(data['clean'][0])

['work', 'us', 'barista', 'coffee', 'club', 'coffee', 'club', 'committed', 'making', 'every', 'member', 'team', 'feel', 'part', 'extended', 'family', 'proud', 'empowered', 'role', 'deliver', 'brand', 'vision', 'thrive', 'fast', 'paced', 'dynamic', 'team', 'environment', 'essential', 'consistently', 'create', 'happy', 'place', 'customers', 'role', 'coffee', 'hero', 'barista', 'skills', 'shine', 'story', 'beginning', 'coffee', 'club', 'founded', 'family', 'friends', 'customer', 'needs', 'idea', 'create', 'place', 'somewhere', 'people', 'meet', 'coffee', 'would', 'casual', 'comfortable', 'yet', 'sophisticated', 'stylish', 'yet', 'affordable', 'serving', 'millions', 'loyal', 'customers', 'world', 'largest', 'australian', 'caf', 'chain', 'someone', 'understands', 'coffee', 'club', 'coffee', 'middle', 'name', 'serious', 'coffee', 'central', 'everything', 'barista', 'coffee', 'club', 'need', 'following', 'attributes', 'coffee', 'lover', 'passionate', 'making', 'perfect', 'cup', 'coffee', 'eve

In [13]:
print(data['clean'][0])

['work', 'us', 'barista', 'coffee', 'club', 'coffee', 'club', 'committed', 'making', 'every', 'member', 'team', 'feel', 'part', 'extended', 'family', 'proud', 'empowered', 'role', 'deliver', 'brand', 'vision', 'thrive', 'fast', 'paced', 'dynamic', 'team', 'environment', 'essential', 'consistently', 'create', 'happy', 'place', 'customers', 'role', 'coffee', 'hero', 'barista', 'skills', 'shine', 'story', 'beginning', 'coffee', 'club', 'founded', 'family', 'friends', 'customer', 'needs', 'idea', 'create', 'place', 'somewhere', 'people', 'meet', 'coffee', 'would', 'casual', 'comfortable', 'yet', 'sophisticated', 'stylish', 'yet', 'affordable', 'serving', 'millions', 'loyal', 'customers', 'world', 'largest', 'australian', 'caf', 'chain', 'someone', 'understands', 'coffee', 'club', 'coffee', 'middle', 'name', 'serious', 'coffee', 'central', 'everything', 'barista', 'coffee', 'club', 'need', 'following', 'attributes', 'coffee', 'lover', 'passionate', 'making', 'perfect', 'cup', 'coffee', 'eve

In [14]:
""" Checking data type for row in column """

type(data['clean'][0])

list

In [15]:
""" Save cleaned data to dataframe to .csv file """

data.to_csv("data_cleaned_properly.csv")