# Data Cleaning / Pre-Processing

In [93]:
import pandas as pd

data = pd.read_csv("../csv/uncleaned/tourist_destinations_review_posts_baguio2.csv")
data.dropna()
data

Unnamed: 0,tourist_destination,review_post
0,Burnham Park,An excellent place for family and friends to v...
1,Burnham Park,A lovely place to be on a nice day.We were ple...
2,Burnham Park,Definitely one of the best places to visit in ...
3,Burnham Park,I have always been looking at this park ever s...
4,Burnham Park,Very beautiful park with boating and cycling a...
...,...,...
15029,Teacher's Camp,There is no ghost 👻👻👻Very affordable
15030,Teacher's Camp,Classy place
15031,Teacher's Camp,A beautiful view is cheap
15032,Teacher's Camp,well done


In [94]:
data = data.drop_duplicates(subset=['review_post'])
data

Unnamed: 0,tourist_destination,review_post
0,Burnham Park,An excellent place for family and friends to v...
1,Burnham Park,A lovely place to be on a nice day.We were ple...
2,Burnham Park,Definitely one of the best places to visit in ...
3,Burnham Park,I have always been looking at this park ever s...
4,Burnham Park,Very beautiful park with boating and cycling a...
...,...,...
15029,Teacher's Camp,There is no ghost 👻👻👻Very affordable
15030,Teacher's Camp,Classy place
15031,Teacher's Camp,A beautiful view is cheap
15032,Teacher's Camp,well done


In [95]:
import re
import nltk
from cleantext import clean

from sklearn.feature_extraction import text
stop_words = list(text.ENGLISH_STOP_WORDS)
stop_words[0:5]

['although', 'whereupon', 'de', 'my', 'these']

In [96]:
_data = data.copy()

_data['removed_special_charas_review_post'] = _data['review_post'].map(lambda x: re.sub('[^A-Za-z ]+', ' ', str(x)))
_data


Unnamed: 0,tourist_destination,review_post,removed_special_charas_review_post
0,Burnham Park,An excellent place for family and friends to v...,An excellent place for family and friends to v...
1,Burnham Park,A lovely place to be on a nice day.We were ple...,A lovely place to be on a nice day We were ple...
2,Burnham Park,Definitely one of the best places to visit in ...,Definitely one of the best places to visit in ...
3,Burnham Park,I have always been looking at this park ever s...,I have always been looking at this park ever s...
4,Burnham Park,Very beautiful park with boating and cycling a...,Very beautiful park with boating and cycling a...
...,...,...,...
15029,Teacher's Camp,There is no ghost 👻👻👻Very affordable,There is no ghost Very affordable
15030,Teacher's Camp,Classy place,Classy place
15031,Teacher's Camp,A beautiful view is cheap,A beautiful view is cheap
15032,Teacher's Camp,well done,well done


In [97]:
_data['to_lower_case_review_post'] = _data['removed_special_charas_review_post'].map(lambda x: x.lower())
_data

Unnamed: 0,tourist_destination,review_post,removed_special_charas_review_post,to_lower_case_review_post
0,Burnham Park,An excellent place for family and friends to v...,An excellent place for family and friends to v...,an excellent place for family and friends to v...
1,Burnham Park,A lovely place to be on a nice day.We were ple...,A lovely place to be on a nice day We were ple...,a lovely place to be on a nice day we were ple...
2,Burnham Park,Definitely one of the best places to visit in ...,Definitely one of the best places to visit in ...,definitely one of the best places to visit in ...
3,Burnham Park,I have always been looking at this park ever s...,I have always been looking at this park ever s...,i have always been looking at this park ever s...
4,Burnham Park,Very beautiful park with boating and cycling a...,Very beautiful park with boating and cycling a...,very beautiful park with boating and cycling a...
...,...,...,...,...
15029,Teacher's Camp,There is no ghost 👻👻👻Very affordable,There is no ghost Very affordable,there is no ghost very affordable
15030,Teacher's Camp,Classy place,Classy place,classy place
15031,Teacher's Camp,A beautiful view is cheap,A beautiful view is cheap,a beautiful view is cheap
15032,Teacher's Camp,well done,well done,well done


In [98]:
_data['removed_stop_words_review_post'] = _data['to_lower_case_review_post'].map(lambda x: " ".join(x for x in x.split() if x not in stop_words))
_data

Unnamed: 0,tourist_destination,review_post,removed_special_charas_review_post,to_lower_case_review_post,removed_stop_words_review_post
0,Burnham Park,An excellent place for family and friends to v...,An excellent place for family and friends to v...,an excellent place for family and friends to v...,excellent place family friends visit entrance ...
1,Burnham Park,A lovely place to be on a nice day.We were ple...,A lovely place to be on a nice day We were ple...,a lovely place to be on a nice day we were ple...,lovely place nice day pleasantly surprised ple...
2,Burnham Park,Definitely one of the best places to visit in ...,Definitely one of the best places to visit in ...,definitely one of the best places to visit in ...,definitely best places visit baguio burnham pa...
3,Burnham Park,I have always been looking at this park ever s...,I have always been looking at this park ever s...,i have always been looking at this park ever s...,looking park chance review park flexed view pa...
4,Burnham Park,Very beautiful park with boating and cycling a...,Very beautiful park with boating and cycling a...,very beautiful park with boating and cycling a...,beautiful park boating cycling available insid...
...,...,...,...,...,...
15029,Teacher's Camp,There is no ghost 👻👻👻Very affordable,There is no ghost Very affordable,there is no ghost very affordable,ghost affordable
15030,Teacher's Camp,Classy place,Classy place,classy place,classy place
15031,Teacher's Camp,A beautiful view is cheap,A beautiful view is cheap,a beautiful view is cheap,beautiful view cheap
15032,Teacher's Camp,well done,well done,well done,


In [99]:
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()

_data['lemmatized_words_review_post'] = _data['removed_stop_words_review_post'].map(lambda x: " ".join(lem.lemmatize(str(x)) for x in x.split()))
_data

Unnamed: 0,tourist_destination,review_post,removed_special_charas_review_post,to_lower_case_review_post,removed_stop_words_review_post,lemmatized_words_review_post
0,Burnham Park,An excellent place for family and friends to v...,An excellent place for family and friends to v...,an excellent place for family and friends to v...,excellent place family friends visit entrance ...,excellent place family friend visit entrance f...
1,Burnham Park,A lovely place to be on a nice day.We were ple...,A lovely place to be on a nice day We were ple...,a lovely place to be on a nice day we were ple...,lovely place nice day pleasantly surprised ple...,lovely place nice day pleasantly surprised ple...
2,Burnham Park,Definitely one of the best places to visit in ...,Definitely one of the best places to visit in ...,definitely one of the best places to visit in ...,definitely best places visit baguio burnham pa...,definitely best place visit baguio burnham par...
3,Burnham Park,I have always been looking at this park ever s...,I have always been looking at this park ever s...,i have always been looking at this park ever s...,looking park chance review park flexed view pa...,looking park chance review park flexed view pa...
4,Burnham Park,Very beautiful park with boating and cycling a...,Very beautiful park with boating and cycling a...,very beautiful park with boating and cycling a...,beautiful park boating cycling available insid...,beautiful park boating cycling available insid...
...,...,...,...,...,...,...
15029,Teacher's Camp,There is no ghost 👻👻👻Very affordable,There is no ghost Very affordable,there is no ghost very affordable,ghost affordable,ghost affordable
15030,Teacher's Camp,Classy place,Classy place,classy place,classy place,classy place
15031,Teacher's Camp,A beautiful view is cheap,A beautiful view is cheap,a beautiful view is cheap,beautiful view cheap,beautiful view cheap
15032,Teacher's Camp,well done,well done,well done,,


In [100]:
_data.to_csv('../csv/cleaned/cleaned_variations_review_posts2.csv', index = False)