In [104]:
import pandas as pd
# Preprocessing
import re
import preprocessor as p
import string
import contractions
from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.corpus import stopwords

In [105]:
# Download the NLTK packages this notebook relies on.
# The value of the last call is what the cell displays as its result.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

<h2> Consolidation </h2>

In [106]:
%%script false
# Consolidate all csvs into one dataframe then export as csv.
# Each source keeps its comment text under a different column name, so every
# frame is tagged with its platform and the text columns are stacked below.
fb_df = pd.read_csv('./data/social_media/Facebook_Consolidated.csv')
fb_df["platform"] = "Facebook"
reddit_df = pd.read_csv('./data/social_media/reddit.csv')
reddit_df["platform"] = "Reddit"
tiktok_df = pd.read_excel('./data/social_media/Tiktok.xlsx')
tiktok_df["platform"] = "Tiktok"
youtube_df = pd.read_csv('./data/social_media/Youtube.csv')
youtube_df["platform"] = "Youtube"
# Create master_series.
# pd.concat replaces the chained Series.append calls (Series.append was
# removed in pandas 2.0); ignore_index=True rebuilds a single 0..n-1 index.
text = pd.concat(
    [fb_df["0"], reddit_df["body"], tiktok_df["Comment Text"], youtube_df["Comment"]],
    ignore_index=True,
)
platform = pd.concat(
    [fb_df["platform"], reddit_df["platform"], tiktok_df["platform"], youtube_df["platform"]],
    ignore_index=True,
)
# Create master_df (text and platform share the same index, so rows stay aligned)
master_df = pd.DataFrame({"text": text, "platform": platform})
# Drop master_df rows whose text is NaN
master_df = master_df.dropna(subset=["text"])
master_df.to_csv('./data/consolidated.csv', index=False)


Couldn't find program: 'false'


<h2> Preprocessing </h2>
Note: entries left with fewer than two tokens after preprocessing are removed in the preprocessing cell below.

In [107]:
# Load the consolidated comments, expose the text column under the name "raw",
# and keep only the two columns used downstream (platform first).
dataset = (
    pd.read_csv('./data/consolidated.csv')
    .rename(columns={'text': 'raw'})
    .loc[:, ["platform", "raw"]]
)
# dataset = pd.read_csv('./data/sample.csv')
# dataset.columns = ['raw']

In [108]:
# Add the working column "preprocessed", initialised with the raw text
dataset = dataset.assign(preprocessed=dataset["raw"])

In [109]:
# Preview the first rows; at this point "preprocessed" is still identical to "raw"
dataset.head()

Unnamed: 0,platform,raw,preprocessed
0,Facebook,Why nowadays every thing seem to be increasin...,Why nowadays every thing seem to be increasin...
1,Facebook,I will have to disagree.. we’re not that high!!,I will have to disagree.. we’re not that high!!
2,Facebook,"Wag po tayong mag-alala. Naniniwala po ako, is...","Wag po tayong mag-alala. Naniniwala po ako, is..."
3,Facebook,Ok lang yang lahat naman nang bansa ganyan. Sa...,Ok lang yang lahat naman nang bansa ganyan. Sa...
4,Facebook,Sama-sama tayong BABAON muli.,Sama-sama tayong BABAON muli.


In [110]:
# Lemmatizer shared by lemmatize_text
lemma = WordNetLemmatizer()

# stop words: start from NLTK's English list
stop = set(stopwords.words('english'))
# include filipino stopwords (one word per line in each file).
# Each file is opened in a `with` block so both handles are closed; previously
# the first handle was rebound before close() and therefore leaked.
for stopword_path in ("./data/stopwords.txt", "./data/stopwords2.txt"):
    with open(stopword_path, "r") as f:
        stop.update(line.strip() for line in f)

# Helper functions
def remove_stopwords(text):
    """Return the items of ``text`` that are not in the module-level ``stop`` set.

    ``text`` is iterated item by item (the pipeline passes a list of tokens);
    the order of the surviving items is preserved and a new list is returned.
    """
    kept = []
    for token in text:
        if token in stop:
            continue
        kept.append(token)
    return kept

def lemmatize_text(tokenized_text):
    """Lemmatize every token with the module-level ``lemma`` lemmatizer.

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmas = []
    for token in tokenized_text:
        lemmas.append(lemma.lemmatize(token))
    return lemmas

In [111]:
# Number of rows in the dataset at this point in the notebook
len(dataset)

5209

In [112]:
# Remove NA first: the string operations below raise on NaN entries, so rows
# with missing values have to be gone before any text processing starts.
# Then drop duplicates.
# NOTE(review): keep=False discards every copy of a repeated row, including
# its first occurrence -- confirm this is intended rather than keep="first".
# .copy() gives an independent frame so the column assignments below do not
# trigger chained-assignment warnings.
dataset = dataset.dropna().drop_duplicates(keep=False).copy()
# Twitter preprocess. This must see the untouched text: p.clean recognises
# URLs, mentions, hashtags and smileys by their punctuation, so running it
# after punctuation removal left URL residue behind as junk tokens.
dataset["preprocessed"] = [p.clean(entry) for entry in dataset["preprocessed"]]
# Convert to lowercase
dataset["preprocessed"] = [entry.lower() for entry in dataset["preprocessed"]]
# Remove punctuation marks (translation table is built once, not per entry)
punctuation_table = str.maketrans('', '', string.punctuation)
dataset["preprocessed"] = [entry.translate(punctuation_table) for entry in dataset["preprocessed"]]
# Remove numbers
dataset["preprocessed"] = [re.sub(r'\d+', '', entry) for entry in dataset["preprocessed"]]
# Tokenization
dataset["preprocessed"] = [word_tokenize(entry) for entry in dataset["preprocessed"]]
# Normalization
dataset["preprocessed"] = dataset["preprocessed"].apply(lemmatize_text)
# Remove stopwords
dataset["preprocessed"] = dataset["preprocessed"].apply(remove_stopwords)
# Drop rows with preprocessed tokens length less than 2
dataset["length"] = dataset["preprocessed"].apply(len)
dataset = dataset[dataset["length"] > 1].copy()

In [117]:
# Number of rows in the dataset at this point, for comparison with the earlier count
len(dataset)

4725

In [113]:
# Display the frame to inspect the token lists and the "length" column
dataset

Unnamed: 0,platform,raw,preprocessed,length
0,Facebook,Why nowadays every thing seem to be increasin...,"[nowadays, every, thing, seem, increasing, gov...",17
1,Facebook,I will have to disagree.. we’re not that high!!,"[disagree, high]",2
2,Facebook,"Wag po tayong mag-alala. Naniniwala po ako, is...","[wag, tayong, magalala, naniniwala, isusuprise...",16
3,Facebook,Ok lang yang lahat naman nang bansa ganyan. Sa...,"[ok, yang, nang, bansa, ganyan, selfish, fanat...",8
4,Facebook,Sama-sama tayong BABAON muli.,"[samasama, tayong, babaon]",3
...,...,...,...,...
5204,Youtube,mukhang nakashabu,"[mukhang, nakashabu]",2
5205,Youtube,Bbm is mixed up and confusing on economic terms,"[bbm, mixed, confusing, economic, term]",5
5206,Youtube,Mr.utal utal,"[mrutal, utal]",2
5207,Youtube,Kung Si Leni Pa Yan \r\n\r\nLutang Na Tayo,"[leni, lutang]",2


In [115]:
# Write the dataset to disk without the index column.
# NOTE(review): "preprocessed" holds Python lists, which presumably end up in the
# CSV as their string form -- confirm that downstream readers parse them back.
dataset.to_csv('./data/preprocessed.csv', index=False)

In [116]:
# preprocessed = pd.read_csv('./data/preprocessed.csv')
# df = pd.DataFrame({"raw":dataset["text"], "preprocessed":preprocessed["text"]})