<a href="https://colab.research.google.com/github/EmmaCOo/ADS509_Text_Mining_Final_Project/blob/main/Final_Project_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **ADS509_TEXT_MINING_FINAL_PROJECT - DATA PREPROCESSING**

**EMMA OO**



In [None]:
!pip install emoji==1.7

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import re
import emoji
import pandas as pd
import numpy as np

from collections import Counter, defaultdict
from nltk.corpus import stopwords
from string import punctuation

import nltk
nltk.download('stopwords')

sw = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load one CSV per burger chain from a single shared folder.
# The folder is named once here so that moving the data means editing one line.
# NOTE(review): the path looks like a Colab Google Drive mount, but no mount call
# is visible in this notebook — confirm the drive is mounted before this cell runs.
DATA_DIR = '/content/drive/MyDrive/Text_Mining_Final_Project'

burger = pd.read_csv(os.path.join(DATA_DIR, 'burgerk.csv'))
jackbox = pd.read_csv(os.path.join(DATA_DIR, 'jack.csv'))
innout = pd.read_csv(os.path.join(DATA_DIR, 'innout.csv'))
chickfila = pd.read_csv(os.path.join(DATA_DIR, 'chickfila.csv'))
sonic = pd.read_csv(os.path.join(DATA_DIR, 'sonic.csv'))

In [None]:
# Tag every row of each frame with the chain it was loaded for; `name` is the class column.
# NOTE(review): the previews further down show `name` in the middle of the column list
# rather than at the end, which suggests the CSVs already had a `name` column that is
# overwritten here — confirm that this is intended.
for frame, chain in (
  (burger, 'BurgerKing'),
  (jackbox, 'JackInTheBox'),
  (innout, 'Inn-N-Out'),
  (chickfila, 'Chick-Fil-A'),
  (sonic, 'Sonic'),
):
  frame['name'] = chain

In [None]:
# Stack the five per-chain frames into a single frame.
# ignore_index=True gives every row its own 0..n-1 label. Without it each frame
# keeps its own labels, so the same label repeats across chains and a label-based
# lookup such as df.loc[0] returns several rows instead of one.
frames = [burger, jackbox, innout, chickfila, sonic]
df = pd.concat(frames, ignore_index=True)
df.shape

(56176, 8)

In [None]:
# Keep only accounts that appear exactly once across all chains. An account that
# follows more than one chain is dropped entirely (not reduced to a single row),
# so every remaining follower carries exactly one class label.
# duplicated(keep=False) flags every occurrence of a repeated screen_name in one
# vectorized pass instead of calling a Python lambda once per screen_name group.
# notna() preserves the groupby behaviour of discarding rows without a screen_name.
is_repeated = df['screen_name'].duplicated(keep=False)
clean_df = df[df['screen_name'].notna() & ~is_repeated]
clean_df.shape

(51113, 8)

In [None]:
# Sanity check: `count` and `unique` should be equal if every screen_name now occurs once.
clean_df['screen_name'].describe()

count           51113
unique          51113
top       chief_immhi
freq                1
Name: screen_name, dtype: object

In [None]:
# Number of remaining rows per chain label, to see how balanced the classes are.
clean_df['name'].value_counts()

JackInTheBox    13541
Chick-Fil-A     13360
Sonic           10123
Inn-N-Out        8202
BurgerKing       5887
Name: name, dtype: int64

In [None]:
# Preview the first rows of the filtered frame.
clean_df.head()

Unnamed: 0,ID,screen_name,name,location,follower_count,friends_count,like_count,description
0,1374753560659902468,chief_immhi,BurgerKing,,65,774,,Public journal | Getting 1% better everyday
1,1574533805871857666,VirtuAli8474,BurgerKing,Hell,0,104,,He/They Gaymer Artist :D Discord: alii #8474
3,1248367068895940608,nyc_misael,BurgerKing,"New York, USA",8,925,,🤨
4,365843255,Jillers86,BurgerKing,Canada,1250,1043,,"Retired @Twitch OG, gamer, wife, fur mama, & #..."
5,1457542240788693000,Prixnncess29,BurgerKing,,11,56,,🇲🇽💞


### **TEXT MINING**

In [None]:
# Extract only the description (the text data) and the chain label (the class).
text_columns = ['description', 'name']
text_df = clean_df[text_columns]
text_df.head()

Unnamed: 0,description,name
0,Public journal | Getting 1% better everyday,BurgerKing
1,He/They Gaymer Artist :D Discord: alii #8474,BurgerKing
3,🤨,BurgerKing
4,"Retired @Twitch OG, gamer, wife, fur mama, & #...",BurgerKing
5,🇲🇽💞,BurgerKing


### **DATA CLEANING**

In [None]:
# Hold the punctuation characters in a set so each membership test is a hash lookup.
punctuation = set(punctuation)

def remove_punc(text):
  """Return `text` with every character that is in `punctuation` removed.

  All other characters, including whitespace, are kept in their original order.
  """
  kept_chars = (ch for ch in text if ch not in punctuation)
  return "".join(kept_chars)

# Tokenization.
# RE_TOKEN is a pattern that matches words/hashtags, simple text emoticons and a
# coarse range of unicode emoji code points.
# NOTE(review): tokenize() below splits on whitespace and does not reference
# RE_TOKEN; no other use of the pattern is visible in this part of the notebook.
RE_TOKEN = re.compile(r"""
                   ( [#]?[@\w'’\.\-\:]*\w     # words, hashtags and email addresses
                   | [:;<]\-?[\)\(3]          # coarse pattern for basic text emojis
                   | [\U0001F100-\U0001FFFF]  # coarse code range for unicode emojis
                   )
                  """, flags=re.VERBOSE)

def tokenize(text):
  """Split `text` on runs of whitespace and return the pieces as a list.

  Characters without whitespace between them (for example adjacent emojis) stay
  together in one token.
  """
  tokens = text.split()
  return tokens

# Stop-word removal.
# The NLTK English stop-word list is held as a set for fast membership tests.
# This rebinds the name `stopwords`, which the import cell bound to the NLTK corpus
# reader, so the list is fetched through `nltk.corpus.stopwords` here.
# NOTE(review): the pipeline strips punctuation before this filter runs, so a stop
# word that contains an apostrophe can no longer match its stripped token —
# confirm whether that matters for the analysis.
stopwords = set(nltk.corpus.stopwords.words('english'))

def remove_stop(text):
  """Return the tokens in `text` that are not in `stopwords`, in their original order."""
  kept_tokens = []
  for token in text:
    if token not in stopwords:
      kept_tokens.append(token)
  return kept_tokens


# Cleaning steps, applied in this order:
# lowercase -> strip punctuation -> split into tokens -> drop stop words.
pipeline = [
  str.lower,
  remove_punc,
  tokenize,
  remove_stop,
]
def prepare(text, pipeline):
  """Run `text` through each callable in `pipeline`, passing every result to the next step.

  Returns whatever the last step returns; with an empty pipeline, `text` is returned unchanged.
  """
  result = text
  for step in pipeline:
    result = step(result)
  return result

In [None]:
# Blank out missing descriptions first, then cast every column to str.
# The order matters: astype(str) turns NaN into the literal text 'nan', after which
# a NaN replacement finds nothing to replace and 'nan' ends up as a token.
text_df = text_df.fillna({'description': ''}).astype(str)

In [None]:
# Run every description through the cleaning pipeline and store the resulting
# token list next to the raw text.
text_df['clean_desc'] = text_df['description'].map(
  lambda desc: prepare(desc, pipeline=pipeline)
)
text_df.head()

Unnamed: 0,description,name,clean_desc
0,Public journal | Getting 1% better everyday,BurgerKing,"[public, journal, getting, 1, better, everyday]"
1,He/They Gaymer Artist :D Discord: alii #8474,BurgerKing,"[hethey, gaymer, artist, discord, alii, 8474]"
3,🤨,BurgerKing,[🤨]
4,"Retired @Twitch OG, gamer, wife, fur mama, & #...",BurgerKing,"[retired, twitch, og, gamer, wife, fur, mama, ..."
5,🇲🇽💞,BurgerKing,[🇲🇽💞]
