### ingest data

In [15]:
import pandas as pd

In [16]:
data = pd.read_parquet('../data/data_combined.parquet', engine='pyarrow')
data.head(3)

Unnamed: 0,id,topic,question_title,question_content,best_answer,has_emoji
0,700000,3,Are Monte verde golden toads extict or endange...,I have to do a report and model on an endanger...,"Since 1989, not a single Golden Toad has been ...",False
1,700001,6,"looking for a book titled "" Medical Filing"" by...",,amazon.com \nhttp://www.amazon.com/gp/search/r...,False
2,700002,7,ShoulD i StoP?,Should i stop asking dumb questions? \n\nAm i ...,"why should you stop, it is your life do what y...",False


In [17]:
data_copy = data.copy()

### transform data (copy)

In [18]:
topic_labels = {
    0: "Society & Culture",
    1: "Science & Mathematics",
    2: "Health",
    3: "Education & Reference",
    4: "Computers & Internet",
    5: "Sports",
    6: "Business & Finance",
    7: "Entertainment & Music",
    8: "Family & Relationships",
    9: "Politics & Government"
}

data_copy["topic_name"] = data_copy["topic"].map(topic_labels)
data_copy.head(3)

Unnamed: 0,id,topic,question_title,question_content,best_answer,has_emoji,topic_name
0,700000,3,Are Monte verde golden toads extict or endange...,I have to do a report and model on an endanger...,"Since 1989, not a single Golden Toad has been ...",False,Education & Reference
1,700001,6,"looking for a book titled "" Medical Filing"" by...",,amazon.com \nhttp://www.amazon.com/gp/search/r...,False,Business & Finance
2,700002,7,ShoulD i StoP?,Should i stop asking dumb questions? \n\nAm i ...,"why should you stop, it is your life do what y...",False,Entertainment & Music


In [19]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)

    tokens = word_tokenize(text)

    tokens = [
        lemmatizer.lemmatize(t, pos="v") 
        for t in tokens
        if t not in stop_words
    ]
    return tokens

sample = "This is, perhaps, the best example! But doesn't seem perfect..."
print(clean_text(sample))


['perhaps', 'best', 'example', 'doesnt', 'seem', 'perfect']


In [20]:
data_copy["clean_question_title"] = data_copy["question_title"].apply(
    lambda text: " ".join(clean_text(text))
)

data_copy["clean_question_content"] = data_copy["question_content"].apply(
    lambda text: " ".join(clean_text(text))
)

data_copy["clean_best_answer"] = data_copy["best_answer"].apply(
    lambda text: " ".join(clean_text(text))
)

checking for emojis

In [21]:
import re

emoji_pattern = re.compile("[\U00010000-\U0010FFFF]", flags=re.UNICODE)

data_copy["has_emoji"] = data_copy["clean_question_title"].apply(lambda x: bool(emoji_pattern.search(x)))
data_copy["has_emoji"].unique()

array([False])

In [22]:
data_copy.clean_question_title

0                   monte verde golden toads extict endanger
1          look book title medical file theresa claeys kn...
2                                                       stop
3                                       pet name significant
4                                                     market
                                 ...                        
1399995                                 18th century ad mean
1399996    think theres double standard short women look ...
1399997                                       exactly savant
1399998                                    confuseddoes like
1399999    isnt time impeach george bush leadership arise...
Name: clean_question_title, Length: 1400000, dtype: object

In [23]:
data_copy.clean_question_content

0          report model endanger amphibiani want montever...
1                                                           
2          stop ask dumb question nnam bug youare annoy w...
3                                       whynand call younwhy
4                                                           
                                 ...                        
1399995            get date gunpowder invent need know 1700s
1399996    many 52 male sex symblos know remember prince ...
1399997                                                     
1399998    like menok im teen highskoll n crush guy while...
1399999                                                     
Name: clean_question_content, Length: 1400000, dtype: object

In [24]:
data_copy.clean_best_answer

0          since 1989 single golden toad see anywhere wor...
1          amazoncom nhttpwwwamazoncomgpsearchrefbrsshs10...
2                 stop life want love ask dumb question rock
3          honey bearnbecause look like honey bear cereal...
4          traditionally market place buyers sellers meet...
                                 ...                        
1399995    yupn18 century 1700snnbtw find chinese earlier...
1399996    yes think sonnpersonally short men unattractiv...
1399997                         httpenwikipediaorgwikisavant
1399998    youll never know get nerve talk hes probably l...
1399999    unfortunately process require 23 senate vote i...
Name: clean_best_answer, Length: 1400000, dtype: object

In [25]:
data_copy_processed = data_copy.copy()
data_copy_processed.to_parquet("../data/data_preprocessed.parquet", index=False)