### ingest data

In [1]:
import pandas as pd

In [2]:
data = pd.read_parquet('../data/data.parquet', engine='pyarrow')
data.head(3)

Unnamed: 0,id,topic,question_title,question_content,best_answer
0,0,4,why doesn't an optical mouse work on a glass t...,or even on some surfaces?,Optical mice use an LED and a camera to rapidl...
1,1,5,What is the best off-road motorcycle trail ?,long-distance trail throughout CA,i hear that the mojave road is amazing!<br />\...
2,2,2,What is Trans Fat? How to reduce that?,I heard that tras fat is bad for the body. Wh...,Trans fats occur in manufactured foods during ...


In [3]:
data_copy = data.copy()

### transform data (copy)

In [4]:
topic_labels = {
    0: "Society & Culture",
    1: "Science & Mathematics",
    2: "Health",
    3: "Education & Reference",
    4: "Computers & Internet",
    5: "Sports",
    6: "Business & Finance",
    7: "Entertainment & Music",
    8: "Family & Relationships",
    9: "Politics & Government"
}

data_copy["topic_name"] = data_copy["topic"].map(topic_labels)
data_copy.head(3)

Unnamed: 0,id,topic,question_title,question_content,best_answer,topic_name
0,0,4,why doesn't an optical mouse work on a glass t...,or even on some surfaces?,Optical mice use an LED and a camera to rapidl...,Computers & Internet
1,1,5,What is the best off-road motorcycle trail ?,long-distance trail throughout CA,i hear that the mojave road is amazing!<br />\...,Sports
2,2,2,What is Trans Fat? How to reduce that?,I heard that tras fat is bad for the body. Wh...,Trans fats occur in manufactured foods during ...,Health


In [5]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)

    tokens = word_tokenize(text)

    tokens = [
        lemmatizer.lemmatize(t, pos="v") 
        for t in tokens
        if t not in stop_words
    ]
    return tokens

sample = "This is, perhaps, the best example! But doesn't seem perfect..."
print(clean_text(sample))


['perhaps', 'best', 'example', 'doesnt', 'seem', 'perfect']


In [6]:
data_copy["clean_question_title"] = data_copy["question_title"].apply(
    lambda text: " ".join(clean_text(text))
)

data_copy["clean_question_content"] = data_copy["question_content"].apply(
    lambda text: " ".join(clean_text(text))
)

In [10]:
data_copy.clean_question_title

0                     doesnt optical mouse work glass table
1                             best offroad motorcycle trail
2                                          trans fat reduce
3                                           many plan fedex
4                san francisco bay area make sense rent buy
                                ...                        
699995                                 18th century ad mean
699996    think theres double standard short women look ...
699997                                       exactly savant
699998                                    confuseddoes like
699999    isnt time impeach george bush leadership arise...
Name: clean_question_title, Length: 700000, dtype: object

In [11]:
data_copy.clean_question_content

0                                              even surface
1                          longdistance trail throughout ca
2                    hear tras fat bad body find daily food
3                                hear largest airline world
4         price rent price buy make sense mostly rent co...
                                ...                        
699995            get date gunpowder invent need know 1700s
699996    many 52 male sex symblos know remember prince ...
699997                                                     
699998    like menok im teen highskoll n crush guy while...
699999                                                     
Name: clean_question_content, Length: 700000, dtype: object

In [9]:
data_copy_processed = data_copy.copy()
data_copy_processed.to_parquet("../data/data_preprocessed.parquet", index=False)