In [1]:
# Input: raw tab-separated splits.
_RAW_DIR = "../data/raw"
TRAIN_PATH = _RAW_DIR + "/train.tsv"
VAL_PATH = _RAW_DIR + "/validation.tsv"
TEST_PATH = _RAW_DIR + "/test.tsv"

# Output: pickled DataFrames written by this notebook.
_PROCESSED_DIR = "../data/processed"
TRAIN_EXPORT_PATH = _PROCESSED_DIR + "/00_train_df.pkl"
VAL_EXPORT_PATH = _PROCESSED_DIR + "/00_validation_df.pkl"
TEST_EXPORT_PATH = _PROCESSED_DIR + "/00_test_df.pkl"

In [2]:
# load packages
import pandas as pd
import numpy as np

from ast import literal_eval
import re

import nltk
from nltk.corpus import stopwords

# Show full cell contents (no truncation) so whole titles are visible in displayed frames.
pd.options.display.max_colwidth = None

___
## Functions:

In [3]:
def read_data(file_path):
    """
    Read a tab-separated file into a DataFrame.

    When the file has a 'tags' column, every value in it is passed through
    ``literal_eval`` so that the column holds the evaluated Python objects
    instead of their string form. Files without a 'tags' column (for example
    an unlabeled split) are returned exactly as read.

    file_path: path or file-like object accepted by ``pd.read_csv``
    return: the loaded DataFrame
    """
    data = pd.read_csv(file_path, sep='\t')
    # Only labeled files carry tags; skip parsing instead of raising KeyError.
    if 'tags' in data.columns:
        data['tags'] = data['tags'].apply(literal_eval)
    return data

In [4]:
# Separator characters that are replaced by a space.
# Raw string: '\[', '\]' and '\|' are invalid escapes in a normal string literal.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character other than digits, lowercase letters, space, '#', '+' and '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stopword list from NLTK, kept as a set for fast membership tests.
STOPWORDS = set(stopwords.words("english"))

def text_prepare(text):
    """
    Clean a raw string for tokenization.

    Steps, in order: lowercase the text, replace every match of
    REPLACE_BY_SPACE_RE with a space, delete every match of BAD_SYMBOLS_RE,
    then drop tokens found in STOPWORDS.

    text: a string to process
    return: the remaining tokens joined by single spaces
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    cleaned = BAD_SYMBOLS_RE.sub('', spaced)
    # str.split() without arguments never yields empty tokens, so the stopword
    # membership test is the only filter needed.
    kept = [token for token in cleaned.split() if token not in STOPWORDS]
    return ' '.join(kept)

In [5]:
def text_stemmer(text):
    """
    Stem each whitespace-separated token of a string with NLTK's PorterStemmer.

    text: a string to process
    return: the stemmed tokens joined by single spaces
    """
    porter = nltk.stem.PorterStemmer()
    return " ".join(map(porter.stem, text.split()))

In [6]:
def text_lemmatizer(text):
    """
    Lemmatize each whitespace-separated token of a string with NLTK's
    WordNetLemmatizer (called with its default arguments).

    text: a string to process
    return: the lemmatized tokens joined by single spaces
    """
    wordnet = nltk.stem.WordNetLemmatizer()
    lemmatized = []
    for token in text.split():
        lemmatized.append(wordnet.lemmatize(token))
    return " ".join(lemmatized)

___
## Read data

In [7]:
# Labeled splits go through read_data so their 'tags' column is parsed.
raw_train, raw_val = map(read_data, (TRAIN_PATH, VAL_PATH))
# The test split is read directly with pandas.
raw_test = pd.read_csv(TEST_PATH, sep='\t')

In [8]:
# Peek at a few random labeled rows.
raw_train.sample(n=5)

Unnamed: 0,title,tags
41465,UnknownHostException accessing service on virtualBox,"[java, linux]"
797,Relationship/Node Charts/Graphs... Using Javascript or something else?,[javascript]
75481,Padding a list in python with particular value,"[python, list]"
45237,How to Integrate Paypal IPN for Custom Shopping Cart without using Buy Now Buttons,[php]
50923,Create a dynamic website with Java,"[java, html]"


In [9]:
# Peek at a few random test rows.
raw_test.sample(n=5)

Unnamed: 0,title
17841,JAVA Linked List Confused on why I can't check a variable against an node in the Linked List?
17241,Sorting a collection of HashSets
14445,how to split json format string in order to Deserialize is into .net object?
1925,sorting csv data by column C#
17124,Unable to install requests with pip in virtual ennvironment built using pyvenv-3.4


___
## Prepare title column:
One of the best-known difficulties when working with natural-language data is that it is unstructured. For example, if we used the titles "as is" and extracted tokens just by splitting them on whitespace, we would get many odd tokens such as `3.5?` or `"flip`. To prevent such problems, it is usually helpful to clean the text first. We are going to use regular expressions to replace separator characters with spaces and to remove the remaining unwanted symbols.

In [10]:
# np.random.randint excludes the upper bound, so len(raw_train) (not len + 1)
# keeps the draw inside the valid positions 0 .. len(raw_train) - 1.
sample = np.random.randint(0, len(raw_train))
raw_title = raw_train['title'].values[sample]
print("Before:")
print(raw_title)
print()
print("After:")
print(text_prepare(raw_title))

Before:
How to use lombok.Data annotation in a Spring Boot application?

After:
use lombokdata annotation spring boot application


**Now preprocess title columns to acceptable form**

In [11]:
# Work on copies so the raw frames stay available for before/after comparisons.
train_df, val_df, test_df = (frame.copy() for frame in (raw_train, raw_val, raw_test))

In [12]:
# Apply the same cleaning to the title column of every split.
for split_df in (train_df, val_df, test_df):
    split_df['title'] = split_df['title'].apply(text_prepare)

**Now a quick sanity check**

In [13]:
# np.random.randint excludes the upper bound, so len(raw_train) (not len + 1)
# keeps the draw inside the valid positions 0 .. len(raw_train) - 1.
sample = np.random.randint(0, len(raw_train))
# .iloc matches the positional draw regardless of how the frames are indexed.
pd.DataFrame([raw_train['title'].iloc[sample],
              train_df['title'].iloc[sample]],
             index=["raw", "processed"], columns=[f'sample ({sample})'])

Unnamed: 0,sample (15250)
raw,pandas dataframe to dictionary value
processed,pandas dataframe dictionary value


## Export data

In [14]:
# Persist each processed split to its export path.
for frame, export_path in (
    (train_df, TRAIN_EXPORT_PATH),
    (val_df, VAL_EXPORT_PATH),
    (test_df, TEST_EXPORT_PATH),
):
    frame.to_pickle(export_path)

___
## In case we want to normalize our text using stemming or lemmatizing:

In [15]:
# Preview stemming on the first 20 processed titles.
train_df['title'].iloc[:20].apply(text_stemmer)

0                                                                     draw stack dotplot r
1                                      mysql select record datetim field less specifi valu
2                                                               termin window phone 81 app
3                                               get current time specif countri via jqueri
4                                                                  configur tomcat use ssl
5                                awesom nest set plugin add new children tree variou level
6                                                       creat map json respons rubi rail 3
7                                                                   rspec test method call
8                                                      springboot catalina lifecycl except
9                                                  import data excel mysql databas use php
10    obtain object javalangclasst object parameter type without construct class q_uestion

In [16]:
# Preview lemmatization on the first 20 processed titles.
train_df['title'].iloc[:20].apply(text_lemmatizer)

0                                                                             draw stacked dotplot r
1                                              mysql select record datetime field le specified value
2                                                                      terminate window phone 81 app
3                                                       get current time specific country via jquery
4                                                                         configuring tomcat use ssl
5                                         awesome nested set plugin add new child tree various level
6                                                               create map json response ruby rail 3
7                                                                           rspec test method called
8                                                            springboot catalina lifecycle exception
9                                                         import data excel mysql database 