In [2]:
import json
import os
from os import getcwd

import numpy as np
import pandas as pd

In [3]:
from sklearn import preprocessing
import re
from textblob import Word, TextBlob
from string import punctuation as pn
from nltk.stem.snowball import SnowballStemmer
from gensim.parsing.preprocessing import STOPWORDS

In [4]:
# Encoding categorical variable (venue)
def encode_venues(df_train):
    """Replace the ``venue`` column with an integer-encoded ``venues_le`` column.

    Every value of ``venue`` is mapped to an integer label by scikit-learn's
    ``LabelEncoder``. The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing a ``venue`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``venue`` and with ``venues_le`` holding the
        encoded labels.

    Raises
    ------
    KeyError
        If ``df_train`` has no ``venue`` column.
    """
    encoder = preprocessing.LabelEncoder()
    venue_codes = encoder.fit_transform(df_train['venue'].tolist())
    # assign() builds a new frame, so the caller's DataFrame is never mutated
    # (the previous in-place column write leaked ``venues_le`` into the input).
    return df_train.assign(venues_le=venue_codes).drop(columns=['venue'])

def process_row(row):
    """Normalize one free-text field into a string of stemmed tokens.

    Steps, in order:

    1. remove e-mail addresses and any other token containing ``@``;
    2. delete punctuation characters and lowercase the text;
    3. drop gensim stopwords and lemmatize each remaining word with TextBlob;
    4. discard words of one or two characters and stem the rest with the
       English Snowball stemmer;
    5. collapse runs of whitespace into single spaces.

    Parameters
    ----------
    row : str
        Raw text.

    Returns
    -------
    str
        Space-separated, lowercased, stemmed tokens.
    """
    # E-mail addresses ending in "com", including ones where whitespace was
    # inserted before "com" ("name@host. com"). The trailing \b stops the
    # pattern from swallowing the start of a following word ("... computer").
    row = re.sub(r'\S+@\S+(?:com|\s+com)\b', ' ', row)
    # Any remaining token containing "@": remove it whole so that no fragment
    # of the host part is left behind in the text.
    row = re.sub(r'\S+@\S+', ' ', row)

    # Punctuation is deleted (not replaced by a space) and text is lowercased.
    strip_chars = set(pn + '—“,”‘-’')
    row = ''.join(char.lower() for char in row if char not in strip_chars)

    # Tokenize with TextBlob, drop stopwords, lemmatize what is left.
    blob = TextBlob(row)
    row = ' '.join(
        Word(word).lemmatize() for word in blob.words if word not in STOPWORDS
    )

    # Reduce words to their stem; the length filter is applied before stemming.
    stemmer = SnowballStemmer('english')
    row = ' '.join(stemmer.stem(word) for word in row.split() if len(word) > 2)

    # Collapse any remaining runs of whitespace.
    row = re.sub(r'\s+', ' ', row)

    return row



In [5]:
# Move to the parent of the current working directory so that the relative
# "data/..." paths below resolve.
# NOTE: every execution of this cell moves up one more level, so run it only
# once per kernel session (Restart Kernel -> Run All is safe).
abspath = getcwd()
dname = os.path.dirname(abspath)
os.chdir(dname)

# Input / output locations, relative to the directory chosen above
RAW_PICKLE = os.path.join("data", "processed", "dirty_df.pkl")
CLEAN_PICKLE = os.path.join("data", "processed", "clean_df.pkl")

# Open raw data (unpickling can execute arbitrary code: only load trusted files)
df_train = pd.read_pickle(RAW_PICKLE)

# Call cleaning functions
df_train = encode_venues(df_train)
df_train = df_train.assign(
    title=df_train['title'].apply(process_row),
    abstract=df_train['abstract'].apply(process_row),
)

# Write back to processed folder
df_train.to_pickle(CLEAN_PICKLE)