In [1]:
import json
import numpy as np
from os import getcwd
import pandas as pd
import pickle

In [2]:
from sklearn import preprocessing
import re
from textblob import Word, TextBlob
from string import punctuation as pn
from nltk.stem.snowball import SnowballStemmer
from gensim.parsing.preprocessing import STOPWORDS

In [3]:
# Encoding categorical variable (venue)
def encode_venues(df_train):
    """Replace the string ``venue`` column with an integer ``venues_le`` column.

    A new ``LabelEncoder`` is fit on the ``venue`` values of the frame passed in
    and is not returned, so the integer codes produced by two separate calls
    (for example on a training and a test frame) only correspond when both
    frames contain exactly the same set of venues.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing a ``venue`` column. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``venue`` and with the encoded ``venues_le`` column.
    """
    labelencoder = preprocessing.LabelEncoder()
    encoded_labels_venue = labelencoder.fit_transform(df_train['venue'].tolist())

    # Build the result on a new frame instead of writing the new column into
    # the caller's object before dropping `venue` from a copy of it.
    return df_train.drop(columns=['venue']).assign(venues_le=encoded_labels_venue)

def process_row(row):
    """Normalise one text field into a lower-case, stemmed, space-separated string.

    Steps, in order: remove e-mail-like fragments, strip punctuation and
    lower-case, drop stop words and lemmatize, stem, and discard tokens of
    two characters or fewer.

    Parameters
    ----------
    row : str
        Raw text (the notebook applies this to ``title`` and ``abstract``).

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Raw string literals: "\S" and "\s" are invalid escape sequences in a
    # normal string literal and trigger a warning on current Python versions.

    # Deleting email: a run of non-space characters containing '@' that ends
    # in "com" (optionally with whitespace before the "com").
    row = re.sub(r'(\S+@\S+)(com|\s+com)', ' ', row)
    # Deleting username: removes "name@" plus ONE following character.
    # NOTE(review): anything after that first character (e.g. the rest of a
    # non-".com" domain) is kept -- confirm this is intended.
    row = re.sub(r'(\S+@\S)', ' ', row)

    # Punctuation & lower case (ASCII punctuation plus typographic quotes/dashes)
    punctuation = pn + '—“,”‘-’'
    row = ''.join(char.lower() for char in row if char not in punctuation)

    # Drop stop words and lemmatize each remaining word
    stop = STOPWORDS
    row = TextBlob(row)
    row = ' '.join(Word(word).lemmatize() for word in row.words if word not in stop)

    # Bring each word to its root form; tokens of length <= 2 are discarded
    # (the length check is applied to the token before stemming)
    stemmer = SnowballStemmer('english')
    row = ' '.join([stemmer.stem(word) for word in row.split() if len(word) > 2])

    # Collapse any run of whitespace into a single space
    row = re.sub(r'\s{1,}', ' ', row)

    return row



In [4]:
def set_path():
    """Change the working directory to the parent of the current one.

    The relative paths used later in this notebook ("data/processed/...",
    "code/...") are resolved against the directory this function switches to.

    Note: the move is relative to the current working directory, so every
    additional call goes up one more level.
    """
    # Only `getcwd` is imported from `os` at file level; the module itself is
    # needed for `os.path` and `os.chdir`.
    import os

    parent_dir = os.path.dirname(os.getcwd())
    os.chdir(parent_dir)
# Run this cell once per kernel session: each call moves the working
# directory up one more level from wherever it currently is.
set_path()

In [5]:


# Load the raw (uncleaned) training frame
df_train = pd.read_pickle("data/processed/dirty_df.pkl")

# Encode the venue column, then normalise both free-text columns
df_train = encode_venues(df_train)
for text_col in ('title', 'abstract'):
    df_train[text_col] = df_train[text_col].apply(process_row)




In [6]:
# Merging title and abstract
def mergingtext(df, sep=" "):
    """Add a ``content`` column holding title and abstract joined by ``sep``.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``title`` and ``abstract``.
    sep : str, default " "
        Text placed between title and abstract. Without a separator the last
        word of the title and the first word of the abstract are fused into a
        single token; pass ``sep=""`` to get that plain concatenation.
    """
    # Column-wise concatenation instead of building the list row by row.
    df['content'] = df['title'] + sep + df['abstract']
 
   


In [7]:
# Add the merged 'content' column in place, then keep an independent copy
# restricted to the columns listed below.
mergingtext(df_train)
kept_columns = ['title', 'abstract', 'content', 'year', 'venues_le', 'authorId']
df_train = df_train[kept_columns].copy()

In [9]:
# Encode author ids as integers; the raw 'authorId' column is removed afterwards
auth_le = preprocessing.LabelEncoder()
authid_enc = auth_le.fit_transform(df_train['authorId'])
df_train['authId_enc'] = authid_enc
del df_train['authorId']

# Persist the fitted encoder so the integer codes can be mapped back later
with open("code/authorIdlabel.pkl", 'wb') as encoder_file:
    pickle.dump(auth_le, encoder_file)

##

### Splitting data

In [11]:
from collections import Counter
import random

In [12]:
# Splitting: hold out the first paper of each of (at most) N_VAL_AUTHORS
# randomly chosen authors that have at least MIN_PAPERS_PER_AUTHOR papers.
MIN_PAPERS_PER_AUTHOR = 3
N_VAL_AUTHORS = 500
SPLIT_SEED = 42  # fixed seed so the split is the same on every run

# Make index labels equal to row positions, so the ids below mean the same
# thing whether they are used with .iloc or with .loc.
df_train = df_train.reset_index(drop=True)

count = Counter(df_train['authId_enc'])
frequentAuthor = [auth for auth, n_papers in count.items()
                  if n_papers >= MIN_PAPERS_PER_AUTHOR]
print(len(frequentAuthor))
# A private generator keeps the global `random` state untouched.
random.Random(SPLIT_SEED).shuffle(frequentAuthor)

# One pass over the column: position of the first row of every author.
first_row_of_author = {}
for pos, auth in enumerate(df_train['authId_enc']):
    first_row_of_author.setdefault(auth, pos)

val_id = [first_row_of_author[auth] for auth in frequentAuthor[:N_VAL_AUTHORS]]
held_out = set(val_id)  # set membership instead of scanning a list per row
train_id = [pos for pos in range(len(df_train)) if pos not in held_out]




1705


In [13]:
# Number of rows selected for the validation set
len(val_id)

500

In [14]:

# val_id contains row positions (not index labels), so the validation rows
# are selected positionally with .iloc as well. Each split gets a fresh
# 0..n-1 index.
train_clean_df = df_train.iloc[train_id].reset_index(drop=True)
val_clean_df = df_train.iloc[val_id].reset_index(drop=True)

In [15]:
# Write both splits to the processed-data folder
for split_df, split_path in (
    (train_clean_df, "data/processed/train_clean_df.pkl"),
    (val_clean_df, "data/processed/val_clean_df.pkl"),
):
    split_df.to_pickle(split_path)

### Cleaning test data

In [16]:
# Load the raw (uncleaned) test frame
df_test = pd.read_pickle("data/processed/test_dirty_df.pkl")

# Apply the same cleaning steps as for the training frame.
# NOTE(review): encode_venues fits a new encoder on this frame, so the
# 'venues_le' codes here may not match the codes of the training frame --
# confirm whether a shared encoder is required.
df_test = encode_venues(df_test)
for text_col in ('title', 'abstract'):
    df_test[text_col] = df_test[text_col].apply(process_row)

mergingtext(df_test)

# Write the cleaned test frame back to the processed folder
df_test.to_pickle("data/processed/test_clean_df.pkl")