In [1]:
import json
import os
import pickle
from os import getcwd

import numpy as np
import pandas as pd

In [2]:
from sklearn import preprocessing
import re
from textblob import Word, TextBlob
from string import punctuation as pn
from nltk.stem.snowball import SnowballStemmer
from gensim.parsing.preprocessing import STOPWORDS

In [3]:
# Encoding categorical variable (venue)
def encode_venues(df_train):
    """Replace the ``venue`` column of ``df_train`` with integer codes, in place.

    A new ``LabelEncoder`` is fitted on the frame's own ``venue`` values, the
    resulting codes are stored in a new ``venues_le`` column, and the original
    ``venue`` column is removed from the frame.

    Args:
        df_train: DataFrame containing a ``venue`` column. It is modified in
            place.

    Returns:
        None.

    NOTE(review): the encoder is fitted on every call and then discarded, so
    the codes produced for two different frames are not guaranteed to agree.
    """
    labelencoder = preprocessing.LabelEncoder()
    df_train['venues_le'] = labelencoder.fit_transform(df_train['venue'].tolist())
    # Drop in place: assigning the result of drop() to the local name would
    # leave the caller's frame untouched and the column would survive.
    df_train.drop(columns=["venue"], inplace=True)


def process_row(row):
    """Normalise one text field (a title or an abstract) into stemmed tokens.

    Steps, in order:
      1. Blank out address-like tokens: first an ``@`` token followed by
         ``com``, then the part before ``@`` (plus the ``@`` and one following
         character) of any token that still contains ``@``.
      2. Remove punctuation characters and lower-case everything.
      3. Tokenise with TextBlob, discard gensim stop words and lemmatize each
         remaining word.
      4. Keep only words longer than two characters and Snowball-stem them.
      5. Collapse runs of whitespace into single spaces.

    Args:
        row: The raw text as a ``str``.

    Returns:
        The cleaned text: stemmed tokens separated by single spaces.
    """
    # All patterns are raw strings: sequences such as \S or \s inside a plain
    # literal are invalid string escapes and trigger a warning on recent
    # Python versions. The compiled patterns themselves are unchanged.

    # Deleting email:
    row = re.sub(r'(\S+@\S+)(com|\s+com)', ' ', row)
    # Deleting username:
    row = re.sub(r'(\S+@\S)', ' ', row)

    # Punctuation & lower case: ASCII punctuation plus a few typographic marks.
    punctuation = pn + '—“,”‘-’'
    row = ''.join(char.lower() for char in row if char not in punctuation)

    # Erasing stop words and lemmatizing (the original note describes this as
    # converting plurals into singular); TextBlob handles the tokenisation.
    stop = STOPWORDS
    row = TextBlob(row)
    row = ' '.join(Word(word).lemmatize() for word in row.words if word not in stop)

    # Bring each word to its root form; the length filter is applied to the
    # word before stemming.
    stemmer = SnowballStemmer('english')
    row = ' '.join([stemmer.stem(word) for word in row.split() if len(word) > 2])

    # Erase extra white space
    row = re.sub(r'\s{1,}', ' ', row)

    return row



In [4]:
def set_path():
    """Change the working directory to the parent of the current one.

    The relative paths used in this notebook (``data/processed/...``,
    ``code/...``) are opened after this call, so they resolve against that
    parent directory.

    NOTE: this is not idempotent. Every call climbs one more level, so it
    should run exactly once per kernel session.
    """
    parent_dir = os.path.dirname(os.getcwd())
    os.chdir(parent_dir)
   
# Switch the working directory before any relative path below is opened.
set_path()

In [5]:
# Open raw data
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df_train = pd.read_pickle("data/processed/dirty_df.pkl")
# Display the (rows, columns) size of the raw frame.
df_train.shape


(12129, 7)

### Delete duplicates

In [6]:
# Row positions (0-based, in df_train order) removed in this de-duplication step.
duplicate_positions = [10230, 12039, 11910, 10221, 10559, 7177, 5839, 2070, 2459, 2612]
# Translate the positions into index labels, then drop those labels.
df_cleaned = df_train.drop(index=df_train.index[duplicate_positions])

In [7]:
# Check for rows whose title contains "Publications Received".
has_publications_received = df_cleaned['title'].str.contains("Publications Received")
dubbel = df_cleaned.loc[has_publications_received]
dubbel

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue


In [8]:
# Rows whose title repeats the title of an earlier row.
repeated_title = df_cleaned['title'].duplicated()
df_cleaned[repeated_title]

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue


In [9]:
# Confirm the frame size after the rows above were dropped.
df_cleaned.shape

(12119, 7)

### Process text, re-label authors, drop unnecessary columns

##### Processing texts

In [10]:
# Encode the venue column, then normalise both free-text columns.
encode_venues(df_cleaned)
for text_column in ('title', 'abstract'):
    df_cleaned[text_column] = df_cleaned[text_column].apply(process_row)


In [11]:
def mergingtext(df):
    """Add a ``content`` column holding ``title + ' ' + abstract`` per row.

    Args:
        df: DataFrame with string columns ``title`` and ``abstract``. It is
            modified in place.

    Returns:
        None.
    """
    # One vectorised concatenation instead of a Python loop that built a row
    # Series through ``iloc`` for every record. ``tolist()`` keeps the
    # assignment positional, exactly as when assigning a plain list.
    df['content'] = (df['title'] + ' ' + df['abstract']).tolist()

# Adds the combined 'content' column to df_cleaned in place.
mergingtext(df_cleaned)

##### Re-label author

### 

In [12]:
# Encode every authorId as an integer label and keep only the model inputs.
auth_le = preprocessing.LabelEncoder()
authid_enc = auth_le.fit_transform(df_cleaned['authorId'])
df_cleaned['authId_enc'] = authid_enc
# Keep the text and its label, renumbering the rows from 0.
df_cleaned = df_cleaned.loc[:, ['content', 'authId_enc']].reset_index(drop=True)
df_cleaned.head(3)

Unnamed: 0,content,authId_enc
0,detect linguist idiosyncrat interest autism di...,1571
1,bigram bilstm neural network sequenti metaphor...,1250
2,factual effici integr relev fact visual questi...,4134


In [13]:
# Persist the fitted author encoder so that encoded labels can be mapped back
# to the original authorId values.
with open("code/authorIdlabel.pkl", 'wb') as encoder_file:
    pickle.dump(auth_le, encoder_file)




##

### Splitting data

In [15]:
from collections import Counter
import random

In [16]:
# Select the authors that appear in at least 3 rows of the dataset.

count = Counter(df_cleaned['authId_enc'])
frequentAuthor = [author for author, n_rows in count.items() if n_rows >= 3]
len(frequentAuthor)



1702

In [17]:
# Hold out one row per frequent author for validation (the first row in which
# that author appears); every other row goes to training.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)  # fixed seed so the validation row order is reproducible
random.shuffle(frequentAuthor)

# A single pass records the first row position of every author, replacing a
# full scan of the frame per author.
first_row_of_author = {}
for position, author in enumerate(df_cleaned['authId_enc'].tolist()):
    first_row_of_author.setdefault(author, position)

val_id = [
    first_row_of_author[auth]
    for auth in frequentAuthor
    if auth in first_row_of_author
]

# Set membership keeps the complement computation linear in the row count.
held_out = set(val_id)
train_id = [i for i in df_cleaned.index if i not in held_out]


In [18]:
# Materialise both splits by row position and renumber each from 0.
train_df = df_cleaned.iloc[train_id].reset_index(drop=True)
val_df = df_cleaned.iloc[val_id].reset_index(drop=True)

In [19]:
# Display the (rows, columns) size of the training and validation frames.
train_df.shape, val_df.shape


((10417, 2), (1702, 2))

In [20]:
# Write the training and validation splits to data/processed/ as pickle files.
train_df.to_pickle("data/processed/train_clean_df.pkl")
val_df.to_pickle("data/processed/val_clean_df.pkl")

### Cleaning test data

In [21]:
# Load the raw test frame.
df_test = pd.read_pickle("data/processed/test_dirty_df.pkl")

# Apply the same cleaning steps used for the training frame: venue encoding,
# text normalisation of both text columns, then the combined 'content' column.
encode_venues(df_test)
for text_column in ('title', 'abstract'):
    df_test[text_column] = df_test[text_column].apply(process_row)
mergingtext(df_test)

# Write the cleaned frame back to the processed folder.
df_test.to_pickle("data/processed/test_clean_df.pkl")