In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from tqdm import tqdm
import itertools
import nltk
from nltk.corpus import stopwords
from fastai.text.all import *

In [2]:
# Load the training CSV (SDP_train.csv) from the ire-major-project input folder.
data = pd.read_csv('../input/ire-major-project/SDP_train.csv')

In [3]:
# Display the (rows, columns) size of the training frame.
data.shape

In [4]:
# Preview the first rows of the training frame.
data.head()

In [5]:
# data['new_context'] = data['citing_title'] +'. ' + data['cited_title'] + '. ' + data['citation_context']

In [6]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
 
# Module-level lemmatizer shared by Preprocessing.lemmatize.
lemmatizer = WordNetLemmatizer()
# One-time NLTK resource downloads; uncomment if the corpora are not installed.
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('punkt')
# NOTE: this rebinds `stopwords` from the imported nltk corpus object to the
# English stop-word list it returns; Preprocessing.remove_stopwords reads this
# list. The cell stays re-runnable only because the import above re-binds the
# corpus object first.
stopwords = stopwords.words('english')

class Preprocessing():
    """Text-cleaning pipeline applied to a single citation context.

    ``preprocess`` runs these steps in order: drop the ``#AUTHOR_TAG``
    placeholder, strip parenthesised spans, tokenize, remove English stop
    words and lemmatize. The class relies on the module-level
    ``word_tokenize``, ``stopwords`` and ``lemmatizer`` names.
    """

    def __init__(self):
        pass

    def remove_brackets(self, text):
        """Return ``text`` with every ``(...)`` span removed.

        The pattern matches from an opening parenthesis to the first
        closing one, so nested parentheses are only stripped up to the
        innermost ``)``.
        """
        return re.sub(r'\([^)]*\)', '', text)

    def tokenize_text(self, text):
        """Split ``text`` into a list of tokens with ``word_tokenize``."""
        return word_tokenize(text)

    def remove_stopwords(self, text):
        """Return the tokens of ``text`` that are not stop words.

        The comparison is case-insensitive because the stop-word list is
        lower-case; without lowering, capitalised stop words such as a
        sentence-initial "The" would be kept. Kept tokens retain their
        original casing.
        """
        # Build a set once per call so each membership test is O(1)
        # instead of a scan over the whole stop-word list.
        stop_set = set(stopwords)
        return [word for word in text if word.lower() not in stop_set]

    def lemmatize(self, text):
        """Return the tokens of ``text`` passed through the shared lemmatizer."""
        return [lemmatizer.lemmatize(word) for word in text]

    def preprocess(self, text):
        """Clean one raw citation context and return it as a single string.

        Args:
            text: the raw context. Non-string values (e.g. ``None`` or the
                ``NaN`` pandas uses for an empty CSV cell) are treated as
                empty text.

        Returns:
            The cleaned tokens joined by single spaces; ``''`` when
            ``text`` is not a string.
        """
        # Missing cells arrive as float NaN / None; str methods would raise.
        if not isinstance(text, str):
            return ''

        text = text.replace('#AUTHOR_TAG', ' ')
        text = self.remove_brackets(text)
        tokenized_text = self.tokenize_text(text)
        filtered_text = self.remove_stopwords(tokenized_text)
        final_text = self.lemmatize(filtered_text)

        return ' '.join(final_text)

In [7]:
# Cleaned text for every training row, in row order.
preprocessed_data = []
preprocessor = Preprocessing()

# Iterate the text column directly rather than data.iterrows(): only this
# column is needed, and tqdm can read the Series length, so the progress bar
# shows a total and an ETA (a bare generator gives it neither).
for context in tqdm(data['citation_context']):
    final_text = preprocessor.preprocess(context)
    preprocessed_data.append(final_text)

In [8]:
# Store the cleaned text of each row. The full list must be assigned here:
# assigning the loop variable `final_text` would broadcast the last row's
# string to every row, so every training example would carry the same text.
data['input'] = preprocessed_data
# Persist the frame (without the index column) for the tokenization step.
data.to_csv('train_final.csv', index=False)

In [53]:
# Re-read the cleaned training file and tokenize its 'input' column.
# tokenize_df returns the tokenized frame plus a second value (`count`) that
# is fed to make_vocab below — presumably token counts; confirm in fastai docs.
train_final_df = pd.read_csv('train_final.csv')
df_tok, count = tokenize_df(train_final_df, ['input'])

# Language-model dataloaders over the tokenized 'text' column.
lm_vocab = make_vocab(count)
data_lm = TextDataLoaders.from_df(
    df_tok,
    path='./',
    vocab=lm_vocab,
    text_col='text',
    is_lm=True,
)

In [54]:
# Build a language-model learner on the LM dataloaders using the AWD_LSTM
# architecture, with drop_mult=0.3.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)
# Run the learning-rate finder to help pick the rate used for training.
learn.lr_find()
# learn.recorder.plot()

In [55]:
# First language-model training run: 10 epochs of one-cycle training with
# 2e-3 as the learning-rate argument. unfreeze() has not been called yet at
# this point.
learn.fit_one_cycle(10, 2e-3, moms=(0.8,0.7,0.8))

In [56]:
# Unfreeze the language model and train it for 3 more epochs at the same rate.
learn.unfreeze()
learn.fit_one_cycle(3, 2e-3, moms=(0.8,0.7,0.8))

In [57]:
# Save the language model's encoder as 'fine_tuned_enc'; the classifier loads
# it by this name later.
learn.save_encoder('fine_tuned_enc')

In [58]:
# Classification dataloaders over the same tokenized frame, built with the
# same make_vocab(count) call as the language-model stage; labels come from
# the 'citation_influence_label' column.
clas_vocab = make_vocab(count)
dls_clas = TextDataLoaders.from_df(
    df_tok,
    path='.',
    vocab=clas_vocab,
    text_col='text',
    label_col='citation_influence_label',
)

# AWD_LSTM text classifier reporting accuracy; note this rebinds `learn`.
learn = text_classifier_learner(dls_clas, AWD_LSTM, metrics=accuracy)

In [59]:
# Load the encoder saved as 'fine_tuned_enc' by the language-model stage.
learn.load_encoder('fine_tuned_enc')

In [60]:
# Run the learning-rate finder on the classifier before its first training run.
learn.lr_find()

In [61]:
# First classifier training run: 5 epochs at 3e-3, before any
# freeze_to()/unfreeze() call on this learner.
learn.fit_one_cycle(5, 3e-3, moms=(0.8,0.7,0.8))

In [62]:
# Gradual unfreezing, step 1: freeze_to(-2) leaves the last two parameter
# groups trainable; train 2 epochs.
learn.freeze_to(-2)
learn.fit_one_cycle(2, 3e-3, moms=(0.8,0.7,0.8))

In [63]:
# Gradual unfreezing, step 2: freeze_to(-3) leaves the last three parameter
# groups trainable; train 2 more epochs.
learn.freeze_to(-3)
learn.fit_one_cycle(2, 3e-3, moms=(0.8,0.7,0.8))

In [64]:
# Unfreeze the whole classifier and rerun the learning-rate finder before the
# final training pass.
learn.unfreeze()
learn.lr_find()

In [65]:
# Final pass on the fully unfrozen classifier: 1 epoch at the lower rate 1e-3.
learn.fit_one_cycle(1, 1e-3, moms=(0.8,0.7,0.8))
# learn.save('fwd_clas')

In [66]:
# Point fastai's default device at the CPU before exporting.
# NOTE(review): presumably so the exported learner can be used without a GPU — confirm.
defaults.device = torch.device('cpu')
# Put the underlying model in evaluation mode.
learn.model.eval()
# Export the learner with the default file name; the next cell loads 'export.pkl'.
learn.export()

In [67]:
# Reload the exported learner for inference. learn.export() wrote
# 'export.pkl' under the learner's path, which is '.' (the path passed to the
# classifier dataloaders), so a relative path finds it in any environment
# instead of only under /kaggle/working.
# NOTE: load_learner unpickles the file — only load exports you trust.
learn = load_learner('export.pkl')

In [68]:
# Load the test CSV (SDP_test.csv). This rebinds `data`, which held the
# training frame up to this point.
data = pd.read_csv('../input/ire-major-project/SDP_test.csv')

In [69]:
# Preview the first two rows of the test frame.
data.head(2)

In [73]:
# One integer label per test row: clean the citation context with the same
# preprocessor used for training, take the first element of learn.predict's
# result and cast it to int.
res = [
    int(learn.predict(preprocessor.preprocess(context))[0])
    for context in data['citation_context']
]

In [74]:
# Submission frame: the test ids next to the predicted labels, in row order.
submission = pd.DataFrame({
    'unique_id': data['unique_id'],
    'citation_influence_label': res,
})

In [75]:
# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Shell escape: print the NVIDIA GPU status for this session.
!nvidia-smi