# CS Research Paper Recommender

In this project, I will develop a recommender system that suggests CS research papers based on the similarity between the input text and the research papers in the dataset.

In [1]:
# Mount Google Drive into the Colab runtime so that files stored on the drive
# (such as the dataset CSVs) are reachable under /content/drive/.
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [2]:
import numpy as np  # Importing necessary libraries
import pandas as pd
import re
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from datetime import datetime
import nltk
from sklearn.metrics.pairwise import *
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from numba import jit
nltk.download("stopwords")
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Load both datasets from the mounted Google Drive into pandas DataFrames.
# The `id` column holds dotted identifiers such as 2011.04896. Left to type
# inference, pandas parses these as floats, so an id ending in zero
# (e.g. 2011.04890) would silently lose digits and could no longer be used to
# look the paper up. Reading the column as text keeps every id intact.
data_info = pd.read_csv('/content/drive/My Drive/data_info.csv', dtype={"id": str})
data = pd.read_csv("/content/drive/My Drive/data_text.csv", dtype={"id": str})

In [5]:
# Preview the metadata table (id, title, abstract, year, authors).
# This dataset holds the information needed to identify and access each research
# paper, so it will be useful when deploying the recommender system.
data_info.head()

Unnamed: 0,id,title,abstract,year,authors
0,2011.04896,Generalized LSTM-based End-to-End Text-Indepen...,The increasing amount of available data and ...,2020,Soroosh Tayebi Arasteh
1,2011.04862,On Efficient and Robust Metrics for RANSAC Hyp...,This paper focuses on developing efficient a...,2020,"Jiaqi Yang, Zhiqiang Huang, Siwen Quan, Qian Z..."
2,2011.04823,Language Through a Prism: A Spectral Approach ...,Language exhibits structure at different sca...,2020,"Alex Tamkin, Dan Jurafsky, Noah Goodman"
3,2011.04825,Multi-Agent Active Search using Realistic Dept...,The search for objects of interest in an unk...,2020,"Ramina Ghods, William J. Durkin, Jeff Schneider"
4,2011.04891,Dynamic Relay Selection and Power Allocation f...,Cooperative communication is an effective ap...,2020,"Yuanzhe Geng, Erwu Liu, Rui Wang, and Yiming Liu"


In [7]:
# Preview the text table: for each paper id it holds the preprocessed text and
# the stemmed text required to build the recommendation system.
data.head()

Unnamed: 0,id,preprocessed text,stemmed_text
0,2011.04896,generalized lstm based text independent speake...,general lstm base text independ speaker verif ...
1,2011.04862,efficient robust metrics ransac hypotheses rig...,effici robust metric ransac hypothes rigid reg...
2,2011.04823,language prism spectral approach multiscale la...,languag prism spectral approach multiscal lang...
3,2011.04825,agent active search using realistic depth awar...,agent activ search use realist depth awar nois...
4,2011.04891,dynamic relay selection power allocation minim...,dynam relay select power alloc minim outag pro...


In [4]:
# Vectorize the documents using TF-IDF. The fitted `tfidf` object is kept so that
# query text can later be transformed with the same vocabulary and weights.
tfidf = TfidfVectorizer()
vectors_tfidf = tfidf.fit_transform(data['stemmed_text'])
vectors_tfidf.shape  # (number of documents, vocabulary size)

(99998, 67362)

In [None]:
# Reduce dimensionality with truncated SVD: every TF-IDF vector is projected onto
# 3000 latent components (linear combinations of the original terms, not a subset
# of them), which shortens the time taken to calculate similarity.
# NOTE: despite its name, `pca` is a TruncatedSVD model; the name is kept because
# other cells refer to it.
# random_state is pinned because the default randomized solver would otherwise
# produce a slightly different projection on every run, making the
# recommendations non-reproducible.
pca = TruncatedSVD(n_components=3000, random_state=42)
vectors = pca.fit_transform(vectors_tfidf)
print(vectors.shape)

(99998, 3000)


In [None]:
def decontracted(phrase):
    '''
    Expand apostrophe contractions in ``phrase`` into their full wording.

    The two irregular forms ("won't", "can't") are rewritten first. The
    generic suffix rules are then applied one after another, each working
    on the output of the previous rule, so their order matters.
    '''
    rewrite_rules = (
        # irregular contractions
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # generic suffixes
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = phrase
    for pattern, replacement in rewrite_rules:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded


def text_to_vector(sentance):
    """
    Clean raw text and convert it to the final required TF-IDF vector.

    Processing steps, in order:
      1. remove URLs (anything starting with "http"),
      2. strip HTML markup,
      3. expand contractions via ``decontracted``,
      4. drop every whitespace-delimited token that contains a digit,
      5. replace runs of non-letters with a single space,
      6. lowercase the words and remove stopwords,
      7. stem each remaining word with the English Snowball stemmer,
      8. transform the result with the already fitted global ``tfidf``.

    Parameters
    ----------
    sentance : str
        Raw query text (for example a paper title followed by its abstract).

    Returns
    -------
    The output of ``tfidf.transform`` for this single document (one row).
    """
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    # Raw string: "\S" and "\d" are regex classes, not Python string escapes.
    sentance = re.sub(r"\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)

    # Build the stopword lookup once per call. Evaluating stopwords.words()
    # inside the filter would reload the word lists for every single token.
    # It is deliberately called without a language so the same (all-language)
    # stopword collection is used as before.
    stop_words = set(stopwords.words())
    stemmer = SnowballStemmer("english")

    stemmed_text = []
    for word in sentance.split():
        word = word.lower()
        if word in stop_words:
            continue
        stemmed_text.append(stemmer.stem(word))

    sentance = " ".join(stemmed_text)
    sentance = tfidf.transform([sentance])
    return sentance


def recommendations(sentance, vectors_tfidf, top_n=10):
    """
    Print the titles of the papers most similar to the given text.

    The text is vectorized with ``text_to_vector``, projected with the fitted
    global ``pca`` model, and compared with every row of ``vectors_tfidf``
    by cosine distance. Titles are read from the global ``data_info`` by row
    position, so this assumes row ``i`` of ``vectors_tfidf`` and row ``i`` of
    ``data_info`` describe the same paper (verify if the inputs are reordered).

    Parameters
    ----------
    sentance : str
        Raw query text.
    vectors_tfidf : 2-D dense array, one row per paper
        Document vectors in the same reduced space that ``pca.transform``
        produces. Despite the parameter name, these are the reduced vectors,
        not the raw TF-IDF matrix.
    top_n : int, optional
        Number of titles to print (default 10, the previous fixed value).

    Returns
    -------
    None. The titles are printed, closest first.
    """
    query = text_to_vector(sentance)
    query = np.asarray(pca.transform(query)).ravel()
    doc_vectors = np.asarray(vectors_tfidf)

    # Cosine similarity of the query against all documents in one vectorised
    # pass, instead of one library call per document. The width is taken from
    # the data rather than hard-coded, and einsum computes the squared row
    # norms without allocating a second full-size matrix.
    dots = doc_vectors @ query
    doc_norms = np.sqrt(np.einsum("ij,ij->i", doc_vectors, doc_vectors))
    denom = doc_norms * np.linalg.norm(query)
    # A zero-norm vector has no direction; treat it as having zero similarity
    # rather than dividing by zero.
    similarity = np.divide(
        dots, denom, out=np.zeros_like(dots, dtype=float), where=denom > 0
    )
    distances = 1.0 - similarity

    # Stable sort so that equally distant papers keep their dataset order.
    ranked = np.argsort(distances, kind="stable")
    for item in ranked[:top_n]:
        print(data_info.iloc[item, :]['title'])
        print("\n")
    

In [None]:
# Example query: the title followed by the abstract of the first paper shown in
# data_info.head() (id 2011.04896). Because this paper is in the dataset, it is
# expected to come back as the closest match.
sentance = 'Generalized LSTM-based End-to-End Text-Independent Speaker Verification  The increasing amount of available data and more affordable hardware\nsolutions have opened a gate to the realm of Deep Learning (DL). Due to the\nrapid advancements and ever-growing popularity of DL, it has begun to invade\nalmost every field, where machine learning is applicable, by altering the\ntraditional state-of-the-art methods. While many researchers in the speaker\nrecognition area have also started to replace the former state-of-the-art\nmethods with DL techniques, some of the traditional i-vector-based methods are\nstill state-of-the-art in the context of text-independent speaker verification\n(TI-SV). In this paper, we discuss the most recent generalized end-to-end\n(GE2E) DL technique based on Long Short-term Memory (LSTM) units for TI-SV by\nGoogle and compare different scenarios and aspects including utterance\nduration, training time, and accuracy to prove that our method outperforms the\ntraditional methods.\n'

In [None]:
# Rank every paper against the example query using the SVD-reduced document
# vectors and print the titles of the closest matches.
recommendations(sentance,vectors)

HBox(children=(FloatProgress(value=0.0, max=99998.0), HTML(value='')))


Generalized LSTM-based End-to-End Text-Independent Speaker Verification


Deep learning methods in speaker recognition: a review


Scalable Deep Learning on Distributed Infrastructures: Challenges,
  Techniques and Tools


Text-Independent Speaker Verification Using 3D Convolutional Neural
  Networks


Model Asset eXchange: Path to Ubiquitous Deep Learning Deployment


Deep Speaker Embeddings for Far-Field Speaker Recognition on Short
  Utterances


Deep Learning for Source Code Modeling and Generation: Models,
  Applications and Challenges


An Orchestrated Empirical Study on Deep Learning Frameworks and
  Platforms


An Orchestrated Empirical Study on Deep Learning Frameworks and
  Platforms


Using Deep Learning to Improve Ensemble Smoother: Applications to
  Subsurface Characterization


