# INTRODUCTION
## __Aim:__ 
To build an article recommedation system using word embedding (Doc2Vec) and Cosine similarity.

## __Dataset:__ 
The Medium article dataset from kaggle was used. The dataset contains title, text from articles, and other information about the article.


## __Models:__
__Doc2Vec:__  Doc2Vec is a word embedding technique that computes a feature vector for every document in the corpus unlike Word2vec which compute vector for every word. <br/>
__Cosine Similarities:__ Cosine similarity is a metric used to measure the similarity of two vectors. it measures the cosine of the angle between two vectors projected in a multi-dimensional space.

# Data Wrangling

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/medium-articles/articles.csv


In [2]:
#import dependencies
#import dependencies
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
df = pd.read_csv('/kaggle/input/medium-articles/articles.csv')
df.head()

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   author        337 non-null    object
 1   claps         337 non-null    object
 2   reading_time  337 non-null    int64 
 3   link          337 non-null    object
 4   title         337 non-null    object
 5   text          337 non-null    object
dtypes: int64(1), object(5)
memory usage: 15.9+ KB


In [5]:
df.isna().sum()

author          0
claps           0
reading_time    0
link            0
title           0
text            0
dtype: int64

In [6]:
#Combine article title and text into one column
df['text'] = df['title'] + ' ' + df['text']

In [7]:
#drop unneccesary columns
df.drop(['title', 'author', 'claps', 'reading_time', 'link'], axis=1, inplace = True)

In [8]:
#check for duplicated text
df.duplicated().sum()

107

In [9]:
#drop duplicate text
df.drop_duplicates(inplace = True)

In [10]:
#reset index
df.reset_index(drop=True, inplace=True)

In [11]:
df.tail()

Unnamed: 0,text
225,Stochastic Weight Averaging — a New Way to Get...
226,"Artificial Intelligence, AI in 2018 and beyond..."
227,"Spiking Neural Networks, the Next Generation o..."
228,Surprise! Neurons are Now More Complex than We...
229,“WTH does a neural network even learn??” — a n...


# Text Prepreocesssing

In [12]:
lemma = WordNetLemmatizer()
nltk.download('stopwords')
stopword_list = stopwords.words('english')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
#Define a function for preprocessing

def clean(text):
  #remove special characters
  text = re.sub("[^A-Za-z0-9 ]", " ", text)
  #convert text to lowercase
  text = text.lower()
  #tokenize text
  tokens = nltk.word_tokenize(text)
  #remove stopwords
  text_list = []
  for token in tokens:
    if token not in stopword_list:
      #lemmatization
      text_list.append(lemma.lemmatize(token))
  return ' '.join(text_list)

In [14]:
#apply clean function on the text column
df.text = df.text.apply(clean)

In [15]:
df.head()

Unnamed: 0,text
0,chatbots next big thing happened startup mediu...
1,python data science 8 concept may forgotten ev...
2,automated feature engineering python towards d...
3,machine learning go zero hero freecodecamp und...
4,reinforcement learning scratch insight data wa...


In [16]:
#compute a list of the texts
text_list = [list(i.split(' ')) for i in df.text]

In [17]:
#Tagging the documents
#Each set of words are mapped unique index.
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_data = [TaggedDocument(words = doc, tags=[i]) for i, doc in enumerate(text_list)]

In [18]:
tagged_data[0]

TaggedDocument(words=['chatbots', 'next', 'big', 'thing', 'happened', 'startup', 'medium', 'oh', 'headline', 'blared', 'chatbots', 'next', 'big', 'thing', 'hope', 'sky', 'high', 'bright', 'eyed', 'bushy', 'tailed', 'industry', 'ripe', 'new', 'era', 'innovation', 'time', 'start', 'socializing', 'machine', 'road', 'sign', 'pointed', 'towards', 'insane', 'success', 'mobile', 'world', 'congress', '2017', 'chatbots', 'main', 'headliner', 'conference', 'organizer', 'cited', 'overwhelming', 'acceptance', 'event', 'inevitable', 'shift', 'focus', 'brand', 'corporates', 'chatbots', 'fact', 'significant', 'question', 'around', 'chatbots', 'would', 'monopolize', 'field', 'whether', 'chatbots', 'would', 'take', 'first', 'place', 'one', 'year', 'answer', 'question', 'even', 'ecosystem', 'platform', 'dominate', 'chatbots', 'first', 'technological', 'development', 'talked', 'grandiose', 'term', 'slump', 'spectacularly', 'age', 'old', 'hype', 'cycle', 'unfolded', 'familiar', 'fashion', 'expectation', '

# Modelleing (Doc2vec)

In [19]:
#Doc2vec
model = Doc2Vec(tagged_data, min_count=1, workers=4)

# Cosine similarity 

In [20]:
# take the first article in the dataframe and vectorize it. 
article_rec_sim = df.text[0].split(' ')
article_vectorized = model.infer_vector(article_rec_sim)


# Calculate cosine similarity for top 5 most similar article 
sim_5_article = model.dv.most_similar(positive=[article_vectorized])[:5]

# INFERENCE

In [21]:
#get inference
def inference(similar_articles):
    sim_list = []
    for ind, value in enumerate(similar_articles):
        
    #get index of the similar articles
        index = value[0]
        
    #print the similar element according to rank, index, the article itself 
    #and the corresponding cosine similarities
        sim_list.append([ind + 1, df.text[index], value[1]])
    return (pd.DataFrame(sim_list, columns=["rank", "text", "cos_sim"]))

In [22]:
#get the first 5 articles similar to the article with index 0
inference(sim_5_article)

Unnamed: 0,rank,text,cos_sim
0,1,chatbots next big thing happened startup mediu...,0.958229
1,2,ui new ui startup medium rise ui le apps shoul...,0.937256
2,3,invisible interface going transform way intera...,0.928555
3,4,siri descendant intelligent assistant evolve i...,0.928407
4,5,looking ghost machine weird thing short ago wr...,0.917099
