In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Text_Similarity_Dataset.csv')

In [3]:
data

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...
5,5,india seeks to boost construction india has cl...,music mogul fuller sells company pop idol supr...
6,6,podcasters look to net money nasa is doing it...,ukip outspent labour on eu poll the uk indepen...
7,7,row over police power for csos the police fe...,ban on hunting comes into force fox hunting wi...
8,8,election could be terror target terrorists m...,nhs waiting time target is cut hospital waitin...
9,9,japan economy slides to recession the japanese...,optimism remains over uk housing the uk proper...


In [4]:
data['text1'].iloc[0]

'savvy searchers fail to spot ads internet search engine users are an odd mix of naive and sophisticated  suggests a report into search habits.  the report by the us pew research center reveals that 87% of searchers usually find what they were looking for when using a search engine. it also shows that few can spot the difference between paid-for results and organic ones. the report reveals that 84% of net users say they regularly use google  ask jeeves  msn and yahoo when online.  almost 50% of those questioned said they would trust search engines much less  if they knew information about who paid for results was being hidden. according to figures gathered by the pew researchers the average users spends about 43 minutes per month carrying out 34 separate searches and looks at 1.9 webpages for each hunt. a significant chunk of net users  36%  carry out a search at least weekly and 29% of those asked only look every few weeks. for 44% of those questioned  the information they are looking

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4023 entries, 0 to 4022
Data columns (total 3 columns):
Unique_ID    4023 non-null int64
text1        4023 non-null object
text2        4023 non-null object
dtypes: int64(1), object(2)
memory usage: 94.4+ KB


## Training the model :-

In [6]:
import gensim.downloader as api

In [7]:
samp_data = pd.read_csv('reddit_worldnews_start_to_2016-11-22.csv')

In [8]:
samp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509236 entries, 0 to 509235
Data columns (total 8 columns):
time_created    509236 non-null int64
date_created    509236 non-null object
up_votes        509236 non-null int64
down_votes      509236 non-null int64
title           509236 non-null object
over_18         509236 non-null bool
author          509236 non-null object
subreddit       509236 non-null object
dtypes: bool(1), int64(3), object(4)
memory usage: 27.7+ MB


In [9]:
samp_data.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [10]:
type(samp_data['title'])

pandas.core.series.Series

In [11]:
samp_list = samp_data['title'].tolist()

In [12]:
len(samp_list)

509236

In [13]:
samp_list

['Scores killed in Pakistan clashes',
 'Japan resumes refuelling mission',
 'US presses Egypt on Gaza border',
 'Jump-start economy: Give health care to all ',
 'Council of Europe bashes EU&UN terror blacklist',
 'Hay presto! Farmer unveils the  illegal  mock-Tudor castle he tried to hide behind 40ft hay bales',
 'Strikes, Protests and Gridlock at the Poland-Ukraine Border',
 'The U.N. Mismanagement Program',
 'Nicolas Sarkozy threatens to sue Ryanair ',
 'US plans for missile shields in Polish town met with resistance [video]',
 'Archbishop of Canterbury calls for new law to punish  thoughtless or cruel  words -Times Online',
 'Top US Envoy: Violence In Kenya Is  Ethnic Cleansing ',
 'Team building float to commemorate the Holocaust in Rio De Jinero has one hell of an idea.',
 'Migrant workers told to abandon Lunar New Year holiday plans',
 ' Sarkozy, Girlfriend Sue Ryanair Over Ad',
 'Nicolas Sarkozy, Angela Merkel confirm their opposition to Turkey being EU membership',
 'Mass Evacu

### Text cleaning :-

In [14]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn

In [15]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [16]:
stop_words = set(stopwords.words('english'))

In [17]:
lemmatizer = WordNetLemmatizer()  

In [18]:
sen = []
for i in samp_list:
    tokenn = word_tokenize(i)
    tokenn = [w.lower() for w in tokenn]
    words = [word for word in tokenn if word.isalpha()]
    words = [w for w in words if not w in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    sen.append(words)

In [19]:
sen

[['score', 'killed', 'pakistan', 'clash'],
 ['japan', 'resume', 'refuelling', 'mission'],
 ['u', 'press', 'egypt', 'gaza', 'border'],
 ['economy', 'give', 'health', 'care'],
 ['council', 'europe', 'bash', 'eu', 'un', 'terror', 'blacklist'],
 ['hay',
  'presto',
  'farmer',
  'unveils',
  'illegal',
  'castle',
  'tried',
  'hide',
  'behind',
  'hay',
  'bale'],
 ['strike', 'protest', 'gridlock', 'border'],
 ['mismanagement', 'program'],
 ['nicolas', 'sarkozy', 'threatens', 'sue', 'ryanair'],
 ['u',
  'plan',
  'missile',
  'shield',
  'polish',
  'town',
  'met',
  'resistance',
  'video'],
 ['archbishop',
  'canterbury',
  'call',
  'new',
  'law',
  'punish',
  'thoughtless',
  'cruel',
  'word',
  'online'],
 ['top', 'u', 'envoy', 'violence', 'kenya', 'ethnic', 'cleansing'],
 ['team',
  'building',
  'float',
  'commemorate',
  'holocaust',
  'rio',
  'de',
  'jinero',
  'one',
  'hell',
  'idea'],
 ['migrant',
  'worker',
  'told',
  'abandon',
  'lunar',
  'new',
  'year',
  'hol

## Converting text data into Doc2Vec :-

In [20]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [21]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sen)]

In [22]:
model = Doc2Vec(documents, vector_size=100)

## Calculating cosine similarity :-

#### Example :-

In [23]:
#ss = ['Cats are beautiful animals','Dogs are awesome']
ss = ['Cats are beautiful animals','Dolphins are swimming mammals']
#ss = ['Some gorgeous creatures are felines.','Cats are beautiful animals']
#ss = ['Cats are beautiful animals','Human walk on the pedal']
#ss = ['Cats are beautiful animals','cats are awesome too']

In [24]:
sen1 = []
for i in ss:
    tokenn = word_tokenize(i)
    tokenn = [w.lower() for w in tokenn]
    words = [word for word in tokenn if word.isalpha()]
    words = [w for w in words if not w in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    sen1.append(words)

In [25]:
sen1

[['cat', 'beautiful', 'animal'], ['dolphin', 'swimming', 'mammal']]

In [26]:
model.n_similarity(sen1[0],sen1[1])

  """Entry point for launching an IPython kernel.


0.794842

### Applying on our dataset :-

In [None]:
#model.n_similarity(sen0[0],sen0[1])
#1-score

In [27]:
data.iloc[0]

Unique_ID                                                    0
text1        savvy searchers fail to spot ads internet sear...
text2        newcastle 2-1 bolton kieron dyer smashed home ...
Name: 0, dtype: object

In [28]:
data.iloc[4022]['text1']



In [101]:
sen_a = []
sen_b = []
score_list = []
for i in range(4023):
    # for text1
    sentence_a = data.iloc[i]['text1']
    tokenn = word_tokenize(sentence_a)
    tokenn = [w.lower() for w in tokenn]
    words = [word for word in tokenn if word.isalpha()]
    words = [w for w in words if not w in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    sen_a.append(words)
    # for text2
    sentence_b = data.iloc[i]['text2']
    tokenn_b = word_tokenize(sentence_b)
    tokenn_b = [w.lower() for w in tokenn_b]
    words_b = [word for word in tokenn_b if word.isalpha()]
    words_b = [w for w in words_b if not w in stop_words]  
    words_b = [lemmatizer.lemmatize(w) for w in words_b]
    sen_b.append(words_b)
    # removing difficult words
    for word in list(sen_a[0]):
        if word not in list(model.wv.vocab):
            sen_a[0].remove(word) 
            
    for word in list(sen_b[0]):
        if word not in list(model.wv.vocab):
            sen_b[0].remove(word) 
            
    # Scoring 
    if (len(sen_a[0]) == 0) or (len(sen_b[0])==0):
        score = 1
    else:    
        score = 1-model.n_similarity(sen_a[0],sen_b[0])
    #1-score
    score_list.append(score)
    sen_a = []
    sen_b = []



In [102]:
score_list

[0.34884113073349,
 0.27900516986846924,
 0.2604762315750122,
 0.14642560482025146,
 0.20535069704055786,
 0.30517488718032837,
 0.5234251618385315,
 0.18254899978637695,
 0.3674554228782654,
 0.09298968315124512,
 0.3957188129425049,
 0.25637853145599365,
 0.4043685793876648,
 0.1963912844657898,
 0.18849390745162964,
 0.2627444863319397,
 0.40299129486083984,
 0.3001595735549927,
 0.09910523891448975,
 0.20932269096374512,
 0.360012412071228,
 0.3246532678604126,
 0.2267633080482483,
 0.46149104833602905,
 0.3684961795806885,
 0.203576922416687,
 0.21206659078598022,
 0.14865374565124512,
 0.3707279562950134,
 0.16357433795928955,
 0.5820660591125488,
 0.7069687843322754,
 0.21741390228271484,
 0.23947691917419434,
 0.39390695095062256,
 0.46647173166275024,
 0.14691424369812012,
 0.268889844417572,
 0.593011885881424,
 0.35923153162002563,
 0.31912457942962646,
 0.5360194742679596,
 0.355324387550354,
 0.1956331729888916,
 0.439319372177124,
 0.1361762285232544,
 0.41796743869781494

In [103]:
id = []
for i in range(4023):
    id.append(i)

In [104]:
output = pd.DataFrame(id,columns=['Unique_id'])

In [105]:
output['similarity score'] = score_list

In [106]:
output.to_csv('Result.csv')