In [2]:
import os
import pandas as pd
import numpy as np

from scipy.stats import randint
from scipy.spatial import distance_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from io import StringIO

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

from pathlib import Path  
import glob

# import spacy
from spacy.language import Language
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
from spacy.lang.pt.stop_words import STOP_WORDS as pt_stop
from spacy.lang.ro.stop_words import STOP_WORDS as ro_stop
from spacy_langdetect import LanguageDetector

import altair as alt
from langdetect import detect, detect_langs
import pdftotext

import requests
from bs4 import BeautifulSoup
import urllib.request

# Web-scraping the theses PDF files and PDF to TXT conversion

In [58]:
url = 'https://tel.archives-ouvertes.fr/tel-03440181/document'
r = requests.get(url, stream=True)

with open('metadata.pdf', 'wb') as fd:
    for chunk in r.iter_content(1):
        fd.write(chunk)

In [40]:
link="https://tel.archives-ouvertes.fr/search/index/?q=%2A&sort=submittedDate_tdate+desc&rows=50"
cpt=0
pages=[]
for i in range (1):
    pages.append(link)
total_pages=[]
for i in pages:
    cpt+=1
    print(cpt)
    r = requests.get(i) 
    soup = BeautifulSoup(r.text, 'html.parser')
    links=[]#the list of links
    for i in soup.find_all('a',{'class':'ref-halid'}):
        links.append(i.get_text()) 


    total_pages+=links

In [41]:
for i in total_pages:
    url = 'https://tel.archives-ouvertes.fr/'+i+'/document'
    r = requests.get(url, stream=True)

    with open('TelPDF\\'+str(i)+'.pdf', 'wb') as fd:
        for chunk in r.iter_content(1):
            fd.write(chunk)

In [43]:
count=0
for i in total_pages:

    url = 'https://tel.archives-ouvertes.fr/'+i+'/document'
    urllib.request.urlretrieve(url, "{}.pdf".format(i))
    print(count)
    count +=1

In [49]:
filesNames = ['tel-03440321v1','tel-03440181v1','tel-03440058v1','tel-03439538v1',
              'tel-03439366v1','tel-03439358v1','tel-03439354v1','tel-03439346v1','tel-03439261v1',
              'tel-03438938v1','tel-03438925v1','tel-03438923v1','tel-03438921v1','tel-03438863v1',
              'tel-03438829v1','tel-03438828v1','tel-03438811v1','tel-03438755v1','tel-03438105v1',
              'tel-03438104v1','tel-03438103v1','tel-03438102v1','tel-03438101v1','tel-03438100v1',
              'tel-03437616v1','tel-03437573v1','tel-03437572v1','tel-01689242v1','tel-03412908v1',
              'tel-03437282v1','tel-03437096v1','tel-03437063v1','tel-03096870v2','tel-03436551v1',
              'tel-03436548v1','tel-03436545v1','tel-03436542v2','tel-03436530v1','tel-03436527v1',
              'tel-03436501v1','tel-03436409v1','tel-03436405v1','tel-03436394v1','tel-03436372v1',
              'tel-03436368v1','tel-03436364v1','tel-03436335v1','tel-03436173v1','tel-03436157v1']

In [50]:
url1='https://tel.archives-ouvertes.fr/'
cpt=0
final=[]
for i in filesNames:
    url = url1+i
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser')
    title=''
    if (soup.find('meta',attrs={'name':"DC.title"})):
        title=(soup.find('meta',attrs={'name':"DC.title"}).get('content'))
    elif (soup.find('meta',attrs={'name':"citation_title"})):
        title=(soup.find('meta',attrs={'name':"citation_title"}).get('content'))
    else:
        title=''
    final.append(title)

In [51]:
titles=final
titles[:5]

['Education, Well-Being and Aspirations ; a Capability based Analysis of the Secondary Schooling System in France',
 "Algorithmes multi-critères pour la prédiction de structures d'ARN",
 'La microfinance et appui au microentrepreneuriat : cas du Burundi',
 "Importance de la re-domestication pour la conservation de l'agrobiodiversité : le cas du châtaignier",
 'Homomorphic Cryptography and Privacy']

In [62]:
for i in filesNames:
    with open('TelPDF\\'+i+".pdf", "rb") as f:
        pdf = pdftotext.PDF(f)

    # Save all text to a txt file.
    with open('TelTXT\\'+i+'.txt', 'w',encoding = 'utf-8') as f:
        f.write("\n\n".join(pdf))

In [64]:
# dataset = []
# file = open("TelTXT//tel-03412908v1.txt", 'r')
# text = file.read().strip()
# file.close()

# for j in range(len(file)):
#     dataset.append((str(i) + str(file_name[j]), file_title[j]))

In [65]:
# with open("TelTXT//tel-03412908v1.txt", "r",encoding="utf-8") as f:
#     #read file as text and store in variable `text`
#     text = f.read()

# Language detection of documents

In [52]:
languages = []
for i in filesNames:
    with open("TelTXT//"+i+'.txt', "r",encoding="utf-8") as f:
        #read file as text and store in variable `text`
        text = f.read()
    languages.append(str(detect_langs(text))[1:-1].split()[0].split(':')[0])

In [53]:
langs=pd.DataFrame(filesNames,columns=['File Name'])
langs['Title']= titles
langs['Language']=languages
langs.head()

Unnamed: 0,File Name,Title,Language
0,tel-03440321v1,"Education, Well-Being and Aspirations ; a Capa...",en
1,tel-03440181v1,Algorithmes multi-critères pour la prédiction ...,fr
2,tel-03440058v1,La microfinance et appui au microentrepreneuri...,fr
3,tel-03439538v1,Importance de la re-domestication pour la cons...,fr
4,tel-03439366v1,Homomorphic Cryptography and Privacy,fr


In [56]:
filesNames = ['tel-03440321v1',
             'tel-03440181v1',
             'tel-03440058v1',
             'tel-03439538v1',
             'tel-03439366v1',
             'tel-03439358v1',
             'tel-03439354v1',
             'tel-03439346v1',
             'tel-03439261v1',
             'tel-03438938v1']
x=langs[langs['File Name'].isin(filesNames)]
x['Node']= [0,1,2,3,4,5,6,7,8,9]
x[['Node','File Name','Title']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Node']= [0,1,2,3,4,5,6,7,8,9]


Unnamed: 0,Node,File Name,Title
0,0,tel-03440321v1,"Education, Well-Being and Aspirations ; a Capa..."
1,1,tel-03440181v1,Algorithmes multi-critères pour la prédiction ...
2,2,tel-03440058v1,La microfinance et appui au microentrepreneuri...
3,3,tel-03439538v1,Importance de la re-domestication pour la cons...
4,4,tel-03439366v1,Homomorphic Cryptography and Privacy
5,5,tel-03439358v1,Observer-based boundary control of distributed...
6,6,tel-03439354v1,In search of frictions
7,7,tel-03439346v1,Le droit face à l'art corporel : du corps comm...
8,8,tel-03439261v1,Récupérateur d’énergie pour les systèmes TPMS ...
9,9,tel-03438938v1,Rôle des contacts adhésifs et frottants dans l...


## Another approach for language detection

In [57]:
languages = []
for i in filesNames:
    with open("TelTXT//"+i+'.txt', "r",encoding="utf-8") as f:
        #read file as text and store in variable `text`
        text = f.read()
    languages.append(detect(text))

In [58]:
languages[:5]

['en', 'fr', 'fr', 'fr', 'fr']

In [59]:
directory_path = "TelTXT//"
text_files = glob.glob(f"{directory_path}/*.txt")

In [60]:
text_files[:5]

['TelTXT\\tel-01689242v1.txt',
 'TelTXT\\tel-03096870v2.txt',
 'TelTXT\\tel-03412908v1.txt',
 'TelTXT\\tel-03436157v1.txt',
 'TelTXT\\tel-03436173v1.txt']

In [61]:
text_titles = [Path(text).stem for text in text_files]

In [62]:
text_titles[:5]

['tel-01689242v1',
 'tel-03096870v2',
 'tel-03412908v1',
 'tel-03436157v1',
 'tel-03436173v1']

In [63]:
final_stopwords_list = list(fr_stop) + list(en_stop)+list(pt_stop)+list(ro_stop)

In [64]:
tfidf_vectorizer = TfidfVectorizer(input='filename',stop_words=final_stopwords_list)

# TFIDF for one file

In [65]:
tfidf_vector = tfidf_vectorizer.fit_transform(["TelTXT\\tel-03440321v1.txt"])




In [66]:
tfidf_vector

<1x6221 sparse matrix of type '<class 'numpy.float64'>'
	with 6221 stored elements in Compressed Sparse Row format>

In [67]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=["tel-03440321v1.txt"], columns=tfidf_vectorizer.get_feature_names())




In [68]:
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()


In [69]:
tfidf_df = tfidf_df.drop('00_Document Frequency', errors='ignore')


In [70]:
tfidf_df.stack().reset_index().head()


Unnamed: 0,level_0,level_1,0
0,tel-03440321v1.txt,0,0.00158
1,tel-03440321v1.txt,1,0.011848
2,tel-03440321v1.txt,2,0.00079
3,tel-03440321v1.txt,6,0.00079
4,tel-03440321v1.txt,1,0.00079


In [71]:
tfidf_df = tfidf_df.stack().reset_index()


In [72]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})


In [73]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000250026A3940>

In [74]:
top_tfidf= tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])
top_tfidf.head(5)

Unnamed: 0,document,term,tfidf
922,tel-03440321v1.txt,aspirations,0.372012
2208,tel-03440321v1.txt,education,0.3341
866,tel-03440321v1.txt,approach,0.206936
1243,tel-03440321v1.txt,capability,0.205357
5156,tel-03440321v1.txt,school,0.199828


In [75]:
top_tfidf = tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])


In [76]:
tfidf_df.shape

(6221, 3)

# TFIDF on the 20 files

In [77]:
text_titles = [Path(text).stem for text in text_files[:20]]

In [78]:
# tfidf_vectorizer = TfidfVectorizer(input='filename',stop_words=final_stopwords_list)

In [79]:
tfidf_vector = tfidf_vectorizer.fit_transform(text_files[:20])

In [80]:
tfidf_vector

<20x95153 sparse matrix of type '<class 'numpy.float64'>'
	with 167896 stored elements in Compressed Sparse Row format>

In [81]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles[:20], columns=tfidf_vectorizer.get_feature_names())




In [82]:
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()


In [83]:
tfidf_df = tfidf_df.drop('00_Document Frequency', errors='ignore')


In [84]:
tfidf_df.stack().reset_index().head()


Unnamed: 0,level_0,level_1,0
0,tel-01689242v1,0,0.004497
1,tel-01689242v1,0,0.062952
2,tel-01689242v1,0,0.0
3,tel-01689242v1,1,0.0
4,tel-01689242v1,2,0.0


In [85]:
tfidf_df = tfidf_df.stack().reset_index()


In [86]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})


In [87]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002507E53E790>

In [88]:
top_tfidf= tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(5)

In [89]:
(top_tfidf).head(15)


Unnamed: 0,document,term,tfidf
72101,tel-01689242v1,rsn,0.388606
8278,tel-01689242v1,adoption,0.2707
15574,tel-01689242v1,bibliothèques,0.263537
15573,tel-01689242v1,bibliothèque,0.260701
15576,tel-01689242v1,bibliothécaires,0.254455
188965,tel-03096870v2,λn,0.272729
112145,tel-03096870v2,c2,0.268601
180285,tel-03096870v2,tr,0.252564
137936,tel-03096870v2,hℓ,0.22418
186001,tel-03096870v2,zn,0.186875


In [90]:
pd.merge(top_tfidf,langs[['File Name','Title']],left_on='document',right_on='File Name')[['document','Title','term','tfidf']].head(15)

Unnamed: 0,document,Title,term,tfidf
0,tel-01689242v1,Determinants of the professional adoption of d...,rsn,0.388606
1,tel-01689242v1,Determinants of the professional adoption of d...,adoption,0.2707
2,tel-01689242v1,Determinants of the professional adoption of d...,bibliothèques,0.263537
3,tel-01689242v1,Determinants of the professional adoption of d...,bibliothèque,0.260701
4,tel-01689242v1,Determinants of the professional adoption of d...,bibliothécaires,0.254455
5,tel-03096870v2,Asymptotic representation theory and applicati...,λn,0.272729
6,tel-03096870v2,Asymptotic representation theory and applicati...,c2,0.268601
7,tel-03096870v2,Asymptotic representation theory and applicati...,tr,0.252564
8,tel-03096870v2,Asymptotic representation theory and applicati...,hℓ,0.22418
9,tel-03096870v2,Asymptotic representation theory and applicati...,zn,0.186875


In [91]:
set(top_tfidf.document)

{'tel-01689242v1',
 'tel-03096870v2',
 'tel-03412908v1',
 'tel-03436157v1',
 'tel-03436173v1',
 'tel-03436335v1',
 'tel-03436364v1',
 'tel-03436368v1',
 'tel-03436372v1',
 'tel-03436394v1',
 'tel-03436405v1',
 'tel-03436409v1',
 'tel-03436501v1',
 'tel-03436527v1',
 'tel-03436530v1',
 'tel-03436542v2',
 'tel-03436545v1',
 'tel-03436548v1',
 'tel-03436551v1',
 'tel-03437063v1'}

In [92]:
tfidf_df.shape

(1903060, 3)

In [95]:
# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)
# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)
# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N'
)
# display the three superimposed visualizations
(heatmap  + text).properties(width = 600)

# TFIDF on the 50 files

In [96]:
text_titles = [Path(text).stem for text in text_files]

In [97]:
# tfidf_vectorizer = TfidfVectorizer(input='filename',stop_words=final_stopwords_list)

In [98]:
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [99]:
tfidf_vector

<50x169025 sparse matrix of type '<class 'numpy.float64'>'
	with 393887 stored elements in Compressed Sparse Row format>

In [100]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names())




In [101]:
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()


In [102]:
tfidf_df = tfidf_df.drop('00_Document Frequency', errors='ignore')


In [103]:
tfidf_df.stack().reset_index().head()


Unnamed: 0,level_0,level_1,0
0,tel-01689242v1,0,0.003609
1,tel-01689242v1,0,0.051141
2,tel-01689242v1,0,0.0
3,tel-01689242v1,0,0.0
4,tel-01689242v1,97,0.0


In [104]:
tfidf_df = tfidf_df.stack().reset_index()


In [105]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})


In [106]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002500AFCDAC0>

In [107]:
top_tfidf= tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(5)

In [108]:
(top_tfidf).head(15)


Unnamed: 0,document,term,tfidf
132350,tel-01689242v1,rsn,0.446087
28500,tel-01689242v1,bibliothèques,0.290581
28502,tel-01689242v1,bibliothécaires,0.283913
15727,tel-01689242v1,adoption,0.272461
28499,tel-01689242v1,bibliothèque,0.248328
335996,tel-03096870v2,λn,0.319893
248615,tel-03096870v2,hℓ,0.262949
271434,tel-03096870v2,mills,0.209684
321595,tel-03096870v2,tr,0.190259
200666,tel-03096870v2,c2,0.188038


In [109]:
pd.merge(top_tfidf,langs[['File Name','Title']],left_on='document',right_on='File Name')[['document','Title','term','tfidf']].head(15)

Unnamed: 0,document,Title,term,tfidf
0,tel-01689242v1,Determinants of the professional adoption of d...,rsn,0.446087
1,tel-01689242v1,Determinants of the professional adoption of d...,bibliothèques,0.290581
2,tel-01689242v1,Determinants of the professional adoption of d...,bibliothécaires,0.283913
3,tel-01689242v1,Determinants of the professional adoption of d...,adoption,0.272461
4,tel-01689242v1,Determinants of the professional adoption of d...,bibliothèque,0.248328
5,tel-03096870v2,Asymptotic representation theory and applicati...,λn,0.319893
6,tel-03096870v2,Asymptotic representation theory and applicati...,hℓ,0.262949
7,tel-03096870v2,Asymptotic representation theory and applicati...,mills,0.209684
8,tel-03096870v2,Asymptotic representation theory and applicati...,tr,0.190259
9,tel-03096870v2,Asymptotic representation theory and applicati...,c2,0.188038


In [110]:
tfidf_df.shape

(8451250, 3)

In [112]:
# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)
# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)
# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N'
)
# display the three superimposed visualizations
(heatmap  + text).properties(width = 600)

# N-grams

## Bigrams

In [278]:
text_titles = [Path(text).stem for text in text_files[:10]]

In [279]:
tfidf_vectorizer = TfidfVectorizer(input='filename',stop_words=final_stopwords_list,ngram_range=(2,2))

In [280]:
tfidf_vector = tfidf_vectorizer.fit_transform(text_files[:10])

  % sorted(inconsistent)


In [281]:
tfidf_vector

<10x373248 sparse matrix of type '<class 'numpy.float64'>'
	with 378519 stored elements in Compressed Sparse Row format>

In [282]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names())



In [283]:
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

In [284]:
tfidf_df = tfidf_df.drop('00_Document Frequency', errors='ignore')


In [285]:
tfidf_df.stack().reset_index().head()


Unnamed: 0,level_0,level_1,0
0,tel-01689242v1,00 00,0.001802
1,tel-01689242v1,00 01,0.0
2,tel-01689242v1,00 025,0.0
3,tel-01689242v1,00 05,0.0
4,tel-01689242v1,00 11,0.001517


In [286]:
tfidf_df = tfidf_df.stack().reset_index()


In [287]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})


In [288]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001FCC7D05F98>

In [289]:
top_tfidf= tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])

In [290]:
top_tfidf.head(5).head(5)

Unnamed: 0,document,term,tfidf
288046,tel-01689242v1,réseaux sociaux,0.368825
304930,tel-01689242v1,sociaux numériques,0.358015
178041,tel-01689242v1,intention adoption,0.330708
30303,tel-01689242v1,adoption rsn,0.329191
141720,tel-01689242v1,facilité utilisation,0.159286


## Trigrams

In [327]:
text_titles = [Path(text).stem for text in text_files[:5]]

In [328]:
tfidf_vectorizer = TfidfVectorizer(input='filename',stop_words=final_stopwords_list,ngram_range=(3,3))

In [329]:
tfidf_vector = tfidf_vectorizer.fit_transform(text_files[:5])

  % sorted(inconsistent)


In [330]:
tfidf_vector

<5x118519 sparse matrix of type '<class 'numpy.float64'>'
	with 118684 stored elements in Compressed Sparse Row format>

In [331]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names())



In [332]:
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

In [333]:
tfidf_df = tfidf_df.drop('00_Document Frequency', errors='ignore')


In [334]:
tfidf_df.stack().reset_index().head()


Unnamed: 0,level_0,level_1,0
0,tel-01689242v1,00 00 00,0.001908
1,tel-01689242v1,00 00 01,0.0
2,tel-01689242v1,00 00 15,0.0
3,tel-01689242v1,00 00 73,0.002365
4,tel-01689242v1,00 00 gmt,0.0


In [335]:
tfidf_df = tfidf_df.stack().reset_index()


In [336]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})


In [337]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001FC8FF76828>

In [338]:
top_tfidf= tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])

In [339]:
top_tfidf.head(5).head(10)

Unnamed: 0,document,term,tfidf
92561,tel-01689242v1,réseaux sociaux numériques,0.548699
60632,tel-01689242v1,intention adoption rsn,0.331112
47572,tel-01689242v1,facilité utilisation perçue,0.189207
29799,tel-01689242v1,compatibilité tâche technologie,0.160826
14850,tel-01689242v1,adoption réseaux sociaux,0.115889
224486,tel-03096870v2,tr hℓ tr,0.200508
174932,tel-03096870v2,hℓ tr hℓ,0.19179
194986,tel-03096870v2,orie yang mills,0.165637
187562,tel-03096870v2,mesure yang mills,0.148201
233570,tel-03096870v2,yang mills dimensions,0.139484


## Quadrigrams

In [340]:
text_titles = [Path(text).stem for text in text_files[:5]]

In [341]:
tfidf_vectorizer = TfidfVectorizer(input='filename',stop_words=final_stopwords_list,ngram_range=(4,4))

In [342]:
tfidf_vector = tfidf_vectorizer.fit_transform(text_files[:5])

  % sorted(inconsistent)


In [343]:
tfidf_vector

<5x127217 sparse matrix of type '<class 'numpy.float64'>'
	with 127360 stored elements in Compressed Sparse Row format>

In [344]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names())



In [345]:
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

In [346]:
tfidf_df = tfidf_df.drop('00_Document Frequency', errors='ignore')


In [347]:
tfidf_df.stack().reset_index().head()


Unnamed: 0,level_0,level_1,0
0,tel-01689242v1,00 00 00 01,0.0
1,tel-01689242v1,00 00 00 73,0.003667
2,tel-01689242v1,00 00 01 00,0.0
3,tel-01689242v1,00 00 01 01,0.0
4,tel-01689242v1,00 00 01 contact,0.0


In [348]:
tfidf_df = tfidf_df.stack().reset_index()


In [349]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})


In [350]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001FCC81C7748>

In [351]:
top_tfidf= tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])

In [353]:
top_tfidf.head(5).head(15)

Unnamed: 0,document,term,tfidf
15493,tel-01689242v1,adoption réseaux sociaux numériques,0.176019
117909,tel-01689242v1,utilisation réseaux sociaux numériques,0.12468
28178,tel-01689242v1,chapitre présentation résultats enquête,0.110012
98994,tel-01689242v1,réseaux sociaux numériques bibliothécaires,0.088009
85615,tel-01689242v1,perçue intention adoption rsn,0.077008
240774,tel-03096870v2,tr hℓ tr hℓ,0.235362
209084,tel-03096870v2,orie yang mills dimensions,0.165823
155384,tel-03096870v2,chapitre orie yang mills,0.149776
222868,tel-03096870v2,repre sentations irre ductibles,0.12303
152930,tel-03096870v2,c2 λn c2 λn,0.106983


# Distance matrix of cosine similarities between chosen 10 theses documents

In [305]:
filesNames = ['tel-03440321v1',
 'tel-03440243v1',
 'tel-03440181v1',
 'tel-03440058v1',
 'tel-03439538v1',
 'tel-03439366v1',
 'tel-03439358v1',
 'tel-03439354v1',
 'tel-03439346v1',
 'tel-03439261v1']

In [306]:
directory_path = "TelTXT//"

text_files = glob.glob(f"{directory_path}/*.txt")
text_files = text_files[:10]

In [307]:
text_titles = [Path(text).stem for text in text_files]

In [308]:
tfidf_vectorizer = TfidfVectorizer(input='filename',stop_words=final_stopwords_list)

In [309]:
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

  % sorted(inconsistent)


In [310]:
tfidf_vector

<10x66650 sparse matrix of type '<class 'numpy.float64'>'
	with 91176 stored elements in Compressed Sparse Row format>

In [311]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names())




In [312]:
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()


In [313]:
tfidf_df = tfidf_df.drop('00_Document Frequency', errors='ignore')


In [314]:
tfidf_df.stack().reset_index().head()


Unnamed: 0,level_0,level_1,0
0,tel-01689242v1,00,0.004351
1,tel-01689242v1,000,0.071359
2,tel-01689242v1,00001,0.0
3,tel-01689242v1,00001l,0.0
4,tel-01689242v1,00005,0.0


In [315]:
tfidf_df = tfidf_df.stack().reset_index()


In [316]:
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})


In [317]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001FCC81C0390>

In [318]:
top_tfidf= tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document'])

In [319]:
top_tfidf.head().head(15)

Unnamed: 0,document,term,tfidf
49795,tel-01689242v1,rsn,0.332799
3731,tel-01689242v1,adoption,0.304953
9134,tel-01689242v1,bibliothèques,0.250116
9133,tel-01689242v1,bibliothèque,0.236304
9136,tel-01689242v1,bibliothécaires,0.225328
76569,tel-03096870v2,c2,0.284877
133164,tel-03096870v2,λn,0.25426
126587,tel-03096870v2,tr,0.252357
95424,tel-03096870v2,hℓ,0.208999
130422,tel-03096870v2,yang,0.192379


In [320]:
cosine_similarities = linear_kernel(tfidf_vector[0:1], tfidf_vector).flatten()
cosine_similarities

array([1.        , 0.0354408 , 0.02171873, 0.03081323, 0.0520659 ,
       0.07895018, 0.06078642, 0.10336703, 0.11482271, 0.00701532])

In [321]:
cosine_similarities = linear_kernel(tfidf_vector)


In [322]:
cosine_similarities

array([[1.        , 0.0354408 , 0.02171873, 0.03081323, 0.0520659 ,
        0.07895018, 0.06078642, 0.10336703, 0.11482271, 0.00701532],
       [0.0354408 , 1.        , 0.0104621 , 0.04146731, 0.03825931,
        0.02695456, 0.10100862, 0.02313281, 0.03997218, 0.03789834],
       [0.02171873, 0.0104621 , 1.        , 0.01094544, 0.00848391,
        0.01015373, 0.01788719, 0.01056374, 0.01264582, 0.00419403],
       [0.03081323, 0.04146731, 0.01094544, 1.        , 0.10625026,
        0.05297779, 0.03311395, 0.01635189, 0.01584493, 0.00654924],
       [0.0520659 , 0.03825931, 0.00848391, 0.10625026, 1.        ,
        0.05615615, 0.05489794, 0.01689278, 0.02058365, 0.00827723],
       [0.07895018, 0.02695456, 0.01015373, 0.05297779, 0.05615615,
        1.        , 0.05745994, 0.04330723, 0.06196728, 0.01769252],
       [0.06078642, 0.10100862, 0.01788719, 0.03311395, 0.05489794,
        0.05745994, 1.        , 0.04325384, 0.06400919, 0.01938306],
       [0.10336703, 0.02313281, 0.0105637

In [323]:
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
related_docs_indices


array([[2, 3, 0, 4, 7, 8, 5, 6, 1, 9],
       [2, 9, 3, 4, 1, 5, 6, 0, 7, 8],
       [9, 2, 3, 4, 1, 6, 5, 0, 8, 7],
       [2, 9, 3, 7, 4, 5, 0, 8, 1, 6]], dtype=int64)

In [324]:
pd.DataFrame(cosine_similarities).to_csv('cosine_similarities.csv')

In [325]:
# compute the distance matrix
dist_mat = distance_matrix(list(cosine_similarities), list(cosine_similarities))
 
# display distance matrix
print("Distance Matrix:\n", dist_mat)

Distance Matrix:
 [[0.         1.37058199 1.39377917 1.37860186 1.34905396 1.30533722
  1.33243455 1.2697514  1.25326859 1.41404973]
 [1.37058199 0.         1.40342763 1.35982058 1.36331898 1.37810608
  1.27266303 1.38933552 1.36719415 1.36451333]
 [1.39377917 1.40342763 0.         1.40327303 1.40733462 1.40443264
  1.39557774 1.40883999 1.4082209  1.40865585]
 [1.37860186 1.35982058 1.40327303 0.         1.26433998 1.34250167
  1.37116534 1.40222446 1.40349461 1.40911693]
 [1.34905396 1.36331898 1.40733462 1.26433998 0.         1.33711603
  1.3411161  1.40012241 1.395649   1.40779041]
 [1.30533722 1.37810608 1.40443264 1.34250167 1.33711603 0.
  1.33530538 1.35705865 1.3322248  1.39449686]
 [1.33243455 1.27266303 1.39557774 1.37116534 1.3411161  1.33530538
  0.         1.35928744 1.33087061 1.39229202]
 [1.2697514  1.38933552 1.40883999 1.40222446 1.40012241 1.35705865
  1.35928744 0.         1.20619735 1.41228584]
 [1.25326859 1.36719415 1.4082209  1.40349461 1.395649   1.3322248
  1

In [326]:
pd.DataFrame(dist_mat)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,1.370582,1.393779,1.378602,1.349054,1.305337,1.332435,1.269751,1.253269,1.41405
1,1.370582,0.0,1.403428,1.359821,1.363319,1.378106,1.272663,1.389336,1.367194,1.364513
2,1.393779,1.403428,0.0,1.403273,1.407335,1.404433,1.395578,1.40884,1.408221,1.408656
3,1.378602,1.359821,1.403273,0.0,1.26434,1.342502,1.371165,1.402224,1.403495,1.409117
4,1.349054,1.363319,1.407335,1.26434,0.0,1.337116,1.341116,1.400122,1.395649,1.40779
5,1.305337,1.378106,1.404433,1.342502,1.337116,0.0,1.335305,1.357059,1.332225,1.394497
6,1.332435,1.272663,1.395578,1.371165,1.341116,1.335305,0.0,1.359287,1.330871,1.392292
7,1.269751,1.389336,1.40884,1.402224,1.400122,1.357059,1.359287,0.0,1.206197,1.412286
8,1.253269,1.367194,1.408221,1.403495,1.395649,1.332225,1.330871,1.206197,0.0,1.407447
9,1.41405,1.364513,1.408656,1.409117,1.40779,1.394497,1.392292,1.412286,1.407447,0.0


![Cosine Similarities between the theses documents](cosine_similarities_plot.png)