# Desirialization of the RDF graph from Le Temps database

This code aims at creating pulses for the ClioWire platform.

##### Note:
Because of the confidential nature of the data, all of the items from the dataset are stored locally. 

In [23]:
#Importing the necessary librairies
import pandas as pd
import numpy as np
import glob
import re

In [81]:
#Getting all the CSV, obtained from SPARQL queries.
allFiles = glob.glob("Data/functions/*.csv")

In [82]:
display(allFiles)

['Data/functions\\ecrivain.csv']

## Sorting the SPARQL outputs into a well strctured DataFrame

In [85]:
#Initiating the DataFrame
Table = pd.DataFrame(columns=['issue', 'date', 'artid', 'page', 'name', 'function', 'nationality'])
K = 0
while (K < int(len(allFiles))):
    #Creating a DataFrame and filling it with the excel's data
    title = allFiles[K][15:].replace('.csv', '')
    df = pd.read_csv(allFiles[K])
    df['function'] = title
    Table = Table.append(df)
    K+=1  



#Extracting issue date
Table['issue'] = Table['issue'] + Table.date.str[:10]
Table['date'] = Table.date.str[:4]

#Cleaning the issue date
Table['issue'] = Table['issue'].str.replace('-', '_')

#Cutting the title in order not to excess the maximum size of pulses
Table['title'] = Table.title.str[:100]

#Sorting values
Table = Table.sort_values('date', ascending=False)

#Keeping only alphabetical characters to be able to use the title as a hastaga
Table['title'] = Table.title.str.replace('\W', '')

#Rearranging the page number by removing periods
Table['page'] = Table['page'].astype(str).str[:2]
Table['page'] = Table['page'].str.replace('.', '').apply(str)

#Converting the articleID into a string
Table['artid'] = Table['artid'].apply(str)

#Dropping dupiplicates of names
Table = Table.drop_duplicates('name')


In [86]:
display(Table.head())

Unnamed: 0,artid,date,function,issue,name,nationality,page,title
30622,3191430,1998,ecrivain,GDL1998_02_28,Michel Serres,français,36,Voslettres
22383,3186293,1998,ecrivain,GDL1998_02_27,Bohumil Hrabal,tchèque,38,Lagendadesspectaclesetconcertsdulundi9audimanc...
6136,849344,1998,ecrivain,JDG1998_02_24,Rachid Boudjedra,algérien,27,PourdéfendreLaPéricholedOffenbachheureusementq...
4345,4209929,1998,ecrivain,JDG1998_01_19,GUnter Grass,allemand,15,Beauxarts
26481,3106148,1998,ecrivain,GDL1998_02_18,Ernst Jiinger,allemand,27,Après102ansentregloireetpolémiqueslécrivainErn...


In [None]:
#Using the function to get the outputs
#Extract_function(allFiles).to_csv('csv_function_out', index=False)

# Extracting the information for pulses
In order to search within the set of pulses, we must disambiguate the functions that are 

Model:
"Adam Smith était économiste en 1780"

And we will cut it into:

"#AdamSmith #in #économiste #when #1789"

## 1.1. Mention treatment
In this part we will use the SPARQL outputs in order to make #mention pulses. The parentheses are showing that it is a generic parameter, meaning that the pulse will contain the information that is inside it. The pulses will be the following:


"#Mention #(Name) #in #(title_articleID_pagenumber)"

In [59]:
def MentionTreatment(Input):
    Input['Pulse'] = '#mention #' + Input['name'].str.replace('\W', '') + ' #in #' + Input['title'] + '_articleID' + Input['artid'] + '_page' + Input['page']
    return Input['Pulse']

In [61]:
#Applying the function and exporting the results as a CSV
MentionTreatment(Table).to_csv('mention_treated.csv', index=False)

## 1.2. Source treatment 
The format is:

"#in #(titre)_(artid)_(page) #in #(titre)_(artid) #in #(issue)  " 

In [108]:
def SourceTreatment(Input):
    Input['Pulse'] = '#in #' + Input['title'] + '_articleID' + Input['artid'] + '_page' + Input['page'] + ' #in #' + Input['title'] + '_articleID' + Input['page'] + ' #in #' + Input['issue']
    return Input['Pulse']

In [71]:
#Applying the function and exporting the results as a CSV
SourceTreatment(Table).to_csv('source_treated.csv', index=False)

## 1.3. Person's function
The format is:

"#(person) #in #(function) #where #(date)"

In [79]:
def PersonFunction(Input):
    Input['Pulse'] = '#' + Input['name'].str.replace('\W', '') + ' #in #' + Input['function'] + ' #when #' + Input['date']
    return Input['Pulse']

In [80]:
#Applying the function and exporting the results as a CSV
PersonFunction(Table).to_csv('person_function.csv', index=False)

## 1.4. Person's nationality
The format is: 

"#(person) #in #(nationality)"

In [75]:
def PersonNationality(Input):
    Input['Pulse'] = '#' + Input['name'].str.replace('\W', '') + '#in #' + Input['nationality']
    return Input['Pulse']

In [76]:
#Applying the function and exporting the results as a CSV
PersonNationality(Table).to_csv('person_nationality.csv', index=False)

## 2. Dealing with locations
##### Data wrangling
We first deal with the SPARQL outputs and then put it in a format that is more convenient to manipulate.

In [112]:
LocTable = pd.read_csv('Data/Locations.csv')

#Renaming columns
LocTable.columns = ['location', 'issue', 'date', 'title', 'artid', 'page' , 'year']

#Reshaping the issue
LocTable['issue'] = LocTable['issue'] + LocTable.date.str[:10]
LocTable['date'] = LocTable.date.str[:4]
LocTable['issue'] = LocTable['issue'].str.replace('-', '_')

#Converting the articleID and page number
LocTable['page'] = LocTable['page'].astype(str)
LocTable['artid'] = LocTable['artid'].apply(str)

#Keeping only alphabetical characters to be able to use the title as a hastags
LocTable['title'] = LocTable.title.str.replace('\W', '')

display(LocTable.head())

Unnamed: 0,location,issue,date,title,artid,page,year
0,Autriche,JDG1868_01_01,1868,UntitledArticle,2971954,2,1868
1,Autriche,JDG1882_01_01,1882,i,2796744,1,1882
2,Brésil,JDG1889_01_01,1889,UntitledArticle,5391187,4,1889
3,Marseille,JDG1936_01_01,1936,UntitledArticle,2279377,9,1936
4,Italie,JDG1950_01_01,1950,UntitledArticle,823799,14,1950


##### Pulse creation
In this part, we will create pulses indicating places that were mentioned in one article. The format is the following:

"#mention #(lieu) #in #(title_articleID_page)"

In [103]:
def LocationExtraction(Input):
    Input['Pulse'] = '#mention #' + Input['location'] + ' #in #' + Input['title'] + '_articleID' + Input['artid'] + '_page' + Input['page'] 
    return Input['Pulse']

In [104]:
#Applying the function and exporting the results as a CSV
LocationExtraction(LocTable).to_csv('Location_pulse.csv', index=False)

In [113]:
#Applying source function and exporting the results as a CSV
SourceTreatment(LocTable).to_csv('Location_Source.csv', index=False)

# 3. Copresence

Identifying the presence of two persons in the same article.

The format is:

"#copresence #(name)

In [118]:
CoFiles = glob.glob("Data/Copresence/*.csv")
display(CoFiles)

['Data/Copresence\\copres_violoniste.csv']

In [134]:
CoTable = pd.DataFrame(columns=['issue', 'date','title' , 'artid', 'page', 'name', 'function', 'nationality', 'name1', 'function1'])
K = 0
while (K < int(len(CoFiles))):
    #Creating a DataFrame and filling it with the excel's data
    df = pd.read_csv(CoFiles[K])
    CoTable = CoTable.append(df)
    K+=1  

#Dropping unecessary columns
CoTable = CoTable.drop(['function', 'nationality', 'function1'], axis=1)

#Dropping self matching
CoTable = CoTable[CoTable['name'] != CoTable['name1']]

#Dropping duplicates
CoTable = CoTable.drop_duplicates('name')

#Converting the articleID and page number
CoTable['page'] = CoTable['page'].astype(str)
CoTable['artid'] = CoTable['artid'].apply(str)

#Keeping only alphabetical characters to be able to use the title as a hastaga
CoTable['title'] = CoTable.title.str.replace('\W', '')


display(CoTable.head())

Unnamed: 0,issue,date,title,artid,page,name,name1
1,JDG,1945-11-19T05:00:00,esévénementsdetasemaine,5037612,5,Jacques Tliibaud,Victoria Hall
7,JDG,1955-09-24T05:00:00,ANYONRécitalGezaAnda,1259063,12,Dayy Erlih,Geza Anda
9,JDG,1945-02-26T05:00:00,esévénementsdelasemaine,5049696,3,Ginette Neveu,M. Eric Schmidt
16,JDG,1952-05-17T05:00:00,AuTribunaldeDivision5Extravagancesdunchefdesca...,2540038,2,Anja Ignatius,Antoine Buehler
18,JDG,1950-04-19T05:00:00,UntitledArticle,353078,2,Claude Paschoud,Rainier III


In [140]:
def CopresenceExtraction(Input):
    Input['Pulse'] = '#copresence #' + Input['name'].str.replace('\W', '') + ' #' + Input['name1'] + ' #in #'  + Input['title'] + '_articleID' + Input['artid'] + '_page' + Input['page']
    return Input['Pulse']

In [142]:
#Applying the function and exporting the results as a CSV
CopresenceExtraction(CoTable).to_csv('Copresence_pulse.csv', index=False)