# Deserialization of the RDF graph from Le Temps database

This code aims at creating pulses for the ClioWire platform.

##### Note:
Because of the confidential nature of the data, all of the items from the dataset are stored locally. 

In [12]:
#Importing the necessary libraries
import pandas as pd
import numpy as np
import glob
import re

In [2]:
#Getting all the CSV files obtained from SPARQL queries.
#glob returns paths in arbitrary order, so they are sorted to keep the run reproducible.
allFiles = sorted(glob.glob("Data/*.csv"))

In [3]:
#Showing the matched paths to check which CSV files will be loaded
display(allFiles)

['Data\\ecrivain.csv']

## Sorting the SPARQL outputs into a well-structured DataFrame

In [82]:
#Columns the rest of the notebook reads from Table ('page' is additionally expected in the CSV files)
expected_columns = ['issue', 'date', 'function', 'name', 'nationality', 'title', 'titleID']

#Loading every CSV file into its own DataFrame
frames = []
for path in allFiles:
    #The file name, without folder or extension, is used as the 'function' value.
    #Splitting on both separators avoids relying on the length of the "Data\" prefix.
    title = re.sub(r'\.csv$', '', re.split(r'[\\/]', path)[-1])
    df = pd.read_csv(path)
    df['function'] = title
    #Every row receives the same hard-coded ID (NOTE(review): '666' looks like a placeholder - confirm)
    df['titleID'] = '666'
    frames.append(df)

if not frames:
    raise FileNotFoundError('No CSV file found in Data/: nothing to build the table from')

#Concatenating once: DataFrame.append was removed in pandas 2.0 and copied the whole table at each iteration
Table = pd.concat(frames, sort=False)

#Making sure every expected column exists, even when no CSV file provides it
for column in expected_columns:
    if column not in Table.columns:
        Table[column] = np.nan

#Extracting issue date: the first 10 characters of 'date' are appended to 'issue', the first 4 are kept as 'date'
Table['issue'] = Table['issue'] + Table['date'].str[:10]
Table['date'] = Table['date'].str[:4]

#Sorting values
Table = Table.sort_values('date', ascending=False)

#Keeping only word characters to be able to use the title as a hashtag
#(raw pattern + regex=True: str.replace is literal by default since pandas 2.0)
Table['title'] = Table['title'].str.replace(r'\W', '', regex=True)

#Rearranging the page number: keeping what precedes the first period (e.g. "39.0" -> "39"),
#without truncating page numbers that have more than two digits
Table['page'] = Table['page'].astype(str).str.split('.').str[0]

In [84]:
#Previewing the first rows of the cleaned table
display(Table.head())

Unnamed: 0,date,function,issue,name,nationality,page,title,titleID
2438,1998,ecrivain,GDL1998-02-21,Ernst JUnger,allemand,39,LucBondyrevientauThéâtredeVidy,666
832,1998,ecrivain,JDG1998-02-21,Ernst JUnger,allemand,39,LucBondyrevientauThéâtredeVidy,666
2588,1998,ecrivain,GDL1998-02-18,Ernst Jiinger,allemand,27,Après102ansentregloireetpolémiqueslécrivainErn...,666
49,1998,ecrivain,JDG1998-02-26,Madeleine Santschi,suisse,15,LacadémicienMichelSerresobtientgaindecausecont...,666
2879,1998,ecrivain,GDL1998-02-24,Rachid Boudjedra,algérien,27,PourdéfendreLaPéricholedOffenbachheureusementq...,666


In [None]:
#Using the function to get the outputs and writing them to 'csv_function_out' without the index
#NOTE(review): Extract_function is not defined in the visible part of this notebook -
#confirm where it comes from before running this cell on a fresh kernel.
Extract_function(allFiles).to_csv('csv_function_out', index=False)

### Extraction information of pulses
In order to search within the set of pulses, we must disambiguate the functions that are 

Model:
"Adam Smith était économiste en 1780"

And we will cut it into:
#AdamSmith #in #économiste #when #1780

In [None]:
#Building one pulse per row: a French sentence followed by the #name, #function and #issue hashtags
#regex=False keeps ' ', '-' and '.' literal whatever the pandas version
name_tag = (
    Table['name']
    .str.replace(' ', '', regex=False)
    .str.replace('-', '', regex=False)
    .str.replace('.', '', regex=False)
)
function_tag = Table['function'].str.replace(' ', '', regex=False)
sentence = Table['name'] + ' était ' + Table['function'] + ' en ' + Table['date']

#The final replace strips hyphens from the whole pulse (sentence included), not only from the issue date
Pulse = pd.Series(sentence + ' #' + name_tag + ' #' + function_tag + ' #' + Table['issue']).str.replace('-', '', regex=False)
display(Pulse.head())

## 1.1. Mention treatment
In this part we will use the SPARQL outputs in order to make #mention pulses. The parentheses are showing that it is a generic parameter, meaning that the pulse will contain the information that is inside it. The pulses will be the following:


#mention #(name) #in #(title_articleID_pagenumber)

In [93]:
def MentionTreatment(Input):
    """Build the '#mention' pulses: '#mention #<name> #in #<title>_<titleID>_<page>'.

    Parameters
    ----------
    Input : pandas.DataFrame
        Must provide the 'name', 'title', 'titleID' and 'page' columns; they are
        concatenated as strings, so they are assumed to hold text values.

    Returns
    -------
    pandas.Series
        The generated pulses. They are also stored in Input['Pulse'], overwriting
        any previous content of that column.
    """
    # Raw pattern + regex=True: since pandas 2.0 str.replace is literal by default,
    # which would leave the names untouched instead of stripping non-word characters.
    name_tag = Input['name'].str.replace(r'\W', '', regex=True)
    Input['Pulse'] = '#mention #' + name_tag + ' #in #' + Input['title'] + '_' + Input['titleID'] + '_' + Input['page']
    return Input['Pulse']

In [94]:
#Applying the function and exporting the results as a CSV
#MentionTreatment(Table).to_csv('mention_treated', index=False)

## 1.2. Source treatment 
The format is:

#in #(title) #(issue)

In [95]:
def SourceTreatment(Input):
    """Build the source pulses: '#in #<title> #<issue>'.

    Reads the 'title' and 'issue' columns of Input, stores the result in
    Input['Pulse'] (overwriting that column) and returns it.
    """
    title_part = '#in #' + Input['title']
    issue_part = ' #' + Input['issue']
    Input['Pulse'] = title_part + issue_part
    return Input['Pulse']

In [96]:
#Applying the function and exporting the results as a CSV
#SourceTreatment(Table).to_csv('source_treated', index=False)

## 1.3. Person's function
The format is:

#(person) #in #(function) #when #(date)

In [101]:
def PersonFunction(Input):
    """Build the function pulses: '#<person> #in #<function> #when #<date>'.

    Parameters
    ----------
    Input : pandas.DataFrame
        Must provide the 'name', 'function' and 'date' columns; they are
        concatenated as strings, so they are assumed to hold text values.

    Returns
    -------
    pandas.Series
        The generated pulses. They are also stored in Input['Pulse'], overwriting
        any previous content of that column.
    """
    # Raw pattern + regex=True: since pandas 2.0 str.replace is literal by default.
    person_tag = Input['name'].str.replace(r'\W', '', regex=True)
    # A space precedes '#in' and '#when' so that each hashtag stays a separate token.
    Input['Pulse'] = '#' + person_tag + ' #in #' + Input['function'] + ' #when #' + Input['date']
    return Input['Pulse']

In [102]:
#Applying the function and exporting the results as a CSV
#PersonFunction(Table).to_csv('person_function', index=False)

## 1.4. Person's nationality
The format is: 

#(person) #in #(nationality)

In [103]:
def PersonNationality(Input):
    """Build the nationality pulses: '#<person> #in #<nationality>'.

    Parameters
    ----------
    Input : pandas.DataFrame
        Must provide the 'name' and 'nationality' columns; they are
        concatenated as strings, so they are assumed to hold text values.

    Returns
    -------
    pandas.Series
        The generated pulses. They are also stored in Input['Pulse'], overwriting
        any previous content of that column.
    """
    # Raw pattern + regex=True: since pandas 2.0 str.replace is literal by default.
    person_tag = Input['name'].str.replace(r'\W', '', regex=True)
    # A space precedes '#in' so that the person and '#in' stay separate hashtags.
    Input['Pulse'] = '#' + person_tag + ' #in #' + Input['nationality']
    return Input['Pulse']

In [104]:
#Applying the function and exporting the results as a CSV
#PersonNationality(Table).to_csv('person_nationality', index=False)

## 2. Dealing with locations
##### Data wrangling
We first deal with the SPARQL outputs and then put them in a format that is more convenient to manipulate.

In [None]:
#Empty table for the location data, with one column per field
LocTable = pd.DataFrame(
    columns=[
        'location',
        'title',
        'titleID',
    ],
)

##### Pulse creation
In this part, we will create pulses indicating places that were mentioned in one article. The format is the following:

#mention #(location) #(title_articleID)

In [105]:
def LocationExtraction(Input):
    """Build the location pulses: '#mention #<location> #<title>_<titleID>'.

    Reads the 'location', 'title' and 'titleID' columns of Input, stores the
    result in Input['Pulse'] (overwriting that column) and returns it.
    """
    article_ref = Input['title'] + '_' + Input['titleID']
    location_part = '#mention #' + Input['location'] + ' #'
    Input['Pulse'] = location_part + article_ref
    return Input['Pulse']

In [None]:
#Applying the function and exporting the results as a CSV
#LocationExtraction(LocTable).to_csv('Location', index=False)