In [53]:
import json
import os
import bz2
import io
from bz2 import BZ2File
import re
import numpy as np
from datetime import date, timedelta


### Transform the data



The following command is here to throw useless datas in files. Since we already reduced the files, there is no need to do the compression again.

In [54]:
# ! for f in data/*[0-9].jsonl.bz2; do bzcat $f | jq -c '{id: .id, type: .tp, date: .d, title: .t, fulltext: .ft}' | bzip2 > "{f%.jsonl.bz2}-reduced.jsonl.bz2" ; done

Lists the files in the current folder:

In [55]:
ls -la data/

total 718544
drwxr-xr-x  15 aslam  staff       480  6 mar 00:06 [34m.[m[m/
drwxrwxrwx  33 aslam  staff      1056  6 mar 00:06 [30m[43m..[m[m/
-rwxrwxrwx   1 aslam  staff  23795907  3 mar 12:31 [31mJDG-1980-reduced.jsonl.bz2[m[m*
-rwxrwxrwx   1 aslam  staff  21521939  3 mar 12:32 [31mJDG-1981-reduced.jsonl.bz2[m[m*
-rwxrwxrwx   1 aslam  staff  22822888  3 mar 12:33 [31mJDG-1982-reduced.jsonl.bz2[m[m*
-rwxrwxrwx   1 aslam  staff  25263118  3 mar 12:34 [31mJDG-1983-reduced.jsonl.bz2[m[m*
-rwxrwxrwx   1 aslam  staff  23839962  3 mar 12:35 [31mJDG-1984-reduced.jsonl.bz2[m[m*
-rwxrwxrwx   1 aslam  staff  22984620  3 mar 12:36 [31mJDG-1985-reduced.jsonl.bz2[m[m*
-rwxrwxrwx   1 aslam  staff  33642024  3 mar 12:37 [31mJDG-1986-reduced.jsonl.bz2[m[m*
-rwxrwxrwx   1 aslam  staff  29476045  3 mar 12:38 [31mJDG-1987-reduced.jsonl.bz2[m[m*
-rwxrwxrwx   1 aslam  staff  33200405  3 mar 12:39 [31mJDG-1988-reduced.jsonl.bz2[m[m*
-rwxrwxrwx   1 aslam  staff  

### Reading newspaper archive data

Reminder: the data is already 'clean' and the files at hand contains only the following information:
- id
- date
- title
- type (article or advertisement)
- fulltext

Since our reduced files are in data/, we need to have the path to them:

In [56]:
input_dir = "data/" # update with your path 

In [57]:
# a helper function to get the lines from am archive
def read_jsonlines(bz2_file):
    text = bz2_file.read().decode('utf-8')
    for line in text.split('\n'):
        if line != '':
            yield line

In [58]:
# a helper function to see if we have all the keywords contained in a list
# returns true if all elements in arguments are positif
def all_pos(myList):
    return all(item > 0 for item in myList)

In [59]:
# this helper function prints the text description, and the occurences corresponding to the keywords
# This is only displaying content in the notebook 
# DEBUGGING PURPOSES (and for visualisation)
def print_arrays(text_description, keywords, occurences):
    myString = text_description + " // "
    length = len(keywords)
    
    for i in range(0, length):
        myString +=  keywords[i] + ": " + str(occurences[i]) + ", "
    
    print(myString)

The following function takes:
- article to study
- keywords (list of keywords)

and return:
- the number of occurences of these words (sum)

In [60]:
# this function deals with an article and return the occurences (sum for all the keywords given) --> or condition
# STAT PURPOSES
def study(article, keywords):
    json_article = json.loads(article)
    full_text_str = json_article["fulltext"].encode('ascii','ignore')
    
    flag = False # this flag will be True if at least one of the keyword is in the text
    
    length = len(keywords)
    full_occurences = [0] * length
    
    for index in range(0, length):
        occurences = len(re.findall(keywords[index], full_text_str, re.IGNORECASE))
    
        #title = json_article["id"].encode('ascii','ignore')

        if(occurences > 0):
            flag = True
            full_occurences[index] = occurences
            
    #if flag:
            #print_arrays(title, keywords, full_occurences)
            # Instead of print_arrays, we should call a function that will decide if all content are good or not
            # a similar function will return sum of list (one "or" condition, and the and the "and" condition will be checked with all_pos)
    return sum(full_occurences)



The following function takes:
    - dates (array of dates)
    - myKeys (list of list of keywords: list of and condition, and or on each element of the list)
    - andConditionVoc: which contains sum-up words for each element of myKeys


returns:
    - all the articles that verifies these conditions
    

In [61]:
# this function returns the articles that verifies three condition:
#   - include in dates (dates can be list of years or list of "YYYY-MM-DD" )
#   - myKeys (list of list of keywords: list of and condition, and or on each element of the list):
        # [["Gorbatchev", "Gorbatchov"], ["Russie", "URSS", "Moscou", "soviétique"], ["politique", "économie", "PCUS", "Perestroika", "Glasnot", "Glasnost"]]
        # ([x, y, z] --> x AND y AND z)
        # x = [x1, x2, x3] --> x1 OR x2 OR x3
#   - andConditionVoc, which contains sum-up words for each element of myKeys
        # ["gorbatchev", "russians", "politics"]
    
    
def getArticles(dates, myKeys, andConditionVoc):
    articles = []
    for archive in os.listdir(input_dir):
        print(archive)

        # take only the transformed archives
        if ("reduced" in archive):

            # open the archive
            f = BZ2File(os.path.join(input_dir, archive), 'r')

            # get the list of articles it contains (= a json object on each line)
            articles = list(read_jsonlines(f))
            # load the first 100 articles as json and access their attributes
            # print the size of each year
            # print("size: " + str(len(articles)))
            for a in articles:

                json_article = json.loads(a)
                # id of the article
                infos = json_article["id"].encode('ascii','ignore')

                occurencesPerDates = []
                
                for date in dates:
                    occurencesOneYear = [] * len(andConditionVoc)

                    if str(date) in infos:
                        occurences = []
                        for keywords in myKeys:
                            occurences.append(study(a, keywords))

                        occurencesOneYear += occurences
                        
                        if(all_pos(occurences)):
                            # the following line prints details if needed
                            # print_arrays(infos, andConditionVoc, occurences)
                            articles.append(a)
    return articles



Sample code for articles retrieving

In [11]:
# CODE TO OBTAINS ALL THE ARTICLES
# defining the keywords 
# ([x, y, z] --> x AND y AND z)
# x = [x1, x2, x3] --> x1 OR x2 OR x3
myKeys = [["Gorbatchev", "Gorbatchov"], ["Russie", "URSS", "Moscou", "soviétique"], ["politique", "économie", "PCUS", "Perestroika", "Glasnot", "Glasnost"]]
# to be able to print with a single-word a bunch of words
andConditionVoc = ["gorbatchev", "russians", "politics"]

dates = range(1981, 1990)

# get the articles
print(len(getArticles(dates, myKeys, andConditionVoc)))

# TODO: save them into a file compatible for IRAMUTEQ

JDG-1987-reduced.jsonl.bz2


KeyboardInterrupt: 

In [62]:
# STAT PURPOSE
# this function takes dates in input and a list of keywords
# it will return the occurences depending the dates and the keywords
# optimization: instead of checking the dates on each articles, we may prefer to just look at the archive year. 
# But in our case, we also want to have specific dates, so we must iterate over the articles
def read_data(dates, keywords):
    results = np.asarray([[0] * len(keywords)] * len(dates))
    
    for archive in os.listdir(input_dir):
        print(archive)
        # take only the transformed archives
        # TODO: put "reduced" instead of "1987"
        if ("reduced" in archive):

            # open the archive
            f = BZ2File(os.path.join(input_dir, archive), 'r')
            # get the list of articles it contains (= a json object on each line)
            articles = list(read_jsonlines(f))

            for a in articles:

                json_article = json.loads(a)
                # id of the article
                infos = json_article["id"].encode('ascii','ignore')

                for date_index in range(0, len(dates) ): # we need the date_index (date itself not enough)
                    if str(dates[date_index]) in infos:
                        occurences = np.asarray([0] * len(keywords))
                        for keyword_index in range(0, len(keywords)):
                            occurences[keyword_index] += study(a, keywords[keyword_index])
                            
                        results[date_index] += occurences

    return results



In [63]:
# USAGE

keywords = ["gorbatchev", "russes", "politics"]

# dates = range(1981, 1990)
dates = ["1987-12-12", "1981-01-01"]

# print(read_data(dates, keywords))

# return vector type = [[x1, y1, z1], [x2, y2, z2]]
# 1 (or 2) --> date // x, y, z --> keywords




In [64]:
# Corpus
# This function is mostly coming from Group H (Secret Bancaire)
def export(filename, list_articles):
    return "TODO"

In [65]:
# myKeys contain list of list of keywords ( [["Gorbatchev", "Gorbatchov"], ["Russie", "URSS", "Moscou", "soviétique"], ["politique", "économie", "PCUS", "Perestroika", "Glasnot", "Glasnost"]])
# STAT PURPOSES (Main function)
# this will do a sum on each bunch of keywords (like OR), and if all_pos results --> accepted text

def getOccurences(dates, myKeys):
    result = np.asarray([[0] * len(dates)] * len(myKeys))

    for key_index in range(0, len(myKeys)):
        keywords = myKeys[key_index]
        occurences_per_year = [sum(x) for x in read_data(dates, keywords)]
        for date_index in range(0, len(dates)):
            result[key_index][date_index] = occurences_per_year[date_index]
        # print(result)
    
    return result
    



In [66]:
#gives in string all the dates between two dates
def string_dates(date1, date2):
    result = []
    delta = date2 - date1
    
    for i in range(delta.days + 1):
        result.append(str(d1+timedelta(i)))
        
    return result

In [67]:
def writeFile(variableName, variable):
    f = open("variables.txt", "a+")
    f.write("\n" + variableName + "=" + str(variable))
    f.close()

sample_result = [[0,0,0,0,44741811,48535429,77840706,55341914,35041383],
                 [0,0,0,0,81038398,87802262,140905671,100246182,63635214],
                 [0,0,0,0,144729853,157289618, 251367646,178893618,113408514]]
# writeFile("lol", sample_result)




## Execution Statistiques

In [68]:
# defining the keywords 
# ([x, y, z] --> x AND y AND z)
# x = [x1, x2, x3] --> x1 OR x2 OR x3
myKeys = [["Gorbatchev", "Gorbatchov"], ["Russie", "URSS", "Moscou", "soviétique"], ["politique", "économie", "PCUS", "Perestroika", "Glasnot", "Glasnost"]]
# to be able to print with a single-word a bunch of words
andConditionVoc = ["gorbatchev", "russians", "politique"]

dates = range(1981, 1990)
# dates = ["1987-12-12", "1988"]


corpus1_occurences = getOccurences(dates, myKeys)
writeFile("corpus1_occurences", corpus1_occurences)
print(corpus1_occurences)
print("corpus 1")

JDG-1982-reduced.jsonl.bz2
JDG-1987-reduced.jsonl.bz2
JDG-1983-reduced.jsonl.bz2
JDG-1986-reduced.jsonl.bz2
JDG-1980-reduced.jsonl.bz2
JDG-1985-reduced.jsonl.bz2
JDG-1981-reduced.jsonl.bz2
JDG-1984-reduced.jsonl.bz2
JDG-1991-reduced.jsonl.bz2
JDG-1988-reduced.jsonl.bz2
JDG-1990-reduced.jsonl.bz2
JDG-1989-reduced.jsonl.bz2
JDG-1992-reduced.jsonl.bz2
JDG-1982-reduced.jsonl.bz2
JDG-1987-reduced.jsonl.bz2
JDG-1983-reduced.jsonl.bz2
JDG-1986-reduced.jsonl.bz2
JDG-1980-reduced.jsonl.bz2
JDG-1985-reduced.jsonl.bz2
JDG-1981-reduced.jsonl.bz2
JDG-1984-reduced.jsonl.bz2
JDG-1991-reduced.jsonl.bz2
JDG-1988-reduced.jsonl.bz2
JDG-1990-reduced.jsonl.bz2
JDG-1989-reduced.jsonl.bz2
JDG-1992-reduced.jsonl.bz2
JDG-1982-reduced.jsonl.bz2
JDG-1987-reduced.jsonl.bz2
JDG-1983-reduced.jsonl.bz2
JDG-1986-reduced.jsonl.bz2
JDG-1980-reduced.jsonl.bz2
JDG-1985-reduced.jsonl.bz2
JDG-1981-reduced.jsonl.bz2
JDG-1984-reduced.jsonl.bz2
JDG-1991-reduced.jsonl.bz2
JDG-1988-reduced.jsonl.bz2
JDG-1990-reduced.jsonl.bz2
J

In [None]:

myKeys = [["Reagan"], ["Etats-Unis", "USA", "Amerique", "Washington"], ["politique", "économie", "pershing"]]
# to be able to print with a single-word a bunch of words
andConditionVoc = ["Reagan", "USA", "politique"]

dates = range(1981, 1992)
# dates = ["1987-12-12", "1988"]

corpus2_occurences = getOccurences(dates, myKeys)
writeFile("corpus2_occurences", corpus2_occurences)
print(corpus2_occurences)
print("corpus 2")

JDG-1982-reduced.jsonl.bz2
JDG-1987-reduced.jsonl.bz2
JDG-1983-reduced.jsonl.bz2
JDG-1986-reduced.jsonl.bz2
JDG-1980-reduced.jsonl.bz2
JDG-1985-reduced.jsonl.bz2
JDG-1981-reduced.jsonl.bz2
JDG-1984-reduced.jsonl.bz2


In [24]:
myKeys = [["Sommet", "Geneve"], ["Reagan", "Gorbatchev", "Gorbatchov", "Moscou", "Washington"]]
# to be able to print with a single-word a bunch of words
andConditionVoc = ["SommetGeneve", "voc"]

d1 = date(1985, 11, 13)
d2 = date(1985, 11, 25)
# dates = ["1985-11-13", "1985-11-25"]
dates = string_dates(d1, d2)


corpus3_occurences = getOccurences(dates, myKeys)
writeFile("corpus3_occurences", corpus3_occurences)
print(corpus3_occurences)
print("corpus 3")


JDG-1987-reduced.jsonl.bz2
GDL-1987-reduced.jsonl.bz2


KeyboardInterrupt: 

In [26]:
myKeys = [["Sommet", "Washington"], ["Reagan", "Gorbatchev", "Gorbatchov", "Moscou", "Washington"]]
# to be able to print with a single-word a bunch of words
andConditionVoc = ["SommetWashington", "voc"]

d1 = date(1987, 12, 02)
d2 = date(1987, 12, 14)
# dates = ["1987-12-02", "1987-12-14"]
dates = string_dates(d1, d2)

corpus4_occurences = getOccurences(dates, myKeys)
writeFile("corpus4_occurences", corpus4_occurences)
print(corpus4_occurences)
print("corpus 4")

corpus 4


In [27]:
myKeys = [["Gorbatchev", "Gorbatchov"], ["Russie", "URSS", "Moscou", "soviétique"], ["politique", "économie", "PCUS", "Perestroika", "Glasnot", "Glasnost"],
          ["Reagan"], ["Etats-Unis", "USA", "Amerique", "Washington"]
         ]

corpus5_occurences = getOccurences(dates, myKeys)
writeFile("corpus5_occurences", corpus5_occurences)
print("corpus 5")



corpus 5


## Plot

In [33]:
# Ploting result with a Trial with a given result
sample_result = [[0,0,0,0,44741811,48535429,77840706,55341914,35041383],
                 [0,0,0,0,81038398,87802262,140905671,100246182,63635214],
                 [0,0,0,0,144729853,157289618, 251367646,178893618,113408514]]


dates = range(1981, 1990)
dates = [1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989]

andConditionVoc = ["gorbatchev", "russians", "politics"]

# pd.DataFrame(dates, sample_result)

pd.DataFrame({'gorbatchev': sample_result[0], 'russians': sample_result[1], 'politics': sample_result[2]}, index=dates)

pd.DataFrame({andConditionVoc[0]: sample_result[0], andConditionVoc[1]: sample_result[1], andConditionVoc[2]: sample_result[2]}, index=dates)


df.plot(kind="bar").legend()




NameError: name 'pd' is not defined