# Initialisation

In [1]:
from bs4 import BeautifulSoup 
# Minimal SGML-like sample with two <TEXT> records, used to try out BeautifulSoup.
s = """<!DOCTYPE lewis SYSTEM "lewis.dtd"> 
 <TEXT> 
 <TITLE>One</TITLE> 
 <BODY>Sample One</BODY> 
 </TEXT> 
 <TEXT> 
 <TITLE>Two</TITLE> 
 <BODY>Sample Two</BODY> 
 </TEXT>""" 
# Parse the sample with the built-in 'html.parser' backend.
soup = BeautifulSoup(s,'html.parser') 
# Collect every element found under the lowercase name 'body' (displayed as the cell output).
soup.find_all('body')

[<body>Sample One</body>, <body>Sample Two</body>]

In [2]:
import glob

In [3]:
# Print every entry of the current working directory that matches '*'.
print(glob.glob('*'))

['dic_expes_spams.json', 'Scikitlearn_Avance.ipynb', 'spamham.csv', '1001964.pdf', 'entraînement', 'V1_rapport', 'Methodo_projet.pdf', 'Notes.odt', 'AlexaneJOUGLAR_V1.zip', 'Test_Classification_amelioration.ipynb', 'tweets.ipynb', 'tweet_sample(2).ipynb', 'tweet_sample.ipynb', 'Spam_ham', 'TD_formatPDF', 'Reuters', 'Reuters.ipynb', 'Tweets(2).ipynb', 'Tweets', 'SpamHam.ipynb', 'report_classifier=perceptron_dataset=spam.txt', 'Notes_notions.odt']


In [4]:
# Open the first Reuters SGML file in text mode.
# NOTE(review): this handle is neither read nor closed in the cells shown here.
f = open('Reuters/reuters21578/reut2-000.sgm')

In [5]:
# Read the raw SGML file as bytes and decode each line as UTF-8, silently
# dropping undecodable bytes. The `with` block guarantees the file is closed.
with open('Reuters/reuters21578/reut2-000.sgm', 'rb') as sgm_file:
    lines = [raw_line.decode('utf-8', 'ignore') for raw_line in sgm_file]
# Every decoded line still ends with its own line terminator, so the lines are
# concatenated as-is; joining with '\n' would double every line break.
xml_data = ''.join(lines)

In [6]:
import re
# Timestamp layout to recognise: day-MON-year, a space, then HH:MM:SS.fraction.
date_pattern = re.compile(
    r'[0-9]+-[A-Z]{3}-[0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]+'
)
# Sanity check on a sample line that carries trailing free text after the timestamp.
sample_date_line = '9-APR-1987 00:00:00.00    # date added by S Finch as guesswork'
date_pattern.search(sample_date_line).group(0)

'9-APR-1987 00:00:00.00'

In [7]:
from lxml import etree
from lxml import objectify
from datetime import datetime


In [77]:
class ReutersSGMLParser():
    """A helper class for parsing Reuters-21578 SGML files.

    Every ``<REUTERS>...</REUTERS>`` document found in the input is parsed with
    ``lxml.objectify`` and its fields are appended to a dict of parallel lists
    (one list per column, see :meth:`empty_row`) that can be handed directly
    to ``pandas.DataFrame``.
    """
    def __init__(self):
        # numeric character references ("&#3;", "&#;", ...) are removed before parsing
        self.bad_char_pattern = re.compile(r"&#\d*;")
        # one match per document; re.S lets '.' span the line breaks inside a document
        self.document_pattern = re.compile(r"<REUTERS.*?<\/REUTERS>", re.S)
        # e.g. "26-FEB-1987 15:01:01.79"
        self.date_pattern = re.compile(r'[0-9]+-[A-Z]{3}-[0-9]{4} *[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]+')

    def empty_row(self):
        """Return a fresh dict mapping every column name to an empty list.

        The key order defines the column order of the resulting dataframe.
        """
        rows = {
            'old_id'     : [],
            'new_id'     : [],
            'has_topics' : [],
            'date'       : [],
            'topics'     : [],
            'places'     : [],
            'people'     : [],
            'orgs'       : [],
            'exchanges'  : [],
            'companies'  : [],
            'title'      : [],
            'dateline'   : [],
            'body'       : [],
            'author'     : [],
            'cgi_split'  : [],
            'lewis_split': []
        }
        return rows

    def get_text(self, elem, tagname, d_tag = False):
        """Return the stripped text of child ``tagname`` of ``elem``, or ''.

        Parameters:
            elem: parsed element whose children are reachable as attributes.
            tagname: name of the child tag to read.
            d_tag: when True, the text is read from the ``D`` child of the tag
                instead of from the tag itself.

        Returns '' when the tag is missing, compares equal to '', has no ``D``
        child (with ``d_tag``), or carries no text.

        NOTE(review): with ``d_tag`` only the ``D`` attribute of the tag is
        read; if a tag holds several <D> entries, presumably only the first
        one is kept - confirm this is intended.
        """
        txt = getattr(elem, tagname, '')
        if txt == '':
            return ''
        if d_tag:
            # a tag without any <D> child is treated like an empty tag
            txt = getattr(txt, 'D', None)
            if txt is None:
                return ''
        # .text may be None for an element without character data
        return (txt.text or '').strip()

    def get_date(self, elem, tagname):
        """Return the ``datetime`` stored in child ``tagname`` of ``elem``, or ''.

        The first substring matching ``self.date_pattern`` is parsed; any
        trailing free text in the tag is ignored. '' is returned (after
        printing a message) when no such substring exists or when it does not
        denote a valid date.
        """
        date_elem = getattr(elem, tagname, '')
        if date_elem == '':
            return ''
        date_str = (date_elem.text or '').strip()
        match = self.date_pattern.search(date_str)
        if match is None:
            print('Cannot find date patter in: %s' % date_str)
            return ''
        try:
            return datetime.strptime(match.group(0), '%d-%b-%Y %H:%M:%S.%f')
        except ValueError:
            # the regex is looser than strptime (unknown month name, day 99, ...)
            print('Cannot parse date: %s' % date_str)
            return ''

    def parse_header(self, rows, doc):
        """Append the attributes of the <REUTERS> header of ``doc`` to ``rows``.

        e.g. <REUTERS TOPICS="YES" LEWISSPLIT="TRAIN" CGISPLIT="TRAINING-SET" OLDID="5544" NEWID="1">

        ``has_topics`` is True only when the TOPICS attribute is "YES"; any
        other value (or a missing attribute) yields False. Missing string
        attributes are stored as ''.
        """
        items = dict(doc.items())
        rows[   'old_id'  ].append(items.get('OLDID', ''))
        rows[   'new_id'  ].append(items.get('NEWID', ''))
        # bool() of any non-empty string is True, so compare against "YES" explicitly
        rows[ 'has_topics'].append(items.get('TOPICS', '').strip().upper() == 'YES')
        rows[ 'cgi_split' ].append(items.get('CGISPLIT', ''))
        rows['lewis_split'].append(items.get('LEWISSPLIT', ''))

    def parse_string(self, str):
        """Parse all <REUTERS> documents contained in the text ``str``.

        Returns a dict of lists shaped like :meth:`empty_row`, with one entry
        per document in every list. The parameter name shadows the builtin
        ``str``; it is kept for compatibility with keyword callers.
        """
        # remove bad characters
        xml_data = self.bad_char_pattern.sub('', str)
        # find documents
        documents = self.document_pattern.findall(xml_data)
        # columns whose value is read from the <D> child of the tag
        d_columns = (
            ('topics', 'TOPICS'),
            ('places', 'PLACES'),
            ('people', 'PEOPLE'),
            ('orgs', 'ORGS'),
            ('exchanges', 'EXCHANGES'),
            ('companies', 'COMPANIES'),
        )
        # columns read from direct children of the <TEXT> tag
        text_columns = (
            ('title', 'TITLE'),
            ('dateline', 'DATELINE'),
            ('body', 'BODY'),
            ('author', 'AUTHOR'),
        )
        # parse document's elements
        rows = self.empty_row()
        for doc in documents:
            xml_doc = objectify.fromstring(doc)
            # parse attributes of the header
            self.parse_header(rows, xml_doc)
            # read DATE
            rows['date'].append(self.get_date(xml_doc, 'DATE'))
            for column, tag in d_columns:
                rows[column].append(self.get_text(xml_doc, tag, True))
            # read the TEXT tag
            text = xml_doc.TEXT
            for column, tag in text_columns:
                rows[column].append(self.get_text(text, tag))
        return rows

    def parse(self, path):
        """Parse a file from the Reuters dataset.

        The file is read as UTF-8. If strict decoding fails, a message is
        printed and the file is read again with undecodable bytes dropped.
        Returns the result of :meth:`parse_string` on the file content.
        """
        try:
            with open(path, 'r', encoding="utf-8") as sgm_file:
                xml_data = sgm_file.read()
        except UnicodeDecodeError:
            print('Failed to read %s as utf-8' % path)
            # same read, but skipping the invalid bytes; reading the text in one
            # piece keeps the original line breaks (no doubled newlines)
            with open(path, 'r', encoding="utf-8", errors='ignore') as sgm_file:
                xml_data = sgm_file.read()
        return self.parse_string(xml_data)

In [78]:
import pandas as pd

In [79]:
# Parse the first Reuters file and gather its documents column by column.
parser = ReutersSGMLParser()
data = parser.empty_row()
# File names carry a three-digit index (the first file is reut2-000.sgm).
for path in ['Reuters/reuters21578/reut2-000.sgm']:
    # parse current document
    rows = parser.parse(path)
    # append rows into dataset (extend in place instead of rebuilding each list)
    for key in data:
        data[key].extend(rows[key])

# One dataframe column per key, in the order defined by empty_row().
df = pd.DataFrame(data, columns=data.keys())
df.head()

Unnamed: 0,old_id,new_id,has_topics,date,topics,places,people,orgs,exchanges,companies,title,dateline,body,author,cgi_split,lewis_split
0,5544,1,True,1987-02-26 15:01:01.790,cocoa,el-salvador,,,,,BAHIA COCOA REVIEW,"SALVADOR, Feb 26 -",Showers continued throughout the week in\nthe ...,,TRAINING-SET,TRAIN
1,5545,2,True,1987-02-26 15:02:20.000,,usa,,,,,STANDARD OIL <SRD> TO FORM FINANCIAL UNIT,"CLEVELAND, Feb 26 -",Standard Oil Co and BP North America\nInc said...,,TRAINING-SET,TRAIN
2,5546,3,True,1987-02-26 15:03:27.510,,usa,,,,,TEXAS COMMERCE BANCSHARES <TCB> FILES PLAN,"HOUSTON, Feb 26 -",Texas Commerce Bancshares Inc's Texas\nCommerc...,,TRAINING-SET,TRAIN
3,5547,4,True,1987-02-26 15:07:13.720,,usa,,,,,TALKING POINT/BANKAMERICA <BAC> EQUITY OFFER,"LOS ANGELES, Feb 26 -",BankAmerica Corp is not under\npressure to act...,"by Janie Gabbett, Reuters",TRAINING-SET,TRAIN
4,5548,5,True,1987-02-26 15:10:44.600,grain,usa,,,,,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE,"WASHINGTON, Feb 26 -",The U.S. Agriculture Department\nreported the ...,,TRAINING-SET,TRAIN


In [84]:
# Distinct values of the `places` column, in order of first appearance.
# dict.fromkeys de-duplicates in linear time while preserving insertion order,
# unlike a repeated `in` test against a growing list.
pays = list(dict.fromkeys(df.places))
print(pays)
print(len(pays))

['usa', 'italy', 'canada', 'uk', 'switzerland', 'kenya', 'zimbabwe', 'belgium', 'sweden', 'netherlands', 'ussr', 'south-africa', 'syria', 'venezuela', 'chile', 'japan', 'nigeria', '', 'denmark', 'uganda', 'finland', 'west-germany', 'colombia', 'peru', 'brazil', 'france', 'iran', 'singapore', 'greece', 'spain', 'dominican-republic', 'sri-lanka', 'luxembourg', 'jamaica', 'poland', 'argentina', 'turkey', 'austria', 'australia', 'taiwan', 'bolivia', 'suriname', 'ivory-coast', 'new-zealand', 'hong-kong', 'bahrain', 'china', 'south-korea', 'indonesia', 'thailand', 'pakistan', 'india', 'papua-new-guinea', 'uae', 'egypt', 'tanzania', 'zambia', 'iraq', 'morocco', 'cuba', 'mexico', 'costa-rica', 'ecuador', 'algeria', 'malaysia', 'bangladesh', 'philippines', 'saudi-arabia', 'angola', 'ghana', 'yemen-arab-republic', 'hungary', 'portugal', 'israel', 'jordan', 'bulgaria', 'norway', 'burma', 'haiti', 'aruba', 'yugoslavia', 'cyprus', 'madagascar', 'ethiopia', 'nicaragua', 'guyana', 'east-germany', 'ku

# Elargissement

In [52]:
# Indices 1 through 21 (inclusive) of the Reuters files to load.
Liste = list(range(1, 22))

In [81]:
# Parse every Reuters file listed in `Liste` and gather all documents
# column by column into a single dataset.
parser = ReutersSGMLParser()
data = parser.empty_row()
for j in Liste:
    # File names use a zero-padded three-digit index (e.g. reut2-017.sgm),
    # so the index must be formatted with %03d rather than a fixed "0" prefix.
    path = 'Reuters/reuters21578/reut2-%03d.sgm' % j
    # parse current document
    rows = parser.parse(path)
    # append rows into dataset (extend in place instead of rebuilding each list)
    for key in data:
        data[key].extend(rows[key])

# One dataframe column per key, in the order defined by empty_row().
df = pd.DataFrame(data, columns=data.keys())
df.head()

Failed to read Reuters/reuters21578/reut2-017.sgm as utf-8
Cannot find date patter in: 31-MAR-1987 605:12:19.12


Unnamed: 0,old_id,new_id,has_topics,date,topics,places,people,orgs,exchanges,companies,title,dateline,body,author,cgi_split,lewis_split
0,16321,1001,True,1987-03-03 09:18:21.260,,usa,,,,,SANDOZ PLANS WEEDKILLER JOINT VENTURE IN USSR,"BASLE, March 3 -",Sandoz AG said it planned a joint venture\nto ...,,TRAINING-SET,TRAIN
1,16322,1002,True,1987-03-03 09:19:31.960,,usa,,,,,TAIWAN REJECTS TEXTILE MAKERS EXCHANGE RATE PLEA,"TAIPEI, March 3 -",Central bank governor Chang Chi-cheng\nrejecte...,,TRAINING-SET,TRAIN
2,16323,1003,True,1987-03-03 09:20:23.320,earn,usa,,,,,NATIONAL FSI INC <NFSI> 4TH QTR LOSS,"DALLAS, March 3 -",Shr loss six cts vs profit 19 cts\n Net los...,,TRAINING-SET,TRAIN
3,16324,1004,True,1987-03-03 09:21:39.110,,usa,,,,,OCCIDENTAL <OXY> OFFICIAL RESIGNS,"LOMBARD, Ill., March 3 -","MidCon Corp, a subsidiary of\nOccidental Petro...",,TRAINING-SET,TRAIN
4,16325,1005,True,1987-03-03 09:25:48.880,,italy,,,,,ITALY'S BNL TO ISSUE 120 MLN DLR CONVERTIBLE BOND,"ROME, March 3 -",Italy's state-owned <Banca Nazionale del\nLavo...,,TRAINING-SET,TRAIN


In [54]:
# Display the `body` column of the dataframe.
df.body

0        Sandoz AG said it planned a joint venture\nto ...
1        Central bank governor Chang Chi-cheng\nrejecte...
2        Shr loss six cts vs profit 19 cts\n    Net los...
3        MidCon Corp, a subsidiary of\nOccidental Petro...
4        Italy's state-owned <Banca Nazionale del\nLavo...
                               ...                        
20573    The Japan/India-Pakistan-Gulf/Japan\nshipping ...
20574    The Soviet Union's industrial output is\ngrowi...
20575    Six black miners have been killed\nand two inj...
20576    The prospect of a dominant alliance of\nsocial...
20577    The American Stock Exchange said it has\nintro...
Name: body, Length: 20578, dtype: object

In [55]:
# There are 20578 articles.

In [56]:
# Display the `places` column of the dataframe.
df.places

0                 usa
1                 usa
2                 usa
3                 usa
4               italy
             ...     
20573       hong-kong
20574            ussr
20575    south-africa
20576     switzerland
20577             usa
Name: places, Length: 20578, dtype: object

In [85]:
# Distinct values of the `places` column, in order of first appearance.
# dict.fromkeys de-duplicates in linear time while preserving insertion order,
# unlike a repeated `in` test against a growing list.
pays = list(dict.fromkeys(df.places))
print(pays)
print(len(pays))

['usa', 'italy', 'canada', 'uk', 'switzerland', 'kenya', 'zimbabwe', 'belgium', 'sweden', 'netherlands', 'ussr', 'south-africa', 'syria', 'venezuela', 'chile', 'japan', 'nigeria', '', 'denmark', 'uganda', 'finland', 'west-germany', 'colombia', 'peru', 'brazil', 'france', 'iran', 'singapore', 'greece', 'spain', 'dominican-republic', 'sri-lanka', 'luxembourg', 'jamaica', 'poland', 'argentina', 'turkey', 'austria', 'australia', 'taiwan', 'bolivia', 'suriname', 'ivory-coast', 'new-zealand', 'hong-kong', 'bahrain', 'china', 'south-korea', 'indonesia', 'thailand', 'pakistan', 'india', 'papua-new-guinea', 'uae', 'egypt', 'tanzania', 'zambia', 'iraq', 'morocco', 'cuba', 'mexico', 'costa-rica', 'ecuador', 'algeria', 'malaysia', 'bangladesh', 'philippines', 'saudi-arabia', 'angola', 'ghana', 'yemen-arab-republic', 'hungary', 'portugal', 'israel', 'jordan', 'bulgaria', 'norway', 'burma', 'haiti', 'aruba', 'yugoslavia', 'cyprus', 'madagascar', 'ethiopia', 'nicaragua', 'guyana', 'east-germany', 'ku

In [61]:
# Plain Python list holding the value of `places` for every row, in row order.
lieu = list(df.places)

In [62]:
# Display the `body` column of the dataframe.
df.body

0        Sandoz AG said it planned a joint venture\nto ...
1        Central bank governor Chang Chi-cheng\nrejecte...
2        Shr loss six cts vs profit 19 cts\n    Net los...
3        MidCon Corp, a subsidiary of\nOccidental Petro...
4        Italy's state-owned <Banca Nazionale del\nLavo...
                               ...                        
20573    The Japan/India-Pakistan-Gulf/Japan\nshipping ...
20574    The Soviet Union's industrial output is\ngrowi...
20575    Six black miners have been killed\nand two inj...
20576    The prospect of a dominant alliance of\nsocial...
20577    The American Stock Exchange said it has\nintro...
Name: body, Length: 20578, dtype: object

In [65]:
# Plain Python list holding the value of `body` for every row, in row order.
texte = list(df.body)
# Report how many entries were collected and confirm the container type.
print(len(texte))
print(type(texte))

20578
<class 'list'>
