In [19]:
import os
import csv
import pandas as pd

In [20]:
# Root of the exported old-site page tree that the cells below walk.
folder = './pages/'

# Accumulators used by later cells:
#   fn_list     - paths of revision files
#   dn_list     - directory paths found under `folder`
#   d_revisions - not referenced anywhere else in the visible notebook
fn_list, dn_list, d_revisions = [], [], []

In [21]:
# Obtain the full, sorted list of directories from the old site.
# dn_list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every directory to the list.
dn_list = sorted(
    os.path.join(dirpath, d)
    for (dirpath, dirnames, _filenames) in os.walk(folder)
    for d in dirnames
)

In [22]:
# For every directory whose path contains 'revisions', collect the files
# beneath it and keep the one that sorts last (taken to be the last revision).
# fn_list is rebuilt here so that re-running the cell cannot duplicate entries.
# NOTE(review): this is a substring test on the whole path, so a page whose own
# name contains "revisions" would match as well - confirm that cannot happen.
fn_list = []
for d in dn_list:
    if "revisions" not in d:
        continue
    revision_files = sorted(
        os.path.join(dirpath, f)
        for (dirpath, _dirnames, filenames) in os.walk(d)
        for f in filenames
    )
    # An empty revisions directory has no "last" file; skip it instead of
    # failing with IndexError on revision_files[-1].
    if revision_files:
        fn_list.append(revision_files[-1])
fn_list.sort()

In [23]:
# Create a table with all the relevant data from the last revision file:
# one [directory, file text] row per path in fn_list.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used; consider pinning it once the encoding of the export is confirmed.
content_table = []
for fn in fn_list:
    dirname = os.path.dirname(fn)
    # Read through a context manager so every file handle is closed as soon as
    # its text has been read (the handles were previously left open).
    with open(fn, 'r') as revision_file:
        file_text = revision_file.read()
    content_table.append([dirname, file_text])

In [24]:
# Convert table to CSV
# with open("output.csv", "w", newline="") as output:
#     writer = csv.writer(output)
#     writer.writerows(content_table)

# Note if needed: 
# os.path.dirname(os.path.dirname(fn))  # directory of directory of file

In [25]:
# Create a Pandas dataframe with one row per collected revision file: the
# directory it lives in and its text.
# The column names are supplied at construction time so that an empty
# content_table still yields an (empty) two-column frame instead of failing
# with a length mismatch on a later column assignment.
content_df = pd.DataFrame.from_records(content_table, columns=["file_path", "content"])
display(content_df)

Unnamed: 0,file_path,content
0,./pages/2019(2d)02(2d)21(2d)initial(2d)jersey(...,= 2019-02-21 Initial Jersey Meetup =\n\n17:30 ...
1,./pages/2019(2d)03(2d)26(2d)march(2d)jersey(2d...,= 2019-03-23 March Jersey Meetup =\n\n17:30 Di...
2,./pages/2019(2d)04(2d)24(2d)april(2d)jersey(2d...,## page was renamed from 2019-04-24-march-jers...
3,./pages/2019(2d)04(2d)24(2d)march(2d)jersey(2d...,Page moved to correct URL: \n\nhttps://cipug.o...
4,./pages/2019(2d)05(2d)may(2d)jersey(2d)meetup\...,= 2019-05-23 March Jersey Meetup =\n\n17:30 - ...
5,./pages/2019(2d)06(2d)june(2d)jersey(2d)meetup...,= 2019-06-26 June Jersey Meetup =\n\n17:30 - U...
6,./pages/2019(2d)07(2d)july(2d)jersey(2d)meetup...,= 2019-07 July Jersey Meetup =\n\n17:30 - Unit...
7,./pages/2019(2d)08(2d)august(2d)jersey(2d)meet...,= 2019-08-29 August Jersey Meetup =\n\n17:30 -...
8,./pages/2019(2d)09(2d)september(2d)jersey(2d)m...,= 2019-09-?? September Jersey Meetup =\n\n17:3...
9,./pages/2019(2d)10(2d)october(2d)jersey(2d)mee...,= 2019-10-24 October Jersey Meetup =\n\n17:30 ...


### Interpreting the data.

Row index 0:10 are the notes from the 10 meetings to date. The Content column data is marked up in the format from the old site. We can parse this data and reformat it for the new site. For example, '\n' (new line) can be replaced with the equivalent in HTML (`<br />`), or paragraphs can be wrapped in `<p>`…`</p>` if that is how they are rendered in the new site. The headers wrapped in '=' could be replaced with header tags.

We will be able to drop the following rows [10, 13, 16] as the content is only relevant to the old site (@rcwd to confirm). Row 3 is dropped as well, because it only contains a "Page moved to correct URL" notice.  

In [26]:
# Drop the rows that are only relevant to the old site; row 3 holds nothing
# but a "Page moved to correct URL" notice.
# The rows are addressed by index label rather than by position: this cell
# rebinds content_df, so a positional drop would remove a different set of rows
# every time the cell is re-run. On the freshly built frame the labels and the
# positions coincide. errors="ignore" makes a repeat run a no-op once the
# labels are gone (labels that never existed are skipped silently as well).
content_df = content_df.drop(index=[3, 10, 13, 16], errors="ignore")
display(content_df)

Unnamed: 0,file_path,content
0,./pages/2019(2d)02(2d)21(2d)initial(2d)jersey(...,= 2019-02-21 Initial Jersey Meetup =\n\n17:30 ...
1,./pages/2019(2d)03(2d)26(2d)march(2d)jersey(2d...,= 2019-03-23 March Jersey Meetup =\n\n17:30 Di...
2,./pages/2019(2d)04(2d)24(2d)april(2d)jersey(2d...,## page was renamed from 2019-04-24-march-jers...
4,./pages/2019(2d)05(2d)may(2d)jersey(2d)meetup\...,= 2019-05-23 March Jersey Meetup =\n\n17:30 - ...
5,./pages/2019(2d)06(2d)june(2d)jersey(2d)meetup...,= 2019-06-26 June Jersey Meetup =\n\n17:30 - U...
6,./pages/2019(2d)07(2d)july(2d)jersey(2d)meetup...,= 2019-07 July Jersey Meetup =\n\n17:30 - Unit...
7,./pages/2019(2d)08(2d)august(2d)jersey(2d)meet...,= 2019-08-29 August Jersey Meetup =\n\n17:30 -...
8,./pages/2019(2d)09(2d)september(2d)jersey(2d)m...,= 2019-09-?? September Jersey Meetup =\n\n17:3...
9,./pages/2019(2d)10(2d)october(2d)jersey(2d)mee...,= 2019-10-24 October Jersey Meetup =\n\n17:30 ...
11,./pages/Guernsey\revisions,= Guernsey =\n\nNothing here yet. \n\nIf you'r...


In [27]:
# Add in the (initially empty) columns for the new model.
# DataFrame.insert modifies content_df in place and raises ValueError when the
# column is already present, so existing columns are skipped to keep this cell
# re-runnable.
new_model_columns = ["date", "place", "agenda", "notes", "resources", "attended"]
for position, column_name in enumerate(new_model_columns, start=2):
    if column_name not in content_df.columns:
        content_df.insert(loc=position, column=column_name, value="")

display(content_df)

Unnamed: 0,file_path,content,date,place,agenda,notes,resources,attended
0,./pages/2019(2d)02(2d)21(2d)initial(2d)jersey(...,= 2019-02-21 Initial Jersey Meetup =\n\n17:30 ...,,,,,,
1,./pages/2019(2d)03(2d)26(2d)march(2d)jersey(2d...,= 2019-03-23 March Jersey Meetup =\n\n17:30 Di...,,,,,,
2,./pages/2019(2d)04(2d)24(2d)april(2d)jersey(2d...,## page was renamed from 2019-04-24-march-jers...,,,,,,
4,./pages/2019(2d)05(2d)may(2d)jersey(2d)meetup\...,= 2019-05-23 March Jersey Meetup =\n\n17:30 - ...,,,,,,
5,./pages/2019(2d)06(2d)june(2d)jersey(2d)meetup...,= 2019-06-26 June Jersey Meetup =\n\n17:30 - U...,,,,,,
6,./pages/2019(2d)07(2d)july(2d)jersey(2d)meetup...,= 2019-07 July Jersey Meetup =\n\n17:30 - Unit...,,,,,,
7,./pages/2019(2d)08(2d)august(2d)jersey(2d)meet...,= 2019-08-29 August Jersey Meetup =\n\n17:30 -...,,,,,,
8,./pages/2019(2d)09(2d)september(2d)jersey(2d)m...,= 2019-09-?? September Jersey Meetup =\n\n17:3...,,,,,,
9,./pages/2019(2d)10(2d)october(2d)jersey(2d)mee...,= 2019-10-24 October Jersey Meetup =\n\n17:30 ...,,,,,,
11,./pages/Guernsey\revisions,= Guernsey =\n\nNothing here yet. \n\nIf you'r...,,,,,,


In [28]:
# pip install markdown2

In [29]:
# Use the content of the row at position 2 as the sample page for the parsing
# experiments in the following cells.
test_ml = content_df['content'].iloc[2]

In [30]:
# Show the raw markup of the sample page.
print(test_ml)

## page was renamed from 2019-04-24-march-jersey-meetup
= 2019-04-24 March Jersey Meetup =

17:30 - Unit 13, Le Capelain House, Castle Quay

== Notes ==

11 attendees! We're cooking on gas now!

Huge thanks to CIPUG founding member Phil for providing space for this month's meet up.

General intros for new attendees.

Adrian demoed his initial version of his personal financial tracking project.

Rob demoed the Adafruit Pyportal 

== Resources ==

 * [[https://anvil.works/|Anvil Drag & Drop Python]]
 * [[https://pypi.org/project/cooked-input/|Cooked Input]]
 * [[https://getbootstrap.com/|Bootstrap]]
 * [[https://getuikit.com/|UIKit]]
 * [[https://www.polymer-project.org/|Polymer]]
 * [[https://webflow.com/|Webflow]]
 * [[https://blog.getpelican.com/|Pelican Static Site Generator]]
 * [[https://www.adafruit.com/micropython|MicroPython on Adafruit]]


=== Not covered (held till next month?) ===

 * Docker
 * Neural Net Course Review
 * How to contribute to Open Source



In [31]:
# markdown2 is a third-party package; it has to be installed in the kernel's
# environment before this import can succeed.
from markdown2 import Markdown
# Converter instance reused by the following cell.
markdowner = Markdown()

ModuleNotFoundError: No module named 'markdown2'

In [17]:
# Convert the sample page's markup to HTML with the markdown2 converter.
test_html = markdowner.convert(test_ml)

NameError: name 'markdowner' is not defined

In [18]:
# Show the HTML produced for the sample page.
print(test_html)

NameError: name 'test_html' is not defined

## Data cleansing the previous meetup data...

Here is the format of the table which we need to populate by cleaning up, separating and formatting the data...

![Screenshot%202019-11-21%2008.17.59.png](attachment:Screenshot%202019-11-21%2008.17.59.png)

Using a bit of regular expression to get the job done...
First build out a test case with an example event entry (test_html)...

In [14]:
import re

In [15]:
# Get rid of the stuff we don't want: whole <h2>...</h2> elements, newlines,
# '=' characters and opening <p> tags are removed one pattern after another.
pattern_list = ['<h2>(.*?)</h2>','[\n]', '[=]', '<p>']
test_html_c = test_html
for unwanted in pattern_list:
    test_html_c = re.sub(unwanted, "", test_html_c)
print(test_html_c)

NameError: name 'test_html' is not defined

In [18]:
# Split the cleaned HTML at every closing </p> or </ul> tag and strip the
# whitespace around each piece, giving a list of strings.
# NOTE: test_html_c is rebound from a string to a list here, so the cell that
# builds the string has to be run again before this one can be re-run.
test_html_l = re.split('</p>|</ul>', test_html_c)
test_html_c = [piece.strip() for piece in test_html_l]
print(test_html_c)

['2019-04-24 March Jersey Meetup', '17:30 - Unit 13, Le Capelain House, Castle Quay', 'Notes', "11 attendees! We're cooking on gas now!", "Huge thanks to CIPUG founding member Phil for providing space for this month's meet up.", 'General intros for new attendees.', 'Adrian demoed his initial version of his personal financial tracking project.', 'Rob demoed the Adafruit Pyportal', 'Resources', '<ul><li>[[https://anvil.works/|Anvil Drag &amp; Drop Python]]</li><li>[[https://pypi.org/project/cooked-input/|Cooked Input]]</li><li>[[https://getbootstrap.com/|Bootstrap]]</li><li>[[https://getuikit.com/|UIKit]]</li><li>[[https://www.polymer-project.org/|Polymer]]</li><li>[[https://webflow.com/|Webflow]]</li><li>[[https://blog.getpelican.com/|Pelican Static Site Generator]]</li><li>[[https://www.adafruit.com/micropython|MicroPython on Adafruit]]</li>', 'Not covered (held till next month?)', '<ul><li>Docker</li><li>Neural Net Course Review</li><li>How to contribute to Open Source</li>', '']


In [19]:
# Index the bits that we need in the notes and resources...
# list.index raises ValueError if the "Notes" or "Resources" entry is absent.

notes_i = test_html_c.index("Notes")
print("notes index: ", notes_i)
resources_i = test_html_c.index("Resources")
print("resources index: ", resources_i)

# The entries strictly between the "Notes" and "Resources" entries.
notes_i_content = test_html_c[notes_i+1:resources_i]

# The single entry immediately after the "Resources" entry.
resources_i_content = test_html_c[resources_i+1]

notes index:  2
resources index:  8


In [20]:
# Rebuild the notes as one HTML string: an entry that starts with <ul> gets its
# closing </ul> back, every other entry is wrapped in <p>...</p> again.
notes = "".join(
    entry + "</ul>" if entry.startswith("<ul>") else "<p>" + entry + "</p>"
    for entry in notes_i_content
)
print(notes)

<p>11 attendees! We're cooking on gas now!</p><p>Huge thanks to CIPUG founding member Phil for providing space for this month's meet up.</p><p>General intros for new attendees.</p><p>Adrian demoed his initial version of his personal financial tracking project.</p><p>Rob demoed the Adafruit Pyportal</p>


In [30]:
# Show the resources entry. The link-conversion cell rebinds
# resources_i_content, so what is printed here depends on whether that cell
# has already been run.
print(resources_i_content)

<ul><li><a href="https://anvil.works/">Anvil Drag &amp; Drop Python</a></li><li><a href="https://pypi.org/project/cooked-input/">Cooked Input</a></li><li><a href="https://getbootstrap.com/">Bootstrap</a></li><li><a href="https://getuikit.com/">UIKit</a></li><li><a href="https://www.polymer-project.org/">Polymer</a></li><li><a href="https://webflow.com/">Webflow</a></li><li><a href="https://blog.getpelican.com/">Pelican Static Site Generator</a></li><li><a href="https://www.adafruit.com/micropython">MicroPython on Adafruit</a></li>


In [22]:
# Convert the wiki-style links in the resources entry into HTML anchors:
#   [[url|label]] -> <a href="url">label</a>
#   [[url]]       -> <a href="url">url</a>
# Each link is matched as a whole, so a '|', '[[' or ']]' that is not part of a
# complete [[...]] link is left untouched (plain str.replace rewrote every
# occurrence and produced a broken tag for links without a label).
def _wiki_link_to_anchor(link):
    """Return the HTML anchor for one matched [[url|label]] wiki link."""
    url = link.group(1)
    label = link.group(2) or url  # a missing or empty label falls back to the URL
    return '<a href="' + url + '">' + label + '</a>'

resources_i_content = re.sub(
    r'\[\[([^\]|]+)(?:\|(.*?))?\]\]', _wiki_link_to_anchor, resources_i_content
)
# Splitting on </ul> consumed the closing tag of the list, so put it back.
resources = resources_i_content+"</ul>"
print(resources)

<ul><li><a href="https://anvil.works/">Anvil Drag &amp; Drop Python</a></li><li><a href="https://pypi.org/project/cooked-input/">Cooked Input</a></li><li><a href="https://getbootstrap.com/">Bootstrap</a></li><li><a href="https://getuikit.com/">UIKit</a></li><li><a href="https://www.polymer-project.org/">Polymer</a></li><li><a href="https://webflow.com/">Webflow</a></li><li><a href="https://blog.getpelican.com/">Pelican Static Site Generator</a></li><li><a href="https://www.adafruit.com/micropython">MicroPython on Adafruit</a></li></ul>


In [23]:
# # Format the resources to create hyperlinks...
# # Crap need a bit of clever regex here... [[https://anvil.works/|Anvil Drag &amp; Drop Python]] to 
# # <a href=“https://anvil.works”>Anvil Drag &amp; Drop Python</a>
# # \[{:2}


# new_string = re.sub(r'\[{2}' ,'<a href="', resources_i_content)
# new_string = re.sub('\|', '">', new_string)
# resources = re.sub(r'\]{2}', '</a>', new_string)
# resources = resources+"</ul>"
# print(resources)

In [24]:
# Get the info from the indices: for the sample page the first entry of the
# split list is the page heading and the second is the time/venue line.
header, place = test_html_c[0], test_html_c[1]

In [25]:
# Extract the meetup date from the heading.
# Some headings in the content table carry no full YYYY-MM-DD date
# (e.g. "2019-07 July Jersey Meetup"), so `date` is None when no date is found
# instead of failing with AttributeError on match.group().

from datetime import datetime as dt

match = re.search(r'\d{4}-\d{2}-\d{2}', header)
date = dt.strptime(match.group(), '%Y-%m-%d').date() if match else None

In [26]:
# Find the no of attendees (thanks Dave)...
# `attendees` is the matched run of digits as a string, or None when the notes
# do not contain "<number> attendees" (subscripting a failed search raised
# TypeError before).

attendees_match = re.search(r"(\d+)\sattendees",notes)
attendees = attendees_match[1] if attendees_match else None
print(attendees)

11


In [27]:
# Summarise the fields extracted for the new model.
# Intended formats - date: date, place/agenda/notes/resources: string,
# attended: int.
print("date: ", date, "\n")
print("place:", place, "\n")
agenda = ""  # String format; nothing is extracted for the agenda in this notebook
print("agenda: ", agenda, "\n")
print("notes: ", notes, "\n")
print("resources: ", resources, "\n")
# `attended` used to be hard-coded to 0 while the value printed below came from
# `attendees`; derive it from the parsed count so the variable and the output
# agree. Falls back to 0 when no count was found.
attended = int(attendees) if attendees is not None else 0
print("attended: ", attended)

date:  2019-04-24 

place: 17:30 - Unit 13, Le Capelain House, Castle Quay 

agenda:   

notes:  <p>11 attendees! We're cooking on gas now!</p><p>Huge thanks to CIPUG founding member Phil for providing space for this month's meet up.</p><p>General intros for new attendees.</p><p>Adrian demoed his initial version of his personal financial tracking project.</p><p>Rob demoed the Adafruit Pyportal</p> 

resources:  <ul><li><a href="https://anvil.works/">Anvil Drag &amp; Drop Python</a></li><li><a href="https://pypi.org/project/cooked-input/">Cooked Input</a></li><li><a href="https://getbootstrap.com/">Bootstrap</a></li><li><a href="https://getuikit.com/">UIKit</a></li><li><a href="https://www.polymer-project.org/">Polymer</a></li><li><a href="https://webflow.com/">Webflow</a></li><li><a href="https://blog.getpelican.com/">Pelican Static Site Generator</a></li><li><a href="https://www.adafruit.com/micropython">MicroPython on Adafruit</a></li></ul> 

attended:  11


In [24]:
# Export to JSON
# content_df.to_json(r'old_site_data.json')