In [1]:
import os
import csv
import pandas as pd

In [2]:
folder = './pages/'

fn_list = []
dn_list = []
d_revisions = []

In [3]:
# Obtain full list of directories from the old site
for (dirpath, dirnames, filenames) in os.walk(folder):
    for d in dirnames:
        dn = os.path.join(dirpath, d)
        dn_list.append(dn)
dn_list.sort()

In [4]:
# If 'revisions' is in a directory, get the last revision file with dir path
for d in dn_list:
    if "revisions" in d:
        temp_list = []
        for (dirpath, dirnames, filenames) in os.walk(d):
            for f in filenames:
                fn = os.path.join(dirpath, f)
                temp_list.append(fn)
        temp_list.sort()
        fn_list.append(temp_list[-1])
fn_list.sort()

In [5]:
# Create a table with all the relevant data from the last revision file
content_table = []
for fn in fn_list:
    (dirname, filename) = os.path.split(fn)
    open_file = open(fn, 'r')
    file_text = open_file.read()
    table_row = [dirname, file_text]
    content_table.append(table_row)

In [6]:
# Convert table to CSV
with open("output.csv", "w", newline="") as output:
    writer = csv.writer(output)
    writer.writerows(content_table)

# Note if needed: 
# os.path.dirname(os.path.dirname(fn))  # directory of directory of file

In [7]:
# Create a Pandas dataframe
content_df = pd.DataFrame.from_records(content_table)
content_df.columns = ["File_Path", "Content"]
display(content_df)

Unnamed: 0,File_Path,Content
0,./pages/2019(2d)02(2d)21(2d)initial(2d)jersey(...,= 2019-02-21 Initial Jersey Meetup =\n\n17:30 ...
1,./pages/2019(2d)03(2d)26(2d)march(2d)jersey(2d...,= 2019-03-23 March Jersey Meetup =\n\n17:30 Di...
2,./pages/2019(2d)04(2d)24(2d)april(2d)jersey(2d...,## page was renamed from 2019-04-24-march-jers...
3,./pages/2019(2d)04(2d)24(2d)march(2d)jersey(2d...,Page moved to correct URL: \n\nhttps://cipug.o...
4,./pages/2019(2d)05(2d)may(2d)jersey(2d)meetup\...,= 2019-05-23 March Jersey Meetup =\n\n17:30 - ...
5,./pages/2019(2d)06(2d)june(2d)jersey(2d)meetup...,= 2019-06-26 June Jersey Meetup =\n\n17:30 - U...
6,./pages/2019(2d)07(2d)july(2d)jersey(2d)meetup...,= 2019-07 July Jersey Meetup =\n\n17:30 - Unit...
7,./pages/2019(2d)08(2d)august(2d)jersey(2d)meet...,= 2019-08-29 August Jersey Meetup =\n\n17:30 -...
8,./pages/2019(2d)09(2d)september(2d)jersey(2d)m...,= 2019-09-?? September Jersey Meetup =\n\n17:3...
9,./pages/2019(2d)10(2d)october(2d)jersey(2d)mee...,= 2019-10-24 October Jersey Meetup =\n\n17:30 ...


### Interpreting the data.

Row index 0:10 are the notes from the 10 meetings to date. The Content column data is marked up in the format from the old site. We can parse through this data to reformat into the new site. For example '/n' for new line can be replaced with the equivelant in html (<br />) or (<p> <p/>)if this is how it is rendered in the new site. The headers wrapped in '=' could be replaced with header tags.

We will be able to drop the following rows [10, 13, 16] as the content is only relevant to the old site (@rcwd to confirm).  

In [8]:
content_df.drop(content_df.index[[10,13,16]])

Unnamed: 0,File_Path,Content
0,./pages/2019(2d)02(2d)21(2d)initial(2d)jersey(...,= 2019-02-21 Initial Jersey Meetup =\n\n17:30 ...
1,./pages/2019(2d)03(2d)26(2d)march(2d)jersey(2d...,= 2019-03-23 March Jersey Meetup =\n\n17:30 Di...
2,./pages/2019(2d)04(2d)24(2d)april(2d)jersey(2d...,## page was renamed from 2019-04-24-march-jers...
3,./pages/2019(2d)04(2d)24(2d)march(2d)jersey(2d...,Page moved to correct URL: \n\nhttps://cipug.o...
4,./pages/2019(2d)05(2d)may(2d)jersey(2d)meetup\...,= 2019-05-23 March Jersey Meetup =\n\n17:30 - ...
5,./pages/2019(2d)06(2d)june(2d)jersey(2d)meetup...,= 2019-06-26 June Jersey Meetup =\n\n17:30 - U...
6,./pages/2019(2d)07(2d)july(2d)jersey(2d)meetup...,= 2019-07 July Jersey Meetup =\n\n17:30 - Unit...
7,./pages/2019(2d)08(2d)august(2d)jersey(2d)meet...,= 2019-08-29 August Jersey Meetup =\n\n17:30 -...
8,./pages/2019(2d)09(2d)september(2d)jersey(2d)m...,= 2019-09-?? September Jersey Meetup =\n\n17:3...
9,./pages/2019(2d)10(2d)october(2d)jersey(2d)mee...,= 2019-10-24 October Jersey Meetup =\n\n17:30 ...


In [9]:
# Export to JSON
content_df.to_json(r'old_site_data.json')

In [11]:
content_df.iloc[0]["Content"]

'= 2019-02-21 Initial Jersey Meetup =\n\n17:30 Digital Jersey Hub - Edison Room (the board room)\n\n== Notes ==\n\n4 attendees.\n\nGeneral introductions and summary of Python experience to date.\n\nLack of visible Python activity compared to other cities / locations\n\nDiscussion of education - Scratch & Python. Agreement that, once established, the UG should look to assist & enable Python in education where possible.\n\nQuick chat on architectural patterns and how to structure our applications.\n\nAgreement that the UG should be kept quite informal, with presentations / speakers being exceptional events.\n\n== Resources ==\n\n * [[https://mode.com/|Mode.com]]\n * [[https://developers.google.com/web/tools/lighthouse/|Google Lighthouse]]\n * [[https://wiki.qt.io/Qt_for_Python|Pyside 2.0]]\n * [[https://www.edx.org/course/cs50s-introduction-computer-science-harvardx-cs50x|Harvard CS50]]\n * [[http://pythontutor.com/|Pythontutor.com]]\n * [[https://codingbat.com/python|Codingbat.com]]\n *

In [12]:
content_df.iloc[-1]["Content"]

'## Please edit system and help pages ONLY in the master wiki!\n## For more information, please see MoinMoin:MoinDev/Translation.\n##master-page:FrontPage\n#format wiki\n#language en\n#pragma section-numbers off\n= CIPUG Wiki =\n\nWhat is this wiki about?\n\nThe Channel Islands Python User Group!\n\nCurrently an unratified group but the aim is to cover Python development across the Channel Islands.\n\nMore [[about|about us]]\n\n== Sub Modules ==\n\n * [[Jersey]] - Jersey specific info and events\n * [[Guernsey]] - Guernsey specific info and events\n\n== Get Involved ==\n\nLooking for an account? Please see the [[RequestAccount|Request Account]] page for details.\n\nOtherwise please email rcwd@cipug.org or more info\n\n[[https://www.xkcd.com/353/|{{attachment:xkcd_python.png|I wrote 20 short programs in Python yesterday.  It was wonderful.  Perl, I\'m leaving you.}}]]\n\n[[https://www.xkcd.com/353/|xkcd: Python|target="_blank"]]\n'

In [13]:
content_df.iloc[6]["Content"]

'= 2019-07 July Jersey Meetup =\n\n17:30 - Unit 13, Le Capelain House, Castle Quay [[https://www.google.com/maps/@49.1824762,-2.1155164,20z|Map]]\n\n4 attendees, thanks to Phil for the use of his office (despite his not even being on the island!)\n\n== Notes ==\n\n * General Intros and chat\n * Discussion of Anvil performance, tracing using timing functions and why having database queries in a loop is a bad idea\n * Discussion of SQLness vs Excelness for structuring data\n * Quick demo of Docker and Python (code in the link below)\n * Discussion of MRM originated, CIPUG project to build a CMS for use on this site and by members\n * Demo of Github config for the same\n * Closing chat re data gathering tools and the possibility of a simple, common language to describe data\n\n== Resources ==\n\n * [[https://blog.jupyter.org/and-voil%C3%A0-f6a2c08a4a93|VoilÃ\xa0 - Jupyter Notebooks Dashboard]]\n * [[https://github.com/cipug/docker-demo|Docker demo code]]\n * [[https://github.com/cipug/lit