<a href="https://colab.research.google.com/github/AvantiShri/gcp_analysis/blob/main/DownloadGCPdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#make Google Drive visible to this Colab runtime; everything below reads and
# writes under /content/drive
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#extract the table from https://global-mind.org/pred_formal.html
#(the "create a text file for download" link does not appear to be working)

%cd /content/drive/MyDrive/GCP_data
!wget https://global-mind.org/pred_formal.html -O pred_formal.html

/content/drive/MyDrive/GCP_data
--2023-09-06 10:11:23--  https://global-mind.org/pred_formal.html
Resolving global-mind.org (global-mind.org)... 162.245.217.130
Connecting to global-mind.org (global-mind.org)|162.245.217.130|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘pred_formal.html’

pred_formal.html        [  <=>               ] 184.64K   505KB/s    in 0.4s    

2023-09-06 10:11:23 (505 KB/s) - ‘pred_formal.html’ saved [189067]



In [2]:
%cd /content/drive/MyDrive/GCP_data
#sanity check
!head pred_formal.html

/content/drive/MyDrive/GCP_data
<!DOCTYPE html>
<html lang="en-US"><head>
<!-- SSI pageheaders content -->
<meta charset="utf-8" />
<meta http-equiv="cleartype" content="on" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<meta name="author" content="Roger Nelson" />
<meta name="keywords" content="consciousness,group consciousness,Global Consciousness Project,Roger Nelson,GCP,resonance,global consciousness,synchronized consciousness,mind,world,global,gaia,anomalies,parapsychology,psi,random event,REG,RNG,subtle energy,millennium" />
<meta http-equiv="imagetoolbar" content="no" />
<meta name="Classification" content="science" />


In [2]:
#parse the html
from bs4 import BeautifulSoup

#read through a context manager so the file handle is closed as soon as the
# text is in memory; the page declares charset utf-8 in its <meta> tag, so
# decode it explicitly rather than relying on the locale default
with open("/content/drive/MyDrive/GCP_data/pred_formal.html", encoding="utf-8") as html_file:
  #NOTE(review): no parser is named, so bs4 picks whichever is installed; the
  # soup.body.table.tbody lookup below depends on that choice - confirm before
  # pinning one
  soup = BeautifulSoup(html_file.read())

#the 'recipe' and 'statistic' columns don't always get parsed correctly due
# to the break, but we can work around it
#rows is a list with one entry per <tr> of the first table's <tbody>; each
# entry lists, for every <td> in that row, its first child with trailing
# whitespace stripped
rows = [[td.contents[0].rstrip() for td in row.find_all("td") ]
        for row in soup.body.table.tbody.find_all("tr")]

In [9]:
import os
import glob
from datetime import datetime, timedelta
import pandas
import numpy as np
import json

#change to a directory to download the data into
%cd /content/drive/MyDrive/GCP_data/raw

#populate days_downloaded based on the files that are currently present in
# the directory
days_downloaded = set([x[11:21] for x in glob.glob("basket*.csv")])

exclude_events = ["283", #this is New Year Mean 2009 from 2008-12-31 to 2009-01-01,
                        # being excluded because data for 2008-12-29 (needed for control 3)
                        # is absent: https://global-mind.org/data/eggsummary/2008/
                  "387", # Libya Day 1 Rebels Take Tripoli, 2011-08-21; similar situation - data seems to be missing/empty: https://global-mind.org/data/eggsummary/2011/
                  "388", #Event on 2011-08-23, similar problem
                  "469", #Event on 11/05 which had only 54000 rows of data for the day
                  "470", #Event on 11/07 which includes the 11/05 data in the control timeperiods
                  ]

pick_up_from = 469 #in case execution paused and you need to pick up from this event onwards
force_rerun = [] #events to rerun the download for

included_events = 0

#fetch data for all the relevant days. We will stick them together later
for row in rows:
  if row[-1].startswith("Yes") and row[-3].startswith("Stouffer Z"): #only keep those rows that were used in analysis and used Stouffer Z
    start, end = row[2], row[3]
    start_datetime = datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
    end_datetime = datetime.strptime(end, '%Y-%m-%d %H:%M:%S')
    duration = 1 + (end_datetime - start_datetime).total_seconds() #in seconds

    print("\n------------------------------------")
    print(row[0], row[1], row[2], row[3]) #eventnum name start end
    print("Duration in seconds:", duration)
    print("------------------------------------")

    event_num = row[0]

    #filter for those durations ranging from 1hr to 1.5 days (upper limit for efficiency)
    if ((duration >= 60*60 and duration <= 36*60*60 and event_num not in exclude_events)):
      control1_datetime_start = start_datetime - timedelta(seconds=duration)
      control1_datetime_end = control1_datetime_start + timedelta(seconds=duration-1) #inclusive end
      control2_datetime_start = end_datetime + timedelta(seconds=1)
      control2_datetime_end = control2_datetime_start + timedelta(seconds=duration-1)

      control3_datetime_start = control1_datetime_start - timedelta(seconds=duration)
      control3_datetime_end = control3_datetime_start + timedelta(seconds=duration-1)
      control4_datetime_start = control2_datetime_end + timedelta(seconds=1)
      control4_datetime_end = control4_datetime_start + timedelta(seconds=duration-1)

      control5_datetime_start = control3_datetime_start - timedelta(seconds=duration)
      control5_datetime_end = control5_datetime_start + timedelta(seconds=duration-1)
      control6_datetime_start = control4_datetime_end + timedelta(seconds=1)
      control6_datetime_end = control6_datetime_start + timedelta(seconds=duration-1)

      print("Control1 start and end:", control1_datetime_start, control1_datetime_end)
      print("Control2 start and end:", control2_datetime_start, control2_datetime_end)
      print("Control3 start and end:", control3_datetime_start, control3_datetime_end)
      print("Control4 start and end:", control4_datetime_start, control4_datetime_end)
      print("Control5 start and end:", control5_datetime_start, control5_datetime_end)
      print("Control6 start and end:", control6_datetime_start, control6_datetime_end)

      included_events += 1

      #get all the days between control5 start and control6 end
      days = []
      start_day = datetime.strptime(control5_datetime_start.strftime('%Y-%m-%d'), '%Y-%m-%d')
      end_day = datetime.strptime(control6_datetime_end.strftime('%Y-%m-%d'), '%Y-%m-%d')
      day_to_add = start_day
      while True:
        if (day_to_add not in days):
          days.append(day_to_add.strftime('%Y-%m-%d'))
        if (day_to_add == end_day):
          break
        day_to_add = day_to_add + timedelta(days=1)
      print("Days:", days)

      #now download the relevant data
      for day in days:
        if (day not in days_downloaded) or (event_num in force_rerun):
          year = day[:4]
          filename = "basketdata-"+day
          !wget -r -nH --cut-dirs=3 --limit-rate=125k https://global-mind.org/data/eggsummary/{year}/{filename}.csv.gz
          !gunzip -f {filename}.csv.gz
          print("\nROWS:")
          !wc -l {filename}.csv
          days_downloaded.add(day)

      if int(event_num) >= pick_up_from:
        event_metadata = {}
        for spanname, startdt, enddt in [('test', start_datetime, end_datetime),
                                        ('control1', control1_datetime_start, control1_datetime_end),
                                        ('control2', control2_datetime_start, control2_datetime_end),
                                        ('control3', control3_datetime_start, control3_datetime_end),
                                        ('control4', control4_datetime_start, control4_datetime_end),
                                        ('control5', control5_datetime_start, control5_datetime_end),
                                        ('control6', control6_datetime_start, control6_datetime_end)]:
          numpy_outfile = '/content/drive/MyDrive/GCP_data/extracted/Event'+event_num+"_"+spanname+"_eggvalues.npy"
          if (os.path.exists(numpy_outfile)==False):
            #get all the days between control5 start and control6 end
            days = []
            start_day = datetime.strptime(startdt.strftime('%Y-%m-%d'), '%Y-%m-%d')
            end_day = datetime.strptime(enddt.strftime('%Y-%m-%d'), '%Y-%m-%d')
            day_to_add = start_day
            while True:
              if (day_to_add not in days):
                days.append(day_to_add.strftime('%Y-%m-%d'))
              if (day_to_add == end_day):
                break
              day_to_add = day_to_add + timedelta(days=1)
            print("Days:", days)
            #Now extract the data with pandas
            df = pandas.concat([pandas.read_csv("basketdata-"+day+".csv", skiprows=8)
                                          for day in days])
            print("Concatenated df has",len(df),"rows")
            filtered_rows = df[((df['gmtime'] >= startdt.timestamp())
                                & (df['gmtime'] <= enddt.timestamp()))]
            print("Num filtered rows in "+spanname+": ", len(filtered_rows))
            egg_values = filtered_rows.iloc[:, 3:]
            np.save(numpy_outfile, egg_values)
            num_rows = len(filtered_rows)
          else:
            num_rows = len(np.load(numpy_outfile))
          event_metadata[spanname] = {'start_timestamp':startdt.timestamp(),
                                      'end_timestamp':enddt.timestamp(),
                                      "num_rows":num_rows}
        #print out the metadata at the end
        with open('/content/drive/MyDrive/GCP_data/extracted/Event'+event_num+"_metadata.json", "w") as f:
          f.write(json.dumps(event_metadata, indent=4))
        #sanity check all spans have the same length
        assert len(set([x['num_rows'] for x in event_metadata.values()]))==1, event_metadata
    else:
      print("SKIPPING:", event_num)

print("Events included:", included_events)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
SKIPPING: 60

------------------------------------
61 Miss World 2000 2000-11-30 22:25:00 2000-11-30 22:29:59
Duration in seconds: 300.0
------------------------------------
SKIPPING: 61

------------------------------------
62 US Election 2000, b 2000-12-08 21:00:00 2000-12-08 21:04:59
Duration in seconds: 300.0
------------------------------------
SKIPPING: 62

------------------------------------
63 US Election 2000, Finality 2000-12-12 15:00:00 2000-12-12 15:14:59
Duration in seconds: 900.0
------------------------------------
SKIPPING: 63

------------------------------------
64 New Year Mean 2000-2001 2000-12-31 09:30:00 2001-01-01 12:29:59
Duration in seconds: 97200.0
------------------------------------
Control1 start and end: 2000-12-30 06:30:00 2000-12-31 09:29:59
Control2 start and end: 2001-01-01 12:30:00 2001-01-02 15:29:59
Control3 start and end: 2000-12-29 03:30:00 2000-12-30 06:29:59
Control4 start and end

In [10]:
#number of events that passed the duration and exclusion filter in the loop above
included_events

393