In [1]:
# Imports
import sys
import os
import platform
from os.path import exists
from watermark import watermark

import torch
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Pick the compute backend: Apple's Metal (MPS) when PyTorch reports it as available, CPU otherwise.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: mps


In [3]:
# Report Technologies
# Collect every line of the environment report first, then print them in order.
report_lines = (
    f'Python Platform: {platform.platform()}',
    f'Python {sys.version}',
    watermark(),
    watermark(iversions=True, globals_=globals()),
    # Check PyTorch has access to MPS (Metal Performance Shader, Apple's GPU architecture)
    f"PyTorch version: {torch.__version__}",
    f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}",
    f"Is MPS available? {torch.backends.mps.is_available()}",
)
for report_line in report_lines:
    print(report_line)

Python Platform: macOS-13.0.1-arm64-arm-64bit
Python 3.9.15 (main, Nov 24 2022, 08:28:41) 
[Clang 14.0.6 ]
Last updated: 2023-05-07T20:55:00.916294-05:00

Python implementation: CPython
Python version       : 3.9.15
IPython version      : 8.6.0

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 22.1.0
Machine     : arm64
Processor   : arm
CPU cores   : 10
Architecture: 64bit

platform: 1.0.8
torch   : 1.13.0
sys     : 3.9.15 (main, Nov 24 2022, 08:28:41) 
[Clang 14.0.6 ]
pandas  : 1.5.2

PyTorch version: 1.13.0
Is MPS (Metal Performance Shader) built? True
Is MPS available? True


In [4]:
# The edit history of the Wikipedia BTC page is used as a proxy for public interest and sentiment.
# - other potential sources for sentiment analysis: tweets, Google Trends
# NOTE(review): the bare string below is an expression, not a comment, so the notebook
# echoes it back as the cell's output; a markdown cell would be a better home for it.
"""
    Downloading wikipedia edits for BTC  """

'\n    Downloading wikipedia edits for BTC  '

In [5]:
import time  # Standard-library time helpers (used later to format revision timestamps).
import mwclient  # Client library for the MediaWiki API.

# Host of the wiki to query: the English (en) Wikipedia.
# Swap the 'en' prefix for another language code to target a different edition.
wiki_host = 'en.wikipedia.org'
site = mwclient.Site(wiki_host)

# Title of the article whose history is analysed; change it to study another page.
page_title = 'Bitcoin'
page = site.pages[page_title]

#

In [6]:
# Pull the complete revision history of the page from Wikipedia into a list.
# Each element describes one revision through keys such as 'user', 'comment' and 'timestamp'.
# The API hands revisions back newest-first by default.
revs = [revision for revision in page.revisions()]
#

In [7]:
# Inspect the first element of the revision list as returned by the API
# (before any sorting, so presumably the most recent revision).
# The element is displayed as an OrderedDict: a dictionary that remembers the order of its keys.
revs[0]  # One revision record; the output below shows the keys 'revid', 'parentid', 'minor',
         # 'user', 'timestamp' (a time.struct_time) and 'comment'.
         # Which keys are present depends on the MediaWiki API and the query parameters used.


OrderedDict([('revid', 1151233254),
             ('parentid', 1149274508),
             ('minor', ''),
             ('user', 'Rodw'),
             ('timestamp',
              time.struct_time(tm_year=2023, tm_mon=4, tm_mday=22, tm_hour=18, tm_min=46, tm_sec=9, tm_wday=5, tm_yday=112, tm_isdst=-1)),
             ('comment',
              'Disambiguating links to [[Central American University]] (link changed to [[Central American University (San Salvador)]]) using [[User:Qwertyytrewqqwerty/DisamAssist|DisamAssist]].')])

In [8]:
# Put the revisions into chronological (oldest-first) order.
# A fresh list is built and then sorted in place, using each revision's
# 'timestamp' value as the sort key.
revs = list(revs)
revs.sort(key=lambda revision: revision['timestamp'])

In [9]:
# Inspect the first element of the list after sorting by timestamp, i.e. the oldest revision.
# The element is displayed as an OrderedDict: a dictionary that remembers the order of its keys.
revs[0]  
# The output below is the revision record with the earliest timestamp in the sorted list.

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

# Identifying the 'sentiment' of the Wikipedia page edits

In [10]:
# regular expressions come from the standard library
import re

# sample string to scan
text = "This is a test string-12345-of-67890."

# pattern: the shortest possible prefix followed by "-<5 digits>-of-<5 digits>"
fixed_pattern = "(.*?)-\\d{5}-of-\\d{5}"

# compile once, then search the sample string with the compiled object
re_obj = re.compile(fixed_pattern)
match = re_obj.search(text)

# show the whole matched span (group 0)
print(match[0])


This is a test string-12345-of-67890


In [None]:
# Provides the pipeline helper for pre-trained 'deep learning' models; requires TensorFlow 2.0 or PyTorch
from transformers import pipeline  # factory that wires a pre-trained model and tokenizer into a callable

# Build the sentiment-analysis pipeline.
# - framework="pt" selects the PyTorch implementation of the model.
# - max_length=256 caps the tokenised input length; it is forwarded to the tokenizer.
# - truncation=True makes that cap explicit. Without it the tokenizer only falls back to an
#   implicit truncation strategy and warns that max_length was given without truncation.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    tokenizer="distilbert-base-uncased",
    framework="pt",
    max_length=256,
    truncation=True,
)


In [None]:
from typing import Any, Callable, Dict, List, Optional, Tuple, Union  # typing helpers used by this and later cells

def find_sentiment(
    text: Union[str, List[str]],
    classifier: Optional[Callable[[List[str]], List[Dict[str, Any]]]] = None,
    max_chars: int = 259,
) -> Union[float, List[float]]:
    """
    Score the sentiment of one text or of a list of texts.

    Each score is a signed confidence between -1 and 1: the classifier's 'score' is kept
    as-is for a 'POSITIVE' label and negated for a 'NEGATIVE' label.

    Parameters:
        text: a single string, or a list of strings.
        classifier: callable that takes a list of strings and returns one
            {'label': ..., 'score': ...} dict per string. Defaults to the notebook-level
            `sentiment_pipeline`.
        max_chars: every text is cut to this many characters before classification.

    Returns:
        A float when `text` is a single string; a list of floats (one per input, in order)
        when `text` is a list. An empty list yields an empty list.
    """
    if classifier is None:
        # Resolved at call time so a re-created notebook-level pipeline is picked up.
        classifier = sentiment_pipeline

    single_input = isinstance(text, str)
    batch: List[str] = [text] if single_input else list(text)

    # Nothing to classify; avoid calling the model with an empty batch.
    if not batch:
        return []

    # Truncate each text individually (slicing the list itself would drop whole items).
    results: List[Dict[str, Any]] = classifier([item[:max_chars] for item in batch])

    scores: List[float] = [
        -result['score'] if result['label'] == 'NEGATIVE' else result['score']
        for result in results
    ]
    return scores[0] if single_input else scores


In [None]:
# Smoke-test the sentiment helper on three hand-written sentences.
# The inputs are passed as one list, so find_sentiment is called once for all of them.
texts = [
    "This is a positive sentence.",
    "This is a negative sentence.",
    "This is a neutral sentence.",
]

# Score the example sentences with find_sentiment
sentiment_scores = find_sentiment(texts)

# Show the result for a visual sanity check
print(sentiment_scores)


In [None]:
# `pipeline` is imported here as well so this cell does not depend on an earlier cell having run.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Set the model and tokenizer names
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer_name = "distilbert-base-uncased"

# Load the pre-trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Re-create the sentiment analysis pipeline from the loaded model and tokenizer.
# This rebinds `sentiment_pipeline`, so the same 256-token cap used by the earlier
# pipeline is kept here (with explicit truncation) instead of being silently dropped.
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    max_length=256,
    truncation=True,
)

In [None]:

# Per-day accumulator: maps a 'YYYY-MM-DD' string to that day's edit count and raw scores.
edits: Dict[str, Dict[str, Any]] = {}

# Walk the chronologically sorted revisions and file each one under its calendar day.
for rev in revs:
    # Calendar day of the revision, derived from its 'timestamp' struct_time
    date: str = time.strftime('%Y-%m-%d', rev['timestamp'])

    # Fetch the day's record, creating it on first sight of the date:
    # - 'sentiments': list of sentiment results for the day's edit comments
    # - 'edt_count': number of edits made on the day
    day_record = edits.setdefault(date, {'sentiments': [], 'edt_count': 0})

    # One more edit for this day
    day_record['edt_count'] += 1

    # Edit comment of the revision; empty string when the revision has none
    comment: str = rev.get('comment', '')

    # Score the comment and keep the result with the day's other scores
    sentiment_score: float = find_sentiment(comment)
    day_record['sentiments'].append(sentiment_score)


In [None]:
from statistics import mean  # import mean function from statistics module

# Collapse each day's raw score list into two summary values:
# - 'sentiment': mean score of the day's edit comments (0 when there are none)
# - 'percent_neg_sentiment': share (0..1) of the day's scores that are negative (0 when there are none)
for key in edits:
    day_record = edits[key]

    # A day without a 'sentiments' list was already summarised by an earlier run of this
    # cell; skip it so a re-run neither raises KeyError nor overwrites the stored values.
    if 'sentiments' not in day_record:
        continue

    # pop() removes the raw list for every day, not only for days without scores,
    # so no list-valued 'sentiments' entry is left behind in any record.
    day_scores = day_record.pop('sentiments')

    if day_scores:
        day_record['sentiment'] = mean(day_scores)
        day_record['percent_neg_sentiment'] = sum(1 for s in day_scores if s < 0) / len(day_scores)
    else:
        # Same key names as the branch above, so every record has identical fields.
        day_record['sentiment'] = 0
        day_record['percent_neg_sentiment'] = 0

In [None]:
#

#

#

#

In [None]:
#BLOCK ALPHA
# Per-day accumulator keyed by 'YYYY-MM-DD' date string.
edits: Dict[str, Dict[str, Any]] = {}

# Go through the sorted revisions one by one.
for rev in revs:
    # Day on which the revision was made, formatted from its 'timestamp'
    date: str = time.strftime('%Y-%m-%d', rev['timestamp'])

    # Look the day up; the first revision of a day creates its record with
    # - 'sentiments': empty list for the day's sentiment results
    # - 'edt_count': edit counter starting at zero
    # - 'sentiment' / 'percent_neg_sentiment': summary fields pre-seeded with 0
    day_record = edits.get(date)
    if day_record is None:
        day_record = {'sentiments': [], 'edt_count': 0, 'sentiment': 0, 'percent_neg_sentiment': 0}
        edits[date] = day_record

    # Count this edit
    day_record['edt_count'] += 1

    # Comment attached to the revision ('' when absent)
    comment: str = rev.get('comment', '')

    # Sentiment result for the comment, stored with the rest of the day's results
    sentiment_score: float = find_sentiment(comment)
    day_record['sentiments'].append(sentiment_score)

#BLOCK BETA
from statistics import mean  # import mean function from statistics module


# Collapse each day's raw score list into its summary values:
# - 'sentiment': mean score of the day's edit comments (0 when there are none)
# - 'percent_neg_sentiment': share (0..1) of the day's scores that are negative (0 when there are none)
for key in edits:
    day_record = edits[key]

    # No 'sentiments' list means this day was already summarised by an earlier run;
    # leave its stored values untouched instead of resetting them to 0.
    if 'sentiments' not in day_record:
        continue

    # pop() drops the raw list for every day, not only for days without scores.
    day_scores = day_record.pop('sentiments')

    if day_scores:
        day_record['sentiment'] = mean(day_scores)
        day_record['percent_neg_sentiment'] = sum(1 for s in day_scores if s < 0) / len(day_scores)
    else:
        day_record['sentiment'] = 0
        day_record['percent_neg_sentiment'] = 0

        
        
#

In [None]:
# Daily bucket store: 'YYYY-MM-DD' -> {'sentiments': [...], 'edt_count': n}
edits: Dict[str, Dict[str, Any]] = {}

# Process the revisions in their sorted order.
for rev in revs:
    # Turn the revision's 'timestamp' into a day string
    date: str = time.strftime('%Y-%m-%d', rev['timestamp'])

    # Reuse the day's bucket when it exists, otherwise start a new one holding
    # - 'sentiments': the list that collects sentiment results
    # - 'edt_count': the number of edits seen for the day
    try:
        day_record = edits[date]
    except KeyError:
        day_record = {'sentiments': [], 'edt_count': 0}
        edits[date] = day_record

    # Register the edit
    day_record['edt_count'] += 1

    # Read the revision's comment, defaulting to an empty string
    comment: str = rev.get('comment', '')

    # Run the comment through find_sentiment and record the outcome for the day
    sentiment_score: float = find_sentiment(comment)
    day_record['sentiments'].append(sentiment_score)


In [None]:
from statistics import mean  # import mean function from statistics module

# Collapse each day's raw score list into two summary values:
# - 'sentiment': mean score of the day's edit comments (0 when there are none)
# - 'percent_neg_sentiment': share (0..1) of the day's scores that are negative (0 when there are none)
for key in edits:
    day_record = edits[key]

    # A day without a 'sentiments' list was already summarised by an earlier run of this
    # cell; skip it so a re-run neither raises KeyError nor overwrites the stored values.
    if 'sentiments' not in day_record:
        continue

    # pop() removes the raw list for every day, not only for days without scores,
    # so no list-valued 'sentiments' entry is left behind in any record.
    day_scores = day_record.pop('sentiments')

    if day_scores:
        day_record['sentiment'] = mean(day_scores)
        day_record['percent_neg_sentiment'] = sum(1 for s in day_scores if s < 0) / len(day_scores)
    else:
        # Same key names as the branch above, so every record has identical fields.
        day_record['sentiment'] = 0
        day_record['percent_neg_sentiment'] = 0


In [None]:
# 'edits' is a dictionary of key-value pairs; each key is a date
# each date maps to 3 values: 'edt_count', 'sentiment', and 'percent_neg_sentiment'
# 'edt_count' = number of times the BTC Wikipedia page was edited on a given day
# 'sentiment' = average sentiment for that day
# 'percent_neg_sentiment' = fraction (0 to 1) of that day's edits whose comment sentiment is negative

# edits

# generate a dataframe with sentiment data
transform -
structure -
clean-up dataframe

In [None]:
# pandas supplies the DataFrame type used from here on
import pandas as pd

# Build 'BTC_edits_df' from the 'edits' dictionary.
# orient='index' turns every top-level key (a date) into a row label and
# every inner dictionary into the values of that row.
BTC_edits_df = pd.DataFrame.from_dict(data=edits, orient='index')

In [None]:
# Preview the first rows of the per-day dataframe (one row per date key of 'edits').
BTC_edits_df.head()

# Addressing the problem of gaps in dates of activity in the dataframe
There are gaps in the dates covered by the dataframe,

i.e. the asset or security is traded on more days than there are days with comment edits or other sentiment-generating activity.

In [None]:
# The standard-library datetime class supplies now(). The previous import brought in a
# differently named class (DateTime) and left the name `datetime` undefined, so the
# call below raised NameError.
from datetime import datetime

# Daily date range from the day of the page's first revision (2009-03-08) up to the current moment.
dates = pd.date_range(start= '2009-03-08', end= datetime.now())