In [1]:
import mwclient
import time

site = mwclient.Site('en.wikipedia.org')
page = site.pages['Apple Inc.']

In [2]:
revs = list(page.revisions())

In [3]:
# most recent edit 
revs[0]

OrderedDict([('revid', 1237626086),
             ('parentid', 1237359266),
             ('user', 'Varoart2005'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=7, tm_mday=30, tm_hour=18, tm_min=23, tm_sec=42, tm_wday=1, tm_yday=212, tm_isdst=-1)),
             ('comment', '/* Ownership */')])

In [4]:
revs = sorted(revs, key=lambda rev: rev["timestamp"]) 

In [5]:
revs[0]

OrderedDict([('revid', 234249),
             ('parentid', 0),
             ('user', '212.53.104.xxx'),
             ('anon', ''),
             ('timestamp',
              time.struct_time(tm_year=2001, tm_mon=11, tm_mday=3, tm_hour=13, tm_min=19, tm_sec=6, tm_wday=5, tm_yday=307, tm_isdst=-1)),
             ('comment', '*')])

In [6]:
# load sentiment model 

from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

  from jax import xla_computation as _xla_computation
  from jax import xla_computation as _xla_computation
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [7]:
find_sentiment("I love you")

0.9998656511306763

In [8]:
find_sentiment("neutral")

-0.9992249011993408

In [9]:
edits = {}

for rev in revs:
    # Convert the timestamp to a formatted date string (YYYY-MM-DD)
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
    
    # Initialize a new entry in the 'edits' dictionary if the date does not already exist
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)
    
    # Increment the edit count for the current date
    edits[date]["edit_count"] += 1
    
    # Retrieve the comment from the revision, defaulting to an empty string if not found
    comment = rev.get("comment", "")
    
    # Analyze the sentiment of the comment and add it to the list of sentiments for the current date
    edits[date]["sentiments"].append(find_sentiment(comment))


In [10]:
from statistics import mean

for key in edits:
    # Check if there are any sentiments recorded for the current date
    if len(edits[key]["sentiments"]) > 0:
        # Calculate the average sentiment for the current date
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        
        # Calculate the proportion of negative sentiments (sentiments less than 0)
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
    else:
        # If there are no sentiments, set sentiment values to 0
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0
    
    # Remove the 'sentiments' list from the dictionary as it's no longer needed
    del edits[key]["sentiments"]


In [11]:
edits

{'2001-11-03': {'edit_count': 2,
  'sentiment': 0.9885351061820984,
  'neg_sentiment': 0.0},
 '2002-02-09': {'edit_count': 1,
  'sentiment': 0.6330787539482117,
  'neg_sentiment': 0.0},
 '2002-02-11': {'edit_count': 2,
  'sentiment': 0.8641649484634399,
  'neg_sentiment': 0.0},
 '2002-02-25': {'edit_count': 1,
  'sentiment': 0.9796351790428162,
  'neg_sentiment': 0.0},
 '2002-03-01': {'edit_count': 3,
  'sentiment': 0.34675365686416626,
  'neg_sentiment': 0.3333333333333333},
 '2002-03-20': {'edit_count': 1,
  'sentiment': 0.9803916215896606,
  'neg_sentiment': 0.0},
 '2002-05-23': {'edit_count': 3,
  'sentiment': 0.3432354728380839,
  'neg_sentiment': 0.3333333333333333},
 '2002-06-01': {'edit_count': 6,
  'sentiment': 0.9803916215896606,
  'neg_sentiment': 0.0},
 '2002-06-06': {'edit_count': 2,
  'sentiment': 0.9803916215896606,
  'neg_sentiment': 0.0},
 '2002-06-11': {'edit_count': 2,
  'sentiment': -0.007433682680130005,
  'neg_sentiment': 0.5},
 '2002-07-05': {'edit_count': 1,
  '

In [12]:
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [13]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2001-11-03,2,0.988535,0.000000
2002-02-09,1,0.633079,0.000000
2002-02-11,2,0.864165,0.000000
2002-02-25,1,0.979635,0.000000
2002-03-01,3,0.346754,0.333333
...,...,...,...
2024-07-22,7,-0.238651,0.571429
2024-07-27,2,0.934352,0.000000
2024-07-28,6,0.165639,0.333333
2024-07-29,5,0.893641,0.000000


In [14]:
# Convert the index of the DataFrame 'edits_df' to datetime format
edits_df.index = pd.to_datetime(edits_df.index)

In [15]:
from datetime import datetime

dates = pd.date_range(start="2001-10-24",end=datetime.today())

In [16]:
edits_df = edits_df.reindex(dates, fill_value=0)

In [17]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2001-10-24,0,0.0,0.0
2001-10-25,0,0.0,0.0
2001-10-26,0,0.0,0.0
2001-10-27,0,0.0,0.0
2001-10-28,0,0.0,0.0
...,...,...,...
2024-08-07,0,0.0,0.0
2024-08-08,0,0.0,0.0
2024-08-09,0,0.0,0.0
2024-08-10,0,0.0,0.0


In [18]:
# Calculate the rolling mean of the DataFrame 'edits_df' over a 30-day window.
# The 'min_periods=30' ensures that at least 30 data points are required for the calculation.
rolling_edits = edits_df.rolling(30, min_periods=30).mean()


In [19]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2001-10-24,,,
2001-10-25,,,
2001-10-26,,,
2001-10-27,,,
2001-10-28,,,
...,...,...,...
2024-08-07,1.100000,0.008265,0.213492
2024-08-08,1.066667,-0.016672,0.213492
2024-08-09,1.033333,-0.041610,0.213492
2024-08-10,1.033333,-0.041610,0.213492


In [20]:
rolling_edits = rolling_edits.dropna()

In [21]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2001-11-22,0.066667,0.032951,0.000000
2001-11-23,0.066667,0.032951,0.000000
2001-11-24,0.066667,0.032951,0.000000
2001-11-25,0.066667,0.032951,0.000000
2001-11-26,0.066667,0.032951,0.000000
...,...,...,...
2024-08-07,1.100000,0.008265,0.213492
2024-08-08,1.066667,-0.016672,0.213492
2024-08-09,1.033333,-0.041610,0.213492
2024-08-10,1.033333,-0.041610,0.213492
