In [2]:
from mastodon import Mastodon
from dotenv import load_dotenv
import os
from hdfs import InsecureClient
import datetime

load_dotenv()

# Connect to the mastodon API
mastodon = Mastodon(
    client_id=os.getenv('MASTODON_CLIENT_ID'),
    client_secret=os.getenv('MASTODON_CLIENT_SECRET'),
    access_token=os.getenv('MASTODON_ACCESS_TOKEN'),
    api_base_url="https://mastodon.social"
)

# Initialize an HDFS client
hdfs_client = InsecureClient(os.getenv('HDFS_CLIENT_URL'), user=os.getenv('HDFS_CLIENT_USER'))


# get current date and time
now = datetime.datetime.now()
directory_path = '/raw/'+ str(now.year) + '-' + str(now.month) + '-' + str(now.day)

# Check if the directory already exists
if not hdfs_client.status(directory_path, strict=False):
    hdfs_client.makedirs(directory_path)

# Define the HDFS path where you want to save the data
hdfs_path = directory_path + '/' + str(now.hour) + '-' + str(now.minute) + '/mastodon.txt'

public_posts = mastodon.timeline_public(limit=10)

i = 0
# Create a text file to store the Mastodon data
with hdfs_client.write(hdfs_path) as writer:
    for post in public_posts:
        writer.write(f'Post{i}: {post} \n')
        i += 1

print('Data saved successfully to HDFS : ' + hdfs_path)

ConnectionError: HTTPConnectionPool(host='laptop-1us3gu3j.', port=9864): Max retries exceeded with url: /webhdfs/v1/raw/2023-10-19/8-52/mastodon.txt?op=CREATE&user.name=hadoop&namenoderpcaddress=0.0.0.0:9000&createflag=&createparent=true&overwrite=false&user.name=hadoop (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000021112621F10>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))

In [3]:
public_posts

[{'id': 111260595831428351,
  'created_at': datetime.datetime(2023, 10, 19, 7, 52, 39, tzinfo=tzutc()),
  'in_reply_to_id': 111260593625955885,
  'in_reply_to_account_id': 110730813077137029,
  'sensitive': False,
  'spoiler_text': '',
  'visibility': 'public',
  'language': 'en',
  'uri': 'https://neuromatch.social/users/fsandhaeger/statuses/111260595607015490',
  'url': 'https://neuromatch.social/@fsandhaeger/111260595607015490',
  'replies_count': 0,
  'reblogs_count': 0,
  'favourites_count': 0,
  'edited_at': None,
  'content': '<p>Choice signals dynamically shifted from sensory to motor areas over the course of a trial.</p>',
  'reblog': None,
  'account': {'id': 110730813077137029,
   'username': 'fsandhaeger',
   'acct': 'fsandhaeger@neuromatch.social',
   'display_name': 'fsandhaeger',
   'locked': False,
   'bot': False,
   'discoverable': True,
   'group': False,
   'created_at': datetime.datetime(2023, 7, 14, 0, 0, tzinfo=tzutc()),
   'note': '<p>neuroscientist in tübingen.

In [13]:
import csv


# Define the CSV file name
csv_file = 'mastodon_posts.csv'

# Define the CSV field names
fieldnames = [
    'id', 'created_at', 'in_reply_to_id', 'in_reply_to_account_id', 'sensitive',
    'spoiler_text', 'visibility', 'language', 'uri', 'url', 'replies_count',
    'reblogs_count', 'favourites_count', 'content', 'edited_at',
    'account_id', 'account_username', 'account_acct', 'account_display_name',
    'account_locked', 'account_bot', 'account_discoverable', 'account_group',
    'account_created_at', 'account_note', 'account_url', 'account_uri',
    'account_avatar', 'account_avatar_static', 'account_header', 'account_header_static',
    'account_followers_count', 'account_following_count', 'account_statuses_count',
    'account_last_status_at',
    'media_attachments_id', 'media_attachments_type', 'media_attachments_url',
    'media_attachments_preview_url', 'media_attachments_remote_url', 'media_attachments_description',
    'media_attachments_blurhash'
]

# Create and open the CSV file for writing
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    
    # Write the header row
    writer.writeheader()
    
    # Write the data rows
    for post in public_posts:
        media_attachments = post.get('media_attachments', [])  # Get media attachments or an empty list
        media_attachment = media_attachments[0] if media_attachments else {}  # Get the first attachment or an empty dictionary
        
        writer.writerow({
            'id': post['id'],
            'created_at': post['created_at'],
            'in_reply_to_id': post['in_reply_to_id'],
            'in_reply_to_account_id': post['in_reply_to_account_id'],
            'sensitive': post['sensitive'],
            'spoiler_text': post['spoiler_text'],
            'visibility': post['visibility'],
            'language': post['language'],
            'uri': post['uri'],
            'url': post['url'],
            'replies_count': post['replies_count'],
            'reblogs_count': post['reblogs_count'],
            'favourites_count': post['favourites_count'],
            'content': post['content'],
            'edited_at': post['edited_at'],
            'account_id': post['account']['id'],
            'account_username': post['account']['username'],
            'account_acct': post['account']['acct'],
            'account_display_name': post['account']['display_name'],
            'account_locked': post['account']['locked'],
            'account_bot': post['account']['bot'],
            'account_discoverable': post['account']['discoverable'],
            'account_group': post['account']['group'],
            'account_created_at': post['account']['created_at'],
            'account_note': post['account']['note'],
            'account_url': post['account']['url'],
            'account_uri': post['account']['uri'],
            'account_avatar': post['account']['avatar'],
            'account_avatar_static': post['account']['avatar_static'],
            'account_header': post['account']['header'],
            'account_header_static': post['account']['header_static'],
            'account_followers_count': post['account']['followers_count'],
            'account_following_count': post['account']['following_count'],
            'account_statuses_count': post['account']['statuses_count'],
            'account_last_status_at': post['account']['last_status_at'],
            'media_attachments_id': media_attachment.get('id', ''),
            'media_attachments_type': media_attachment.get('type', ''),
            'media_attachments_url': media_attachment.get('url', ''),
            'media_attachments_preview_url': media_attachment.get('preview_url', ''),
            'media_attachments_remote_url': media_attachment.get('remote_url', ''),
            'media_attachments_description': media_attachment.get('description', ''),
            'media_attachments_blurhash': media_attachment.get('blurhash', '')
        })

print(f'Data has been saved to {csv_file}')

Data has been saved to mastodon_posts.csv


In [14]:
import pandas as pd
dffff=pd.read_csv('mastodon_posts.csv')
dffff

Unnamed: 0,id,created_at,in_reply_to_id,in_reply_to_account_id,sensitive,spoiler_text,visibility,language,uri,url,...,account_following_count,account_statuses_count,account_last_status_at,media_attachments_id,media_attachments_type,media_attachments_url,media_attachments_preview_url,media_attachments_remote_url,media_attachments_description,media_attachments_blurhash
0,111260595831428351,2023-10-19 07:52:39+00:00,1.112606e+17,1.107308e+17,False,,public,en,https://neuromatch.social/users/fsandhaeger/st...,https://neuromatch.social/@fsandhaeger/1112605...,...,368,21,2023-10-19 00:00:00,1.112606e+17,image,https://files.mastodon.social/cache/media_atta...,https://files.mastodon.social/cache/media_atta...,https://neuromatch.social/system/media_attachm...,,UFRC-=_4%MjaM{xukBf5%MNFkCWBRQoyt7ae
1,111260595817377032,2023-10-19 07:51:35+00:00,1.107808e+17,506247.0,False,,public,fr,https://mamot.fr/users/EnercoConseils/statuses...,https://mamot.fr/@EnercoConseils/1112605914251...,...,111,1676,2023-10-19 00:00:00,,,,,,,
2,111260595803718108,2023-10-19 07:52:40+00:00,,,False,,public,fr,https://lepoulsdumonde.com/users/bot_bp/status...,https://lepoulsdumonde.com/@bot_bp/11126059567...,...,0,1629,2023-10-19 00:00:00,1.112606e+17,image,https://files.mastodon.social/cache/media_atta...,https://files.mastodon.social/cache/media_atta...,https://i.lepoulsdumonde.com/media_attachments...,un média posté par le compte Twitter,UgIqoj^Is+S502kYRjV?xvMyRkoLj;xtofWX
3,111260595763020092,2023-10-19 07:52:41+00:00,,,False,,public,en,https://mastodon.online/users/50years_music/st...,https://mastodon.online/@50years_music/1112605...,...,292,35876,2023-10-19 00:00:00,,,,,,,
4,111260595722414032,2023-10-19 07:52:35+00:00,,,False,,public,ca,https://xarxa.cloud/users/lamarea/statuses/111...,https://xarxa.cloud/@lamarea/111260595365131330,...,2,21,2023-10-19 00:00:00,1.112606e+17,image,https://files.mastodon.social/cache/media_atta...,https://files.mastodon.social/cache/media_atta...,https://media.xarxa.cloud/media_attachments/fi...,,USBYU4xa9ZIoV@t6ofRj0KWB-pt7xuNGRjxu
5,111260595714821926,2023-10-19 07:52:40.614000+00:00,,,True,​,public,,https://misskey-square.net/notes/9l0g85uuh0,https://misskey-square.net/notes/9l0g85uuh0,...,74,2219,2023-10-19 00:00:00,,,,,,,
6,111260595696265450,2023-10-19 07:52:40.699000+00:00,,,False,,public,en,https://mastodon.social/users/malaysiagazette/...,https://mastodon.social/@malaysiagazette/11126...,...,0,10454,2023-10-19 00:00:00,,,,,,,
7,111260595622396574,2023-10-19 07:52:39.577000+00:00,,,False,,public,en,https://mastodon.social/users/masticadores/sta...,https://mastodon.social/@masticadores/11126059...,...,11,23,2023-10-19 00:00:00,,,,,,,
8,111260595618815953,2023-10-19 07:52:37+00:00,,,True,moustique,public,fr,https://mastodon.opportunis.me/users/Grandasse...,https://mastodon.opportunis.me/@Grandasse_/111...,...,340,18648,2023-10-19 00:00:00,,,,,,,
9,111260595613938456,2023-10-19 07:52:37+00:00,,,False,,public,en,https://kolektiva.social/users/phistorians/sta...,https://kolektiva.social/@phistorians/11126059...,...,695,729,2023-10-19 00:00:00,,,,,,,


In [10]:
dffff.columns

Index(['id', 'created_at', 'in_reply_to_id', 'in_reply_to_account_id',
       'sensitive', 'spoiler_text', 'visibility', 'language', 'uri', 'url',
       'replies_count', 'reblogs_count', 'favourites_count', 'content',
       'edited_at', 'account_id', 'account_username', 'account_acct',
       'account_display_name', 'account_locked', 'account_bot',
       'account_discoverable', 'account_group', 'account_created_at',
       'account_note', 'account_url', 'account_uri', 'account_avatar',
       'account_avatar_static', 'account_header', 'account_header_static',
       'account_followers_count', 'account_following_count',
       'account_statuses_count', 'account_last_status_at',
       'media_attachments_id', 'media_attachments_type',
       'media_attachments_url', 'media_attachments_preview_url',
       'media_attachments_remote_url', 'media_attachments_description',
       'media_attachments_blurhash'],
      dtype='object')

In [18]:
import csv

# Define the CSV file name
csv_file = 'mastodon_posts_analyzed.csv'

# Define the CSV field names for the selected columns
fieldnames = [
    'account_id', 'account_username', 'account_followers_count',
    'replies_count', 'reblogs_count', 'favourites_count',
    'account_created_at', 'url', 'language', 'content', 'media_attachments_type'
]

# Create and open the CSV file for writing
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    # Write the header row
    writer.writeheader()

    # Write the data rows
    for post in public_posts:
        writer.writerow({
            'account_id': post['account']['id'],
            'account_username': post['account']['username'],
            'account_followers_count': post['account']['followers_count'],
            'replies_count': post['replies_count'],
            'reblogs_count': post['reblogs_count'],
            'favourites_count': post['favourites_count'],
            'account_created_at': post['account']['created_at'],
            'url': post['url'],
            'language': post['language'],
            'content': post['content'],
            'media_attachments_type': post['media_attachments'][0]['type'] if post['media_attachments'] else None
        })

print(f'Data has been saved to {csv_file}')

Data has been saved to mastodon_posts_analyzed.csv


In [19]:
dfff=pd.read_csv('mastodon_posts_analyzed.csv')
dfff

Unnamed: 0,account_id,account_username,account_followers_count,replies_count,reblogs_count,favourites_count,account_created_at,url,language,content,media_attachments_type
0,110730813077137029,fsandhaeger,105,0,0,0,2023-07-14 00:00:00+00:00,https://neuromatch.social/@fsandhaeger/1112605...,en,<p>Choice signals dynamically shifted from sen...,image
1,506247,EnercoConseils,208,0,0,0,2018-08-30 00:00:00+00:00,https://mamot.fr/@EnercoConseils/1112605914251...,fr,"<p>Je vais vous donner mon opinion à mon tout,...",
2,109739396230937969,bot_bp,95,0,0,0,2023-01-23 00:00:00+00:00,https://lepoulsdumonde.com/@bot_bp/11126059567...,fr,<p>🍽️Les 70 ans de la Cocotte-Minute&nbsp;: ch...,image
3,110767005095620731,50years_music,300,0,0,0,2023-07-23 00:00:00+00:00,https://mastodon.online/@50years_music/1112605...,en,"<p>""Since I Don't Have You"" is a song written ...",
4,1086371,lamarea,1016,0,0,0,2020-01-17 00:00:00+00:00,https://xarxa.cloud/@lamarea/111260595365131330,ca,<p>Las fechas que maneja el PSOE para explicar...,image
5,111005821859858153,Oisii_kohaku,140,0,0,0,2023-09-04 00:00:00+00:00,https://misskey-square.net/notes/9l0g85uuh0,,<p><span>おまかの帝国</span></p>,
6,110512337728405631,malaysiagazette,47,0,0,0,2023-06-09 00:00:00+00:00,https://mastodon.social/@malaysiagazette/11126...,en,"<p>PWM tidak dihargai <a href=""https://mastodo...",
7,111209913632476178,masticadores,2,0,0,0,2023-10-10 00:00:00+00:00,https://mastodon.social/@masticadores/11126059...,en,"<p>your prey by Joni Karen Caggiano <a href=""h...",
8,836108,Grandasse_,313,0,0,0,2019-06-17 00:00:00+00:00,https://mastodon.opportunis.me/@Grandasse_/111...,fr,<p>Tellement saoulé de pas avoir pu déglinguer...,
9,109362321197812721,phistorians,1131,0,0,0,2022-11-18 00:00:00+00:00,https://kolektiva.social/@phistorians/11126059...,en,<p>✨Brand New Episode ✨</p><p>Episode 143: Spe...,


In [17]:
dfff.columns

Index(['account_id', 'account_username', 'account_followers_count',
       'replies_count', 'reblogs_count', 'favourites_count',
       'account_created_at', 'url', 'language', 'content'],
      dtype='object')