# Data Mining for NFTs

- Scrap and store data from rarity tool website about trendy NFTs
- Find influencers in NFT and spot if they speak about a particular NFT
- Store information about Google trends

Idea for implementation
- When was the Twitter account created
- sentiment analysis on Twitts
- Volume
- Launch date of the NFT
- ETH value (if low buy more easily)

Packages that need to be installed
- !pip install pytrends
- !pip3 install snscrape

In [1]:
# Data manipulation and analysis library
import pandas as pd
import numpy as np

# Progress bar library
from tqdm import tqdm

# We will use snscrape from github.com/JustAnotherArchivist/snscrape to get data from Twitter
import snscrape.modules.twitter as sntwitter
from datetime import date, timedelta

# We use pytrend to fetch information from Google trend
from pytrends.request import TrendReq

# Scrap and store data from rarity tool website about trendy NFTs

In [2]:
# Import selenium webdriver and configure its options
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

options = Options()
options.headless = False
options.add_argument("--window-size=1920,1200")

driver = webdriver.Chrome(options=options, executable_path='./chromedriver')

In [3]:
# Get driver for rarity tool website
driver.get('https://rarity.tools/upcoming')

In [4]:
# Wait until page is loaded
delay = 10 # seconds
try:
    myElem = WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.XPATH, "//*[contains(@class,'text-left text-gray-800')]")))
    print("Rarity tool page is ready!")
except TimeoutException:
    print("Loading took longer than {0} seconds".format(delay))

# Find NFT elements
url_elements = driver.find_elements_by_xpath("//*[contains(@class,'text-left text-gray-800')]")

# Show first NFT
url_elements[0].text.split('\n')

Rarity tool page is ready!


['Bricktopians',
 '3D avatar NFTs generated by AI',
 'Discord',
 '@bricktopians',
 'bricktopians.com',
 '0.08 ETH',
 '10,000 Total',
 'Presale:',
 '1:30 pm (Europe/Berlin)',
 'Yesterday',
 'Sale:',
 '1:30 pm (Europe/Berlin)',
 'Tuesday, November 23rd 2021']

In [5]:
# Create a dataframe with NFTs from rarity tool
nft_list = []
for element in url_elements:
    nft_list.append(element.text.split('\n'))

df = pd.DataFrame(nft_list)

# Close Selenium driver
driver.close()

In [6]:
# Clean the name, description, website, volume, and twitter account
df['Name'] = df[0]
df['Description'] = df[1]
df['Website'] = df[4]
df['Twitter Profile'] = df[3].apply(lambda x: x[1:])

In [7]:
# Clean the price tag
def cleanPrice(x):
    if 'eth' in x.lower():
        return float(''.join([i for i in x if i.isnumeric() or i=='.']))
    else:
        return 0
df['Price ETH'] = df[5].apply(cleanPrice)

# Get data on each Twitter profile

In [10]:
# Using TwitterSearchScraper to scrape data and append tweets to list
profile_list = []

# Use TwitterSearchScraper to scrape data for each NFT
for twitter_name in tqdm(df['Twitter Profile']):
    if sntwitter.TwitterUserScraper.is_valid_username(twitter_name):
        try:
            items = next(sntwitter.TwitterProfileScraper('{0}'.format(twitter_name), isUserId=False).get_items())
        except AttributeError:
            print('Twitter profile {0} not found'.format(twitter_name))
        else:  
            profile_list.append([twitter_name, items.user.created, items.user.description, 
                                 items.user.favouritesCount, items.user.followersCount, 
                                 items.user.friendsCount, items.user.listedCount, items.user.statusesCount, 
                                 items.user.mediaCount, items.user.location, items.user.profileBannerUrl, 
                                 items.user.profileImageUrl])
    else:
        print('Twitter profile {0} is not a valid username'.format(twitter_name))

# Creating a dataframe from the tweets list above
profiles_df = pd.DataFrame(profile_list, columns=['Twitter Profile','Twitter User Created', 'Twitter User Description', 'Twitter User Favourites Count', 
                                                  'Twitter User Followers Count', 'Twitter User Friends Count', 'Twitter User Listed Count', 
                                                  'Twitter User Statuses Count', 'Twitter User Media Count', 'Twitter User Location', 
                                                  'Twitter User Profile Banner Url', 'Twitter User Profile Image Url'])

 22%|██▏       | 13/58 [00:17<01:01,  1.36s/it]

Twitter profile ainsei.com is not a valid username


 78%|███████▊  | 45/58 [00:58<00:17,  1.31s/it]

Twitter profile coochycoopanda?s=21 is not a valid username


100%|██████████| 58/58 [01:14<00:00,  1.29s/it]


In [11]:
# Merge and save dataframe
nft_df = df[['Twitter Profile', 'Name', 'Description', 'Website', 'Price ETH']].merge(profiles_df, on='Twitter Profile', how='left')

nft_df.to_csv('nft_data.csv', index=False)

# Fetch Tweets for each NFT

In [12]:
# Creating list to append tweet data
tweets_list = []

# We are interested to get tweets from the last 7 days
today = date.today().strftime("%Y-%m-%d")
last_month = (date.today() - timedelta(days=7)).strftime("%Y-%m-%d")

# Maximum tweets to scrap for each query
max_tweets = 5000

# Use TwitterSearchScraper to scrape data for each NFT
for search_query in tqdm(nft_df['Twitter Profile']):
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(
        '{0} since:{1} until:{2}'.format(search_query, last_month, today)).get_items()):
        if i > max_tweets:
                break
        tweets_list.append([search_query, tweet.date, tweet.id, 
                            tweet.content, tweet.user.username, tweet.user.created, 
                            tweet.user.description, tweet.user.favouritesCount, tweet.user.followersCount, 
                            tweet.user.friendsCount, tweet.user.listedCount, tweet.user.statusesCount, 
                            tweet.user.mediaCount, tweet.user.location, tweet.user.profileBannerUrl, 
                            tweet.user.profileImageUrl])

# Creating a dataframe from the tweets list above
tweets_df = pd.DataFrame(tweets_list, columns=['Twitter Profile', 'Datetime', 'Tweet Id', 'Text', 'Username',
                                               'User Created', 'User Description', 'User Favourites Count', 
                                               'User Followers Count', 'User Friends Count', 'User Listed Count', 
                                               'User Statuses Count', 'User Media Count', 'User Location', 
                                               'User Profile Banner Url', 'User Profile Image Url'])

100%|██████████| 58/58 [26:33<00:00, 27.47s/it]  


In [13]:
# Control if any of the NFTs has been recently mentionned by an influencer

# Influencer list taken on: 
# - https://coinbound.io/top-nft-influencers/
# - https://itsblockchain.com/top-15-nft-influencers-on-twitter-you-should-follow-right-now/

influencers = ['garyvee', 'DeezeFi', 'farokh', 'mevcollector', 'beaniemax', 'RealmissNFT', 
               'CozomoMedici', 'peruggia_v', 'andrwwang', 'iamDCinvestor', 'punk4156',  'punk6529', 
               'TheTreeverse', 'KennethBosak', 'tsmith', 'Bosslogic', 'NFTLive']

print('We found {0} influencers in the list'.format(len(tweets_df.loc[tweets_df['Twitter Profile'].isin(influencers)])))

We found 0 influencers in the list


In [14]:
# Store Twitter information
tweets_df.to_csv('tweets.csv', index=False)

# Store information about Google trends

In [15]:
# Article of reference: https://medium.com/geekculture/easily-gather-google-trends-data-in-python-22219cecd6fc

# hl: host language
# tz: timezone in minutes after UTC (positive only)
# timeout: between 5 and 10 sec if the server is not responding
pytrends = TrendReq(hl='en-US', tz=60, timeout=(5,10))

# keyword list
kw_list = ["Nft"]

# cat: google trend category (https://github.com/pat310/google-trends-api/wiki/Google-Trends-Categories)
# timeframe: 12 months ago from today
# geo represents the country in two-digit ISO codes (defaults to worldwide if left blank)
# gprop represents Google properties such as news, images, youtube etc. (defaults to web searches if left blank)
pytrends.build_payload(kw_list, cat=0, timeframe='today 12-m', geo='', gprop='')

# Interest over time
df_interest_over_time = pytrends.interest_over_time()

# Interest by Region
df_interest_by_region = pytrends.interest_by_region()

In [16]:
# Store current trend
df_interest_over_time.to_csv('google_trend_over_time.csv')
df_interest_by_region.to_csv('google_trend_per_region.csv')