# Scraping Twitter data with search terms using snscrape

### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import jenkspy
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

In [3]:
%matplotlib inline
import json
import numpy as np
import altair as alt
import altair_latimes as lat

In [4]:
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

In [5]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.set_option("display.max_colwidth", None)

In [6]:
# https://medium.com/better-programming/how-to-scrape-tweets-with-snscrape-90124ed006af

In [7]:
# https://github.com/JustAnotherArchivist/snscrape

### Use snscrape to grab tweets mentioning a search term, in this case "doomscrolling"

In [8]:
# use --progress to see results in 100 tweet intervals

In [9]:
# !snscrape --jsonl twitter-search "doomscrolling until:2021-01-25 since:2020-01-01" > input/search/doomscrolling-tweets.json

In [10]:
src = pd.read_json("input/search/doomscrolling-tweets.json", lines=True)

### Process dates

In [11]:
src["date"] = pd.to_datetime(src["date"], utc=True)
src["pacific_created_at"] = pd.to_datetime(src["date"], utc=True).dt.tz_convert(
    "America/Los_Angeles"
)

In [12]:
src["date"] = pd.to_datetime(src["pacific_created_at"]).dt.strftime("%m/%d/%Y")
src["date"] = pd.to_datetime(src["date"])
src["year"] = src["pacific_created_at"].dt.year
src["month"] = src["pacific_created_at"].dt.month
src["day"] = src["pacific_created_at"].dt.day
src["hour"] = src["pacific_created_at"].dt.hour
src["minute"] = src["pacific_created_at"].dt.minute
src["time"] = src["pacific_created_at"].dt.time

In [13]:
src["id"] = src["id"].astype(str)
src["year"] = src["year"].astype(str)
src["month"] = src["month"].astype(str)
src["day"] = src["day"].astype(str)
src["hour"] = src["hour"].astype(str)

### Make a copy of the dataframe and ensure the text of the tweet contains 'doomscrolling'

In [14]:
doomscrolling = src[src["content"].str.contains("doomscrolling")].copy()

### Unpack the nested json

In [15]:
data_dict = doomscrolling.to_dict("records")

In [16]:
doomscrolling["user_name"] = pd.json_normalize(doomscrolling["user"])["username"]
doomscrolling["user_displayname"] = pd.json_normalize(doomscrolling["user"])[
    "displayname"
]
doomscrolling["user_verified"] = pd.json_normalize(doomscrolling["user"])["verified"]
doomscrolling["user_description"] = pd.json_normalize(doomscrolling["user"])[
    "description"
]
doomscrolling["user_followersCount"] = pd.json_normalize(doomscrolling["user"])[
    "followersCount"
]
doomscrolling["user_location"] = pd.json_normalize(doomscrolling["user"])["location"]

### Slim down the dataframe

In [17]:
doomscrolling_slim = doomscrolling[
    [
        "date",
        "content",
        "url",
        "replyCount",
        "retweetCount",
        "likeCount",
        "quoteCount",
        "user_name",
        "user_displayname",
        "user_verified",
        "user_description",
        "user_followersCount",
        "user_location",
        "month",
        "day",
        "hour",
        "minute",
        "time",
    ]
]

### How many 'doomscrolling' tweets?

In [18]:
len(doomscrolling_slim)

74607

### How many by @karenkho? 

In [19]:
len(doomscrolling_slim[doomscrolling_slim["user_name"] == "karenkho"])

365

In [20]:
doomscrolling_slim.head()

Unnamed: 0,date,content,url,replyCount,retweetCount,likeCount,quoteCount,user_name,user_displayname,user_verified,user_description,user_followersCount,user_location,month,day,hour,minute,time
1,2021-01-24,Are you still doomscrolling? #DoomScrolling,https://twitter.com/Gingerbsoapbox/status/1353492225309634560,0,0,0,0,eclecticeden,Ollie King 🏳️‍🌈,False,Artisanal-coffee-shop-craft-beer-snob-cabincore bad. He/him/his.,699.0,"Shoreditch, London",1,24,15,57,15:57:47
3,2021-01-24,"Is doomswiping a thing? Like doomscrolling, only spending too much time hopelessly swiping right on people you know you're never going to match with.",https://twitter.com/eclecticeden/status/1353489590712229890,0,0,0,0,SproBeforeBros,Sprodo Baggins,False,Jackson O'Brien. Expert. Biscuit Troll. He/him.,901.0,"Minneapolis, Minnesprota",1,24,15,47,15:47:19
4,2021-01-24,"Whelp. Back to shouting about politics, posting unoriginal and mediocre jokes and doomscrolling.",https://twitter.com/CatFishJohnny/status/1353488676760727553,3,0,8,0,boldtransDev,boldtransgamedev,False,"23 (she/her) Trans gal looking to do game dev awesomeness!\nLevel Designer, Game Designer, Programmer. I love everything about the diff fields in game dev",239.0,,1,24,15,43,15:43:41
5,2021-01-24,If you would like to interrupt your doomscrolling on Monday evening and know a lot about dead actors you can come and play trivia with me on the internet! It's a real good time! 7:00 Central at this link: https://t.co/RQ7Mb2meTA,https://twitter.com/SproBeforeBros/status/1353487523234852864,1,0,3,0,the__archmage,ΛRCHMΛGΞ,False,"Part time Streamer, full time Mage",34.0,Azeroth,1,24,15,39,15:39:06
6,2021-01-24,Oh its one of those days where half of game dev twitter becomes football twitter. Its the best way for me to stop doomscrolling and log off cause I can't understand any of it,https://twitter.com/boldtransDev/status/1353487111199072256,0,0,3,0,LukeDoesStuff,luke schwartz is nice now,True,for booking inquiries: yes please!!,1691.0,City of Angles,1,24,15,37,15:37:28


---

### Export

In [21]:
doomscrolling_slim.to_csv("output/search/doomscrolling_slim.csv", index=False)