In [1]:
import pandas as pd
import numpy as np
import snscrape.modules.twitter as sntwitter
import datetime
from tqdm.notebook import tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "whitegrid" theme to the plots drawn in this notebook.
sns.set_theme(style="whitegrid")

# Taking User Input

In [2]:
# Collect the search parameters interactively.
# Free-text answers are stripped so that a stray space is not mistaken for a value
# (a blank answer must stay exactly '' for the defaults in search() to apply).
text = input('Enter query text to be matched (or leave it blank by pressing enter)').strip()
username = input('Enter specific username(s) from a twitter account without @ (or leave it blank by pressing enter): ').strip()
since = input('Enter startdate in this format yyyy-mm-dd (or leave it blank by pressing enter): ').strip()
until = input('Enter enddate in this format yyyy-mm-dd (or leave it blank by pressing enter): ').strip()
# int() raises ValueError on a blank or non-numeric answer; -1 means "no limit".
count = int(input('Enter max number of tweets or enter -1 to retrieve all possible tweets: '))
# The y/n answers are compared against lower-case 'y' later, so normalise them
# here; otherwise 'Y' or 'y ' would silently be treated as "no".
retweet = input('Exclude Retweets? (y/n): ').strip().lower()
replies = input('Exclude Replies? (y/n): ').strip().lower()

Enter query text to be matched (or leave it blank by pressing enter)#tesla
Enter specific username(s) from a twitter account without @ (or leave it blank by pressing enter): 
Enter startdate in this format yyyy-mm-dd (or leave it blank by pressing enter): 2018-01-01
Enter enddate in this format yyyy-mm-dd (or leave it blank by pressing enter): 2020-12-31
Enter max number of tweets or enter -1 to retrieve all possible tweets: -1
Exclude Retweets? (y/n): y
Exclude Replies? (y/n): y


# Tweet Fields That Can Be Scraped with snscrape

- url: str
- date: datetime.datetime
- rawContent: str
- renderedContent: str
- id: int
- user: 'User'
- replyCount: int
- retweetCount: int
- likeCount: int
- quoteCount: int
- conversationId: int
- lang: str
- source: str
- sourceUrl: typing.Optional[str] = None
- sourceLabel: typing.Optional[str] = None
- links: typing.Optional[typing.List['TextLink']] = None
- media: typing.Optional[typing.List['Medium']] = None
- retweetedTweet: typing.Optional['Tweet'] = None
- quotedTweet: typing.Optional['Tweet'] = None
- inReplyToTweetId: typing.Optional[int] = None
- inReplyToUser: typing.Optional['User'] = None
- mentionedUsers: typing.Optional[typing.List['User']] = None
- coordinates: typing.Optional['Coordinates'] = None
- place: typing.Optional['Place'] = None
- hashtags: typing.Optional[typing.List[str]] = None
- cashtags: typing.Optional[typing.List[str]] = None
- card: typing.Optional['Card'] = None


In [3]:
def search(text,username,since,until,retweet,replies):
    """Build a Twitter search query and choose the CSV file name for its results.

    Parameters
    ----------
    text : str
        Free-text part of the query; may be empty.
    username : str
        Account name without ``@``. When non-empty a ``from:`` filter is added.
    since, until : str
        Dates formatted ``yyyy-mm-dd``. A blank ``until`` becomes today's date;
        a blank ``since`` becomes seven days before ``until``.
    retweet, replies : str
        ``'y'`` (any case, surrounding whitespace ignored) adds
        ``exclude:retweets`` / ``exclude:replies`` to the query.

    Returns
    -------
    str
        The assembled query string.

    Raises
    ------
    ValueError
        If ``since`` is blank and ``until`` is not a valid ``yyyy-mm-dd`` date.

    Side effects
    ------------
    Assigns the module-level ``filename`` and prints it.
    """
    global filename
    date_format = '%Y-%m-%d'

    q = text
    if username != '':
        q += f" from:{username}"

    if until == '':
        until = datetime.date.today().strftime(date_format)
    q += f" until:{until}"

    if since == '':
        week_before = datetime.datetime.strptime(until, date_format) - datetime.timedelta(days=7)
        since = week_before.strftime(date_format)
    q += f" since:{since}"

    if retweet.strip().lower() == 'y':
        q += " exclude:retweets"
    if replies.strip().lower() == 'y':
        q += " exclude:replies"

    # File name layout: <since>_<until>[_<username>][_<text>].csv
    # The text part is kept (even when empty) if no username was given.
    name_parts = [since, until]
    if username != '':
        name_parts.append(username)
    if text != '' or username == '':
        name_parts.append(text)

    # Queries routinely contain characters such as '/', ':' or '"' that are path
    # separators or invalid in file names on common platforms; replace them so
    # that writing the CSV cannot fail or land in an unexpected directory.
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    filename = "_".join(name_parts).translate(unsafe_chars) + ".csv"

    print(filename)
    return q

In [None]:
# Column names of the scraped-tweet table; the order must match tweet_to_row().
TWEET_COLUMNS = ['DateTime', 'TweetId', 'Text', 'Username','Language',
                 'Hashtags','ReplyCount','RetweetCount','LikeCount','QuoteCount','Media']


def tweet_to_row(tweet):
    """Return the fields kept from one scraped tweet, in TWEET_COLUMNS order."""
    return [tweet.date, tweet.id, tweet.rawContent, tweet.user.username, tweet.lang,
            tweet.hashtags, tweet.replyCount, tweet.retweetCount, tweet.likeCount,
            tweet.quoteCount, tweet.media]


q = search(text,username,since,until,retweet,replies)

# count == -1 means "retrieve everything"; total=None gives an open-ended progress bar.
max_tweets = None if count == -1 else count

# Collect one row per tweet; a single loop serves both the limited and unlimited case
# so the row layout cannot drift between two copies of the same code.
tweets_list1 = []
with tqdm_notebook(total=max_tweets) as pbar:
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(q).get_items()):
        if max_tweets is not None and i >= max_tweets:
            break
        tweets_list1.append(tweet_to_row(tweet))
        pbar.update(1)

# Build the DataFrame once from the accumulated rows.
tweets_df1 = pd.DataFrame(tweets_list1, columns=TWEET_COLUMNS)

In [None]:
# Show the tweets newest-first. The sorted copy is only displayed, not assigned,
# so tweets_df1 itself keeps its original row order.
tweets_df1.sort_values(by='DateTime',ascending=False)

# Data Preprocessing

In [None]:
# Print a summary of the scraped frame before deriving further columns.
tweets_df1.info()

# Adding more columns for time-series analysis

In [None]:
# Calendar components derived from the tweet timestamp, one new column each.
timestamp = tweets_df1['DateTime'].dt
tweets_df1['Hour'] = timestamp.hour
tweets_df1['Year'] = timestamp.year
tweets_df1['Month'] = timestamp.month
tweets_df1['MonthName'] = timestamp.month_name()
tweets_df1['MonthDay'] = timestamp.day
tweets_df1['DayName'] = timestamp.day_name()
# ISO week number of the year.
tweets_df1['Week'] = timestamp.isocalendar().week

# Splitting timestamp column into separate date and time columns 

In [None]:
# Split the timestamp into a calendar date and a time of day using the
# vectorised .dt accessor instead of a Python-level loop over every row.
tweets_df1['Date'] = tweets_df1['DateTime'].dt.date
tweets_df1['Time'] = tweets_df1['DateTime'].dt.time

# Dropping DateTime Column

In [None]:
# Remove the raw timestamp now that its parts live in separate columns.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and the drop is a no-op instead of raising KeyError.
tweets_df1 = tweets_df1.drop(columns='DateTime', errors='ignore')
tweets_df1

# Saving as CSV file

In [None]:
# Write the processed tweets to the file name chosen by search(), without the index column.
tweets_df1.to_csv(filename, index=False)

# Loading Saved CSV file

In [None]:
# Read back the CSV written by the save step. `filename` is assigned by search();
# point csv_path at another existing file to analyse a previously saved scrape.
# (An empty path, as used before, always makes read_csv fail.)
csv_path = filename
tweets = pd.read_csv(csv_path)
tweets

# Visualizing Count By Month

In [None]:
# Bar chart of tweet counts per month, drawn on an explicit Axes.
f, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x=tweets['Month'], ax=ax)
# Label every bar with its count, slightly right of the bar's left edge and above its top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(int(bar_height), (bar.get_x() + 0.05, bar_height + 20), fontsize=12)

In [None]:
# Four views of the per-month tweet distribution on one 2x2 figure.
fig, axes = plt.subplots(2, 2, figsize=(15, 8))
month_ticks = np.arange(1, 13, 1)

# Top-left: tweets per month as a line.
ax = axes[0, 0]
sns.lineplot(data=tweets.Month.value_counts(), ax=ax)
ax.set_xlabel("Month")
ax.set_ylabel('Count')
ax.set_xticks(month_ticks)

# Top-right: one histogram bin per month with a KDE overlay.
# kde takes a boolean; the string 'true' only worked because non-empty strings are truthy.
sns.histplot(x=tweets.Month, stat='count', binwidth=1, kde=True, discrete=True, ax=axes[0, 1])
axes[0, 1].set_xticks(month_ticks)
# NOTE(review): a bare grid() call toggles the grid rather than enabling it; with the
# "whitegrid" theme set at import this presumably hides it on this panel. Confirm intent.
axes[0, 1].grid()

# Bottom-left: kernel density estimate with the default bandwidth.
sns.kdeplot(x=tweets.Month, fill=True, ax=axes[1, 0])
axes[1, 0].set_xticks(month_ticks)
axes[1, 0].grid()

# Bottom-right: smoother density (bandwidth scaled by 3).
sns.kdeplot(x=tweets.Month, fill=True, bw_adjust=3, ax=axes[1, 1])
axes[1, 1].set_xticks(month_ticks)
axes[1, 1].grid()

plt.tight_layout()
plt.show()

# Visualizing Count By Week

In [None]:
# Bar chart of tweet counts per week number, drawn on an explicit Axes.
f, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x=tweets['Week'], ax=ax)
# Label every bar with its count just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(int(bar_height), (bar.get_x() + 0.005, bar_height + 5), fontsize=10)

In [None]:
# Four views of the per-week tweet distribution on one 2x2 figure.
fig, axes = plt.subplots(2, 2, figsize=(15, 8))

# Top-left: tweets per week number as a line.
ax = axes[0, 0]
sns.lineplot(data=tweets.Week.value_counts(), ax=ax)
ax.set_xlabel("Week")
ax.set_ylabel('Count')

# Top-right: one histogram bin per week with a KDE overlay.
# kde takes a boolean; the string 'true' only worked because non-empty strings are truthy.
sns.histplot(x=tweets.Week, stat='count', binwidth=1, kde=True, discrete=True, ax=axes[0, 1])
# NOTE(review): a bare grid() call toggles the grid rather than enabling it; with the
# "whitegrid" theme set at import this presumably hides it on this panel. Confirm intent.
axes[0, 1].grid()

# Bottom-left: kernel density estimate with the default bandwidth.
sns.kdeplot(x=tweets.Week, fill=True, ax=axes[1, 0])
axes[1, 0].grid()

# Bottom-right: smoother density (bandwidth scaled by 3).
sns.kdeplot(x=tweets.Week, fill=True, bw_adjust=3, ax=axes[1, 1])
axes[1, 1].grid()

plt.tight_layout()
plt.show()

# Visualizing Count By MonthDay

In [None]:
# Bar chart of tweet counts per day of the month, drawn on an explicit Axes.
f, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x=tweets['MonthDay'], ax=ax)
# Label every bar with its count just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(int(bar_height), (bar.get_x() + 0.05, bar_height + 5), fontsize=12)

In [None]:
# Four views of the per-day-of-month tweet distribution on one 2x2 figure.
fig, axes = plt.subplots(2, 2, figsize=(15, 8))

# Top-left: tweets per day of the month as a line.
ax = axes[0, 0]
sns.lineplot(data=tweets.MonthDay.value_counts(), ax=ax)
ax.set_xlabel("MonthDay")
ax.set_ylabel('Count')

# Top-right: one histogram bin per day with a KDE overlay.
# kde takes a boolean; the string 'true' only worked because non-empty strings are truthy.
sns.histplot(x=tweets.MonthDay, stat='count', binwidth=1, kde=True, discrete=True, ax=axes[0, 1])
# NOTE(review): a bare grid() call toggles the grid rather than enabling it; with the
# "whitegrid" theme set at import this presumably hides it on this panel. Confirm intent.
axes[0, 1].grid()

# Bottom-left: kernel density estimate with the default bandwidth.
sns.kdeplot(x=tweets.MonthDay, fill=True, ax=axes[1, 0])
axes[1, 0].grid()

# Bottom-right: smoother density (bandwidth scaled by 3).
sns.kdeplot(x=tweets.MonthDay, fill=True, bw_adjust=3, ax=axes[1, 1])
axes[1, 1].grid()

plt.tight_layout()
plt.show()

# Visualizing Count By Hour

In [None]:
# Bar chart of tweet counts per hour of the day, drawn on an explicit Axes.
f, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x=tweets['Hour'], ax=ax)
# Label every bar with its count, slightly right of the bar's left edge and above its top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(int(bar_height), (bar.get_x() + 0.05, bar_height + 20), fontsize=12)

In [None]:
# Four views of the per-hour tweet distribution on one 2x2 figure.
fig, axes = plt.subplots(2, 2, figsize=(15, 8))
hour_ticks = np.arange(0, 24, 1)

# Top-left: tweets per hour of the day as a line.
ax = axes[0, 0]
sns.lineplot(data=tweets.Hour.value_counts(), ax=ax)
ax.set_xlabel("Hour")
ax.set_ylabel('Count')
ax.set_xticks(hour_ticks)

# Top-right: one histogram bin per hour with a KDE overlay.
# kde takes a boolean; the string 'true' only worked because non-empty strings are truthy.
sns.histplot(x=tweets.Hour, stat='count', binwidth=1, kde=True, discrete=True, ax=axes[0, 1])
axes[0, 1].set_xticks(hour_ticks)
# NOTE(review): a bare grid() call toggles the grid rather than enabling it; with the
# "whitegrid" theme set at import this presumably hides it on this panel. Confirm intent.
axes[0, 1].grid()

# Bottom-left: kernel density estimate with the default bandwidth.
sns.kdeplot(x=tweets.Hour, fill=True, ax=axes[1, 0])
axes[1, 0].set_xticks(hour_ticks)
axes[1, 0].grid()

# Bottom-right: smoother density (bandwidth scaled by 3); this panel keeps the
# automatically chosen x ticks.
sns.kdeplot(x=tweets.Hour, fill=True, bw_adjust=3, ax=axes[1, 1])
axes[1, 1].grid()

plt.tight_layout()
plt.show()