# Get Tweets

This script extracts all the tweets with hashtag #covid-19 related to the day before today (yesterday) and saves them into a .csv file.
We use the `tweepy` library, which can be installed with the command `pip install tweepy`.

Firstly, we import the configuration file, called `config.py`, which is located in the same directory of this script.

In [1]:
import os
jv = os.environ.get('JAVA_HOME', None)

# import findspark
# findspark.init()

# os.environ['PYSPARK_SUBMIT_ARGS'] = \
# '--packages com.johnsnowlabs.nlp:spark-nlp_2.12:3.0.0 pyspark-shell'
# # '--packages org.postgresql:postgresql:42.1.1 pyspark-shell'


In [2]:
import sys, glob, os
# sys.path.extend(glob.glob(os.path.join(os.path.expanduser("~"), ".ivy2/jars/*.jar")))
sys.path

['/home/hadoopuser/Documents/twitter_search_api',
 '/home/hadoopuser/.vscode/extensions/ms-toolsai.jupyter-2021.6.999662501/pythonFiles/vscode_datascience_helpers',
 '/home/hadoopuser/.vscode/extensions/ms-toolsai.jupyter-2021.6.999662501/pythonFiles',
 '/home/hadoopuser/.vscode/extensions/ms-toolsai.jupyter-2021.6.999662501/pythonFiles/lib/python',
 '/opt/anaconda/envs/pyspark_env/lib/python37.zip',
 '/opt/anaconda/envs/pyspark_env/lib/python3.7',
 '/opt/anaconda/envs/pyspark_env/lib/python3.7/lib-dynload',
 '',
 '/opt/anaconda/envs/pyspark_env/lib/python3.7/site-packages',
 '/opt/anaconda/envs/pyspark_env/lib/python3.7/site-packages/IPython/extensions',
 '/home/hadoopuser/.ipython']

In [92]:
import mypy
from config import *
import tweepy
import datetime

import logging

logger = logging.getLogger('tweets_search')

In [4]:
import pandas as pd
# import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [5]:
print(f"logger.root.level = {logger.root.level}, logger.root.name = {logger.root.name}")
print(f"logger.name = {logger.name}")

logger.root.level = 30, logger.root.name = root
logger.name = tweets_search


In [7]:
# Log line layout: timestamp - level - message.
# Kept under its own name so the builtin `format` function is neither shadowed
# nor accidentally passed to basicConfig when this cell runs on a fresh kernel.
log_format = "%(asctime)s - %(levelname)s - %(message)s"
# logging.basicConfig(format=log_format, stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(format=log_format, stream=sys.stdout, level=logging.INFO)

In [6]:
print(logger.root.level)

30


In [17]:
# logger.root.level = 10

In [18]:
# print(logger.root.level)

10


We set up the connection to our Twitter App by using the `OAuthHandler()` class and its `set_access_token()` method. Then we create the Twitter API client through the `API()` class.

In [7]:
auth = tweepy.OAuthHandler(TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET)
auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET)
api = tweepy.API(auth,wait_on_rate_limit=True, wait_on_rate_limit_notify = True)

In [10]:
# api.me()

In [11]:
# api.rate_limit_status()

## setup dates (recent 7 days max)


If today is 2021-06-26 then :

1. `time_frame = {timedelta:'2'}` (we get tweets from 2021-06-24 up to 2021-06-25 (today - 1 day))
2. `time_frame = {since:'2021-06-23', timedelta:'2'}` 
3. `time_frame = {until:'2021-06-25', timedelta:'2'}` (2 & 3 & 4 expressions are equivalent)
4. `time_frame = {since:'2021-06-23', until:'2021-06-25'}` -> we get tweets from 2021-06-23 up to 2021-06-24

`note:` from today we can get a time_frame of 7 days max, i.e since 2021-06-19

In [30]:
today = datetime.date.today()
since= today - datetime.timedelta(days=1)
until= today
until, since

logger.warning(f"full_text: '{until, since}'")

full_text: '(datetime.date(2021, 6, 26), datetime.date(2021, 6, 25))'


In [117]:

def time_frame(until=None, since=None, timedelta=0):
    """Work-in-progress helper that echoes its arguments and parses `timedelta`.

    Prints the three arguments. When `timedelta` is a string it is printed
    again and converted to an int; a successful conversion is logged as a
    warning, a failed one prints an error message. Nothing is returned.

    TODO: derive `since`/`until` from `timedelta` (1-7 days back from today).
    """
    print(until, since, timedelta)

    # Only string input needs parsing; anything else is left untouched.
    if not isinstance(timedelta, str):
        return

    print(timedelta)
    try:
        days = int(timedelta)
        logger.warning(f"timedelta: '{days}'")
    except (ValueError, TypeError):
        print(f'ValueError: "timedelta = {timedelta}". It must be an int')

In [118]:

timedelta = '2'
print(timedelta)
time_frame(timedelta = timedelta)
# print(timedelta)
# time_frame(timedelta = 'a')

timedelta: '2'
2
None None 2
2


### type checking with mypy

In [100]:

def headline(text: str, align: bool = True) -> str:
    """Format `text` as a title-cased headline.

    With a truthy `align` the title is padded with spaces and centred in a
    50-character line filled with "o"; otherwise it is returned underlined
    with dashes on a second line.
    """
    title = text.title()
    if align:
        return f" {title} ".center(50, "o")
    underline = '-' * len(text)
    return f"{title}\n{underline}"

print(headline("python type checking", False))
print(headline("use mypy", align="center"))

Python Type Checking
--------------------
oooooooooooooooooooo Use Mypy oooooooooooooooooooo


In [98]:
help(headline)

Help on function headline in module __main__:

headline(text: str, align: bool = True) -> str



In [6]:
from __future__ import annotations
import datetime
from typing import List, Set, Dict, Tuple, Text, Optional, AnyStr
import mypy 
def time_frame1(
    until: Optional[datetime.date] = None,
    since: Optional[datetime.date] = None,
    timedelta: int = 0,
) -> Tuple[Optional[datetime.date], Optional[datetime.date]]:
    """Resolve the search window as an ``(until, since)`` pair of dates.

    Missing bounds are derived only when ``timedelta`` is an int in 1..7:

    * neither bound given  -> ``until`` is today, ``since`` is today - timedelta
    * only ``since`` given -> ``until`` is since + timedelta, provided ``since``
      lies 1..7 days before today
    * only ``until`` given -> ``since`` is until - timedelta, provided the
      resulting ``since`` lies 1..7 days before today

    In every other case the arguments are returned unchanged (possibly None).

    Parameters:
        until: end date of the window, or None.
        since: start date of the window, or None.
        timedelta: window length in days.

    Returns:
        Tuple ``(until, since)``.
    """
    today = datetime.date.today()

    if isinstance(timedelta, int) and 0 < timedelta <= 7:
        span = datetime.timedelta(days=timedelta)
        if until is None and since is None:
            until = today
            since = today - span
        elif until is None and isinstance(since, datetime.date):
            if 0 < (today - since).days <= 7:
                until = since + span
        elif since is None and isinstance(until, datetime.date):
            # Mirror of the branch above: derive the start from the end date.
            candidate = until - span
            if 0 < (today - candidate).days <= 7:
                since = candidate
    return until, since

In [8]:
today = datetime.date.today()

In [9]:
time_frame1(since= today - datetime.timedelta(days=3), timedelta=2)

(datetime.date(2021, 6, 28), datetime.date(2021, 6, 26))

In [155]:
# until:datetime = None
until:datetime = datetime.date.today()
since:datetime=None
timedelta=0

if until is  None:
    print('yes is None')

if isinstance(until, datetime.date):
    print('yes date')

yes date


In [122]:
help(time_frame1)

Help on function time_frame1 in module __main__:

time_frame1(until: 'datetime' = None, since: 'datetime' = None, timedelta=0) -> 'tuple[datetime]'



In [128]:
until, since = time_frame1(timedelta=2)
print(until, since)

2021-06-26 2021-06-24


In [88]:
def evaluate_arguments_to_calculate_slope(point):
    """
    Evaluate three conditions of point to see if we can later use this point to calculate the slope of a line

    Keyword arguments:
    point -- tuple or list of x-y coordinates of a point

    Returns a list of (condition name, boolean status) tuples, in this order:
    "tuple or list data structure", "two values in data structures",
    "ints or floats". Input that cannot be iterated (e.g. None or a bare
    number) is reported as failing every condition instead of raising.
    """
    # validate the data structure is a list or tuple
    is_sequence = isinstance(point, (tuple, list))

    # Materialise the values once; a TypeError here means `point` is not iterable at all.
    try:
        values = list(point)
    except TypeError:
        values = None

    # validate there are two values in that data structure
    has_two_values = values is not None and len(values) == 2

    # validate every value is a float or an int (vacuously True for an empty container)
    all_numeric = values is not None and all(
        isinstance(value, (float, int)) for value in values
    )

    return [
        ("tuple or list data structure", is_sequence),
        ("two values in data structures", has_two_values),
        ("ints or floats", all_numeric),
    ]

In [84]:
def all_argument_conditions_met(condition_results):
    """
    Evaluate the boolean statuses of a set of conditions

    Keyword arguments:
    condition_results -- list of tuples of (condition name, boolean status)

    Returns True unless at least one status is exactly False.
    """
    # Collect every entry whose status is the literal False (identity check, not falsiness).
    failed = [entry for entry in condition_results if entry[1] is False]
    return not failed

In [91]:
point_one = (1,2)
cont = evaluate_arguments_to_calculate_slope(point_one)
print(cont)

[('tuple or list data structure', True), ('two values in data structures', True), ('ints or floats', True)]


In [13]:
logger.debug(f"full_text: '{until, since}'")

We search for tweets on Twitter by using the `Cursor()` function. 
We pass the `api.search` parameter to the cursor, as well as the query string, which is specified through the `q` parameter of the cursor.
The query string can receive many parameters, such as the following (not mandatory) ones:
* `from:` - to specify a specific Twitter user profile
* `since:` - to specify the beginning date of search
* `until:` - to specify the ending date of search
The cursor can also receive other parameters, such as the language and the `tweet_mode`. If `tweet_mode='extended'`, all the text of the tweet is returned, otherwise only the first 140 characters.

In [None]:
# # example 
# code tweets = tweepy.Cursor(api.search, tweet_mode=’extended’) 
# for tweet in tweets:
#     content = tweet.full_text

In [None]:
# tweets_list = tweepy.Cursor(api.search, q="#Covid-19 since:" + str(yesterday)+ " until:" + str(today),tweet_mode='extended', lang='en').items()

In [None]:
# tweets_list = tweepy.Cursor(api.search, q=f"#Covid-19 since:{str(yesterday)} until:{str(today)}",tweet_mode='extended', lang='en').items()

### Get all related tweets from `date:since` to `date:until` (not included)

In [131]:
# tweets_list = tweepy.Cursor(api.search, q=['astrazeneca', 'pfizer'],since= str(since), until=str(until),tweet_mode='extended', lang='en').items()

# Greek Language = el
# tweets_list = tweepy.Cursor(api.search, q=['coffee island'],since= str(since), until=str(until),tweet_mode='extended', lang='el').items()

# English Language = en
# tweets_list = tweepy.Cursor(api.search, q=['coffee island OR CoffeeIsland'],since= str(since), until=str(until),tweet_mode='extended', lang='en').items()

# `q` must be one query string: a list is sent as its repr (brackets and quotes
# included), as the logged request parameters of an earlier run show.
# The start date goes into the query via the `since:` operator; `until` remains
# a request parameter. NOTE(review): assumes the standard search endpoint does
# not accept a `since` request parameter - confirm against the API reference.
tweets_list = tweepy.Cursor(
    api.search,
    q=f"astrazeneca OR pfizer since:{since}",
    until=str(until),
    tweet_mode='extended',
    lang='en',
).items()

Now we loop across the `tweets_list`, and, for each tweet, we extract the text, the creation date, the number of retweets and the favourite count. We store every tweet into a list, called `output`.

In [26]:
# import time
# seconds = 5
# start = time.time()
# time.sleep(seconds)
# end = time.time()
# logger.info(f"elapsed_time: '{end - start}'")

2021-06-25 03:00:56,957 - INFO - elapsed_time: '5.004919528961182'


---
# TEST

In [17]:
# tweets_list2 = tweepy.Cursor(api.search, q=['pfizer','astrazeneca'],since= str(since), until=str(until),tweet_mode='extended', lang='en').items(2)

import time
start = time.time()
output = []
for tweet in tweets_list:
    # text = tweet._json["full_text"]
    #print(text) 
    # https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/api-reference/get-search-tweets           
    # "geo": null,"coordinates": null,"place": null,"contributors": null,
    # "is_quote_status": false,"retweet_count": 988,"favorite_count": 3875,
    # "favorited": false,"retweeted": false,"possibly_sensitive": false,"lang": "en"
    # https://developer.twitter.com/en/docs/twitter-ids
    logger.info(f"tweet_id_str: {'-'*30}")
    logger.info(f"created_at: {tweet.created_at}")
    logger.info(f"full_text: {tweet._json['full_text']}")
    logger.info(f"tweet_id: {tweet.id}")
    logger.info(f"tweet_id_str: {tweet.id_str}")
    logger.info(f"user: {tweet._json['user']['name']}")
    logger.info(f"user_id: {tweet._json['user']['id']}")    
    # favourite_count = tweet.favorite_count
    # retweet_count = tweet.retweet_count
    # created_at = tweet.created_at
    
#     line = {'text' : text, 'favourite_count' : favourite_count, 'retweet_count' : retweet_count, 'created_at' : created_at}
#     output.append(line)
#     logger.info(f"Append list length : { len(output)}")
# end = time.time()
# logger.info(f"elapsed_time: '{end - start}'")

2021-06-24 23:18:06,420 - DEBUG - PARAMS: {'q': b"['coffee island']", 'since': b'2021-06-22', 'until': b'2021-06-24', 'tweet_mode': b'extended', 'lang': b'gr'}
2021-06-24 23:18:06,428 - DEBUG - Signing request <PreparedRequest [GET]> using client <Client client_key=<redacted>, client_secret=****, resource_owner_key=<redacted>, resource_owner_secret=****, signature_method=HMAC-SHA1, signature_type=AUTH_HEADER, callback_uri=None, rsa_key=None, verifier=None, realm=None, encoding=utf-8, decoding=None, nonce=None, timestamp=None>
2021-06-24 23:18:06,429 - DEBUG - Including body in call to sign: False
2021-06-24 23:18:06,442 - DEBUG - Collected params: [('q', "['coffee island']"), ('since', '2021-06-22'), ('until', '2021-06-24'), ('tweet_mode', 'extended'), ('lang', 'gr'), ('oauth_nonce', '89097120203846973971624565886'), ('oauth_timestamp', '1624565886'), ('oauth_version', '1.0'), ('oauth_signature_method', 'HMAC-SHA1'), ('oauth_consum

---

In [132]:
import time

# Walk the search cursor once, keeping only the fields needed downstream,
# and time the whole download.
# Tweet object fields:
# https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/api-reference/get-search-tweets
start = time.time()
output = []
for tweet in tweets_list:
    text = tweet._json["full_text"]
    logger.debug(f"full_text: '{text}'")
    output.append(
        {
            'text': text,
            'favourite_count': tweet.favorite_count,
            'retweet_count': tweet.retweet_count,
            'created_at': tweet.created_at,
        }
    )
    logger.info(f"Append list length : { len(output)}")
end = time.time()
logger.info(f"elapsed_time: '{end - start}'")

In [133]:
output

ext': '@AdyLady9969 I got my first Pfizer dose at Claremont about two weeks agom due for my second Thursday week. I had no side effects aside from a sore arm for two days. The Pfizer is fine.',
  'favourite_count': 0,
  'retweet_count': 0,
  'created_at': datetime.datetime(2021, 6, 25, 23, 59, 6)},
 {'text': "RT @Habia_2: Israeli researchers: 'Rare autoimmune disease' linked to Pfizer Covid-19 vaccine https://t.co/zaG3ahBQUe",
  'favourite_count': 0,
  'retweet_count': 9,
  'created_at': datetime.datetime(2021, 6, 25, 23, 59, 2)},
 {'text': 'RT @ke11ybender: Maddie is a Pfizer trial participant that is now disabled. https://t.co/s8EaHi99GO',
  'favourite_count': 0,
  'retweet_count': 546,
  'created_at': datetime.datetime(2021, 6, 25, 23, 59, 1)},
  'favourite_count': 0,
  'retweet_count': 75,
  'created_at': datetime.datetime(2021, 6, 25, 23, 59)},
 {'text': 'Get your first dose of the Pfizer #COVID19 Vaccine from 10am-2pm TOMORROW at Humphrey Middle School in Bolingbrook.\n\nVisit ht

In [71]:
len(output)

685

---
### create pdf from list

In [85]:
pdf = pd.DataFrame(output)


In [86]:
pdf.shape

(685, 4)

In [87]:
pdf.head(2)

Unnamed: 0,text,favourite_count,retweet_count,created_at
0,The #Nanaimo 7-10 Club is running a cooling ce...,0,0,2021-06-25 23:37:24
1,@LegendaryEnergy Barrel aged puna coffee from ...,0,0,2021-06-25 22:49:10


In [88]:
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   text             685 non-null    object        
 1   favourite_count  685 non-null    int64         
 2   retweet_count    685 non-null    int64         
 3   created_at       685 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 21.5+ KB


Selecting specific columns in a pandas dataframe

In [89]:
pdf[['text', 'created_at']].head(2)

Unnamed: 0,text,created_at
0,The #Nanaimo 7-10 Club is running a cooling ce...,2021-06-25 23:37:24
1,@LegendaryEnergy Barrel aged puna coffee from ...,2021-06-25 22:49:10


In [90]:
pdf.count()

text               685
favourite_count    685
retweet_count      685
created_at         685
dtype: int64

In [91]:
pdf[['text', 'created_at']].groupby('created_at').first().count()

text    668
dtype: int64

In [83]:
# pdf['created_at'] = pdf['created_at'].dt.date

In [84]:
# pdf[['text', 'created_at']].groupby('created_at').first()

Unnamed: 0_level_0,text
created_at,Unnamed: 1_level_1
2021-06-19,@BelTel The Government of Ireland can go to an...
2021-06-20,RT @RottenInDenmark: The indigenous population...
2021-06-21,"RT @cathsherman: #Bahamas Fishing Pier, Great ..."
2021-06-22,@iwantbrewster Commission someone to draw Brew...
2021-06-23,@Schreibernews We are too.\nBig hoopla when Pl...
2021-06-24,Stop by the @nespressousa store after my docto...
2021-06-25,The #Nanaimo 7-10 Club is running a cooling ce...


In [65]:
# pdf[['text', 'created_at']].groupby('created_at').first().count()

text    7
dtype: int64

In [66]:
# pdf[['text', 'created_at']].groupby('created_at').count()

Unnamed: 0_level_0,text
created_at,Unnamed: 1_level_1
2021-06-19,94
2021-06-20,83
2021-06-21,79
2021-06-22,183
2021-06-23,52
2021-06-24,80
2021-06-25,114


---
### save and read pdf without header

`index` = By default, when your data is saved, Pandas will include your index. This can be very annoying because when you load up your data again, your index will be there as a new column. I highly recommend setting `index`= false unless you have a specific reason not to.

`header` = Say you wanted to switch your column names, then you can specify what you want your columns to be called here. This should be a list of the same length as the number of columns in your data.

In [92]:
pdf.to_csv('output_cof_island.csv', header = False, index = False )
#df.to_csv('output.csv') #mode='a', 

In [93]:
# Column names are supplied explicitly because the file is read as headerless.
pdf2 = pd.read_csv(
    'output_cof_island.csv',
    names=['text', 'favourite_count', 'retweet_count', 'created_at'],
    parse_dates=['created_at'],
)
# Show the frame that was just loaded (previously this displayed `pdf`).
pdf2.head(5)

Unnamed: 0,text,favourite_count,retweet_count,created_at
0,The #Nanaimo 7-10 Club is running a cooling ce...,0,0,2021-06-25 23:37:24
1,@LegendaryEnergy Barrel aged puna coffee from ...,0,0,2021-06-25 22:49:10
2,Back on the saddle with a big tall glass of co...,2,0,2021-06-25 22:46:38
3,"RT @cathsherman: #Bahamas Fishing Pier, Great ...",0,31,2021-06-25 22:17:23
4,@SuperWeenieHtJr You in Rhode Island? You need...,1,0,2021-06-25 21:31:06


In [94]:
pdf2.tail(3)

Unnamed: 0,text,favourite_count,retweet_count,created_at
682,RT @CatherineSher10: Dauphin Island Sunset\nCl...,0,62,2021-06-19 01:12:39
683,RT @CatherineSher10: A statue of Hawaiian King...,0,28,2021-06-19 00:20:52
684,Small Town Coffee! Loving the island life! htt...,0,0,2021-06-19 00:19:21


In [95]:
pdf2.shape

(685, 4)

---
## save a pdf to a csv file with header

In [46]:
#df = pd.DataFrame(output)
# Overwrite rather than append: appending with header=True would add a header
# row after whatever rows the file already holds, leaving a file that no longer
# parses as a single table.
pdf.to_csv('output_cof_island.csv', header=True, index=False)

---
## Create a pdf from a csv file with header

---
### pdf

In [39]:
pdf_cof_island2 = pd.read_csv('output_cof_island.csv', parse_dates=['created_at',])
pdf_cof_island2.head(5)

Unnamed: 0,text,favourite_count,retweet_count,created_at
0,RT @NikosPachilas: Mikel και Coffee Island αντ...,0,2,2021-06-24 12:26:01
1,RT @NikosPachilas: Mikel και Coffee Island αντ...,0,2,2021-06-24 12:20:35
2,RT @RealTime_eu: Mikel και Coffee Island ανταγ...,0,2,2021-06-24 12:20:27
3,RT @RealTime_eu: Mikel και Coffee Island ανταγ...,0,2,2021-06-24 12:19:07
4,Mikel και Coffee Island ανταγωνίζονται στην αγ...,2,2,2021-06-24 12:15:01


In [23]:
pdf_cof_island2.shape

(14, 4)

In [25]:
pdf_cof_island2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   text             14 non-null     object        
 1   favourite_count  14 non-null     int64         
 2   retweet_count    14 non-null     int64         
 3   created_at       14 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 576.0+ bytes


#### Convert a pdf datetime column to date

In [101]:
pdf2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text             685 non-null    object
 1   favourite_count  685 non-null    int64 
 2   retweet_count    685 non-null    int64 
 3   created_at       685 non-null    object
dtypes: int64(2), object(2)
memory usage: 21.5+ KB


In [96]:
# Going through pd.to_datetime first keeps this cell re-runnable: once the
# column holds plain `date` objects (object dtype) the `.dt` accessor alone fails.
pdf2['created_at'] = pd.to_datetime(pdf2['created_at']).dt.date

In [97]:
pdf2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text             685 non-null    object
 1   favourite_count  685 non-null    int64 
 2   retweet_count    685 non-null    int64 
 3   created_at       685 non-null    object
dtypes: int64(2), object(2)
memory usage: 21.5+ KB


In [99]:
pdf2['created_at'][0]

datetime.date(2021, 6, 25)

In [29]:
# pdf_cof_island2 = pd.to_csv('output_cof_island.csv', date_format='%Y-%m-%d')
# 
# pdf_cof_island2.head(5)

In [100]:
pdf2.describe()

Unnamed: 0,favourite_count,retweet_count
count,685.0,685.0
mean,4.975182,72.442336
std,68.720497,494.795372
min,0.0,0.0
25%,0.0,0.0
50%,0.0,1.0
75%,1.0,31.0
max,1737.0,8756.0


---
### def sentiment_scores 

In [103]:
import sys
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Lazily created, shared VADER analyzer: constructing one per call repeats the
# analyzer set-up for every row the function is applied to.
_VADER_ANALYZER = None


def sentiment_scores(sentance: str) -> dict:
    """Return the VADER polarity scores for one piece of text.

    Parameters:
        sentance: the text to score (parameter name kept for existing callers).

    Returns:
        The dictionary produced by `SentimentIntensityAnalyzer.polarity_scores`;
        the notebook reads its 'neg', 'neu', 'pos' and 'compound' keys.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER.polarity_scores(sentance)

---
### pdf

#### create a new column with sentiment_scores

In [104]:
#df3['rating'] = df3['text'].apply(sid.polarity_scores)

pdf2['rating'] = pdf2['text'].apply(sentiment_scores)

In [105]:
pdf2.head(2)

Unnamed: 0,text,favourite_count,retweet_count,created_at,rating
0,The #Nanaimo 7-10 Club is running a cooling ce...,0,0,2021-06-25,"{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp..."
1,@LegendaryEnergy Barrel aged puna coffee from ...,0,0,2021-06-25,"{'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp..."


In [33]:
pdf.tail(2)

Unnamed: 0,text,favourite_count,retweet_count,created_at,rating
6508,"RT @JohnRHewson: For the record, how many of o...",0,890,2021-06-19 00:00:27,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
6509,RT @DrEricDing: 3) “After years of reading res...,0,55,2021-06-19 00:00:08,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [106]:
pdf2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text             685 non-null    object
 1   favourite_count  685 non-null    int64 
 2   retweet_count    685 non-null    int64 
 3   created_at       685 non-null    object
 4   rating           685 non-null    object
dtypes: int64(2), object(3)
memory usage: 26.9+ KB


https://stackoverflow.com/questions/61608057/output-vader-sentiment-scores-in-columns-based-on-dataframe-rows-of-tweets

In [107]:
pdf2

Unnamed: 0,text,favourite_count,retweet_count,created_at,rating
0,The #Nanaimo 7-10 Club is running a cooling ce...,0,0,2021-06-25,"{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp..."
1,@LegendaryEnergy Barrel aged puna coffee from ...,0,0,2021-06-25,"{'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp..."
2,Back on the saddle with a big tall glass of co...,2,0,2021-06-25,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,"RT @cathsherman: #Bahamas Fishing Pier, Great ...",0,31,2021-06-25,"{'neg': 0.0, 'neu': 0.773, 'pos': 0.227, 'comp..."
4,@SuperWeenieHtJr You in Rhode Island? You need...,1,0,2021-06-25,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
...,...,...,...,...,...
680,RT @TheW0lfpup: Coffee island,0,20,2021-06-19,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
681,@Merkabagaming Man before we got it we’d go to...,1,0,2021-06-19,"{'neg': 0.0, 'neu': 0.797, 'pos': 0.203, 'comp..."
682,RT @CatherineSher10: Dauphin Island Sunset\nCl...,0,62,2021-06-19,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
683,RT @CatherineSher10: A statue of Hawaiian King...,0,28,2021-06-19,"{'neg': 0.0, 'neu': 0.83, 'pos': 0.17, 'compou..."


In [108]:
pdf2['negative_nltk']=[i['neg'] for i in pdf2.rating]
pdf2['positive_nltk']=[i['pos'] for i in pdf2.rating]
pdf2['neutral_nltk']=[i['neu'] for i in pdf2.rating]
pdf2['compound_nltk']=[i['compound'] for i in pdf2.rating]

pdf2.head(2)

Unnamed: 0,text,favourite_count,retweet_count,created_at,rating,negative_nltk,positive_nltk,neutral_nltk,ncompound_nltk
0,The #Nanaimo 7-10 Club is running a cooling ce...,0,0,2021-06-25,"{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp...",0.0,0.223,0.777,0.8172
1,@LegendaryEnergy Barrel aged puna coffee from ...,0,0,2021-06-25,"{'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp...",0.0,0.231,0.769,0.6369


In [109]:
# Spread the score dictionary held in `rating` into one numeric column per
# score, then drop the dictionary column itself.
for column, key in (
    ('negative_nltk', 'neg'),
    ('positive_nltk', 'pos'),
    ('neutral_nltk', 'neu'),
    ('compound_nltk', 'compound'),
):
    pdf2[column] = pdf2['rating'].apply(lambda scores, key=key: scores[key])

pdf2 = pdf2.drop(columns='rating')
pdf2.head(2)

Unnamed: 0,text,favourite_count,retweet_count,created_at,negative_nltk,positive_nltk,neutral_nltk,ncompound_nltk,compound_nltk
0,The #Nanaimo 7-10 Club is running a cooling ce...,0,0,2021-06-25,0.0,0.223,0.777,0.8172,0.8172
1,@LegendaryEnergy Barrel aged puna coffee from ...,0,0,2021-06-25,0.0,0.231,0.769,0.6369,0.6369


In [13]:
# Create the dataframe
# df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["letter", "name"])

# Function to get rows at `rownums`
# def getrows(df, rownums=None):
    # return df.rdd.zipWithIndex().filter(lambda x: x[1] in rownums).map(lambda x: x[0])

# Get rows at positions 0 and 2.
# getrows(df, rownums=[0, 2]).collect()

Show the last row's date

In [113]:
# Positional lookup of the last row of pdf2 itself; the previous form took the
# row number from a different frame (`pdf`) and only worked while both had the same length.
pdf2['created_at'].iloc[-1]

datetime.date(2021, 6, 19)

In [114]:
pdf2['text'].count()

685

In [115]:
pdf2.count()

text               685
favourite_count    685
retweet_count      685
created_at         685
negative_nltk      685
positive_nltk      685
neutral_nltk       685
ncompound_nltk     685
compound_nltk      685
dtype: int64

In [116]:
pdf2.count(axis=0)

text               685
favourite_count    685
retweet_count      685
created_at         685
negative_nltk      685
positive_nltk      685
neutral_nltk       685
ncompound_nltk     685
compound_nltk      685
dtype: int64

In [117]:
pdf2.shape[0]

685

In [118]:
len(pdf.index)

685

---

---
## Pandas groupBy and aggregate functions 

`GroupBy` allows you to group rows together based on some column value. For example, you could group sales data by the day the sale occurred, or group repeat-customer data by the name of the customer.

Once you've performed the GroupBy operation you can apply an aggregate function to that data. An `aggregate function` aggregates multiple rows of data into a single output, such as taking the sum of inputs, or counting the number of inputs.

**`Dataframe Aggregation`**

A set of methods for aggregations on a DataFrame:

    agg
    avg
    count
    max
    mean
    min
    pivot
    sum

### Rename a column

In [88]:
# Change name of a specific column
# pdf = pdf.rename(columns={'ncompound_nltk':'compound_nltk'})
# pdf.head(2)

Unnamed: 0,text,favourite_count,retweet_count,created_at,negative_nltk,positive_nltk,neutral_nltk,compound_nltk
0,@breakfasttv I received the Astrazeneca vaccin...,0,0,2021-06-20 23:59:37,0.049,0.199,0.752,0.7793
1,#Pfizer\n#AstraZeneca\n#Moderna\n#JohnsonAndJo...,3,5,2021-06-20 23:59:33,0.0,0.397,0.603,0.7639


In [121]:
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   text             685 non-null    object        
 1   favourite_count  685 non-null    int64         
 2   retweet_count    685 non-null    int64         
 3   created_at       685 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 21.5+ KB


### to_datetime

When a CSV file is imported into a DataFrame, date-time values are read as strings rather than date-time objects, which makes operations such as computing time differences difficult. The pandas `to_datetime()` function converts date-time strings into date-time objects.

In [122]:
pdf['created_at'] = pd.to_datetime(pdf['created_at'])


In [123]:
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   text             685 non-null    object        
 1   favourite_count  685 non-null    int64         
 2   retweet_count    685 non-null    int64         
 3   created_at       685 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 21.5+ KB


In [124]:
pdf.head(2)

Unnamed: 0,text,favourite_count,retweet_count,created_at
0,The #Nanaimo 7-10 Club is running a cooling ce...,0,0,2021-06-25 23:37:24
1,@LegendaryEnergy Barrel aged puna coffee from ...,0,0,2021-06-25 22:49:10


In [205]:
pdf['created_at'] = pdf['created_at'].dt.strftime('%Y-%m-%d')

In [206]:
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6510 entries, 0 to 6509
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   text             6510 non-null   object 
 1   favourite_count  6510 non-null   int64  
 2   retweet_count    6510 non-null   int64  
 3   created_at       6510 non-null   object 
 4   negative_nltk    6510 non-null   float64
 5   positive_nltk    6510 non-null   float64
 6   neutral_nltk     6510 non-null   float64
 7   compound_nltk    6510 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 407.0+ KB


In [125]:
# For a DataFrame, by default the aggregates return results within each column:
# pdf['created_at'].dt.date => Converts Datetime to Date in Pandas df
pdf_agg_byDate =  pdf2.groupby('created_at').agg({'negative_nltk':'sum','positive_nltk':'sum','neutral_nltk':'sum','compound_nltk':'sum', 'created_at':'size'})

pdf_agg_byDate.count()

negative_nltk    7
positive_nltk    7
neutral_nltk     7
compound_nltk    7
created_at       7
dtype: int64

In [126]:
pdf_agg_byDate

Unnamed: 0_level_0,negative_nltk,positive_nltk,neutral_nltk,compound_nltk,created_at
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-06-19,0.358,11.796,81.846,37.3705,94
2021-06-20,1.202,7.17,74.628,18.9599,83
2021-06-21,2.616,9.877,66.506,23.0233,79
2021-06-22,14.005,8.949,160.045,-28.7149,183
2021-06-23,1.566,6.605,43.828,15.6423,52
2021-06-24,1.369,7.399,71.231,22.1919,80
2021-06-25,0.409,7.851,105.74,25.1654,114


In [127]:
# Per-day totals of each sentiment score plus the number of tweets that day,
# with `created_at` turned back into an ordinary column.
pdf_agg_byDate = (
    pdf2.groupby('created_at')
    .agg(
        negative_nltk=('negative_nltk', 'sum'),
        positive_nltk=('positive_nltk', 'sum'),
        neutral_nltk=('neutral_nltk', 'sum'),
        compound_nltk=('compound_nltk', 'sum'),
        tweets=('created_at', 'size'),
    )
    .reset_index()
)
pdf_agg_byDate.count()

created_at       7
negative_nltk    7
positive_nltk    7
neutral_nltk     7
compound_nltk    7
tweets           7
dtype: int64

In [128]:
# row 0
type(pdf.created_at[0])

pandas._libs.tslibs.timestamps.Timestamp

In [129]:
# row 5
type(pdf.created_at[5])

pandas._libs.tslibs.timestamps.Timestamp

In [130]:
pdf_agg_byDate

Unnamed: 0,created_at,negative_nltk,positive_nltk,neutral_nltk,compound_nltk,tweets
0,2021-06-19,0.358,11.796,81.846,37.3705,94
1,2021-06-20,1.202,7.17,74.628,18.9599,83
2,2021-06-21,2.616,9.877,66.506,23.0233,79
3,2021-06-22,14.005,8.949,160.045,-28.7149,183
4,2021-06-23,1.566,6.605,43.828,15.6423,52
5,2021-06-24,1.369,7.399,71.231,22.1919,80
6,2021-06-25,0.409,7.851,105.74,25.1654,114


In [131]:
pdf_agg_byDate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   created_at     7 non-null      object 
 1   negative_nltk  7 non-null      float64
 2   positive_nltk  7 non-null      float64
 3   neutral_nltk   7 non-null      float64
 4   compound_nltk  7 non-null      float64
 5   tweets         7 non-null      int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 464.0+ bytes


### Divide multiple columns by another column in pandas

In [132]:

# pdf_agg_byDate = (pdf_agg_byDate.apply(['compound_nltk']/pdf_agg_byDate['tweets']
#     .withColumn( 'positive_nltk',f.col('sum(positive_nltk)')/f.col('count(created_at)'))
#     .withColumn( 'negativen_ltk',f.col('sum(negative_nltk)')/f.col('count(created_at)'))
#     .withColumn( 'neutral_nltk',f.col('sum(neutral_nltk)')/f.col('count(created_at)'))
#     .withColumnRenamed('count(created_at)', 'tweets')).drop(*columns_to_drop

# Turn the per-day sums into per-tweet averages by dividing every score column
# by that day's tweet count, row by row.
# NOTE: the result overwrites the same columns, so running this cell twice divides twice.
score_columns = ['negative_nltk', 'positive_nltk', 'neutral_nltk', 'compound_nltk']
pdf_agg_byDate[score_columns] = pdf_agg_byDate[score_columns].div(
    pdf_agg_byDate['tweets'], axis=0
)


pdf_agg_byDate


Unnamed: 0,created_at,negative_nltk,positive_nltk,neutral_nltk,compound_nltk,tweets
0,2021-06-19,0.003809,0.125489,0.870702,0.397559,94
1,2021-06-20,0.014482,0.086386,0.899133,0.228433,83
2,2021-06-21,0.033114,0.125025,0.841848,0.291434,79
3,2021-06-22,0.07653,0.048902,0.874563,-0.156912,183
4,2021-06-23,0.030115,0.127019,0.842846,0.300813,52
5,2021-06-24,0.017112,0.092488,0.890388,0.277399,80
6,2021-06-25,0.003588,0.068868,0.927544,0.220749,114


In [133]:
def _label_sentiment(comp):
    """Map a compound score to 'positive' (> 0.05), 'negative' (< -0.05) or 'neutral'."""
    if comp > 0.05:
        return 'positive'
    if comp < -0.05:
        return 'negative'
    return 'neutral'


pdf_agg_byDate['sentiment'] = pdf_agg_byDate['compound_nltk'].apply(_label_sentiment)

In [135]:
pdf_agg_byDate

Unnamed: 0,created_at,negative_nltk,positive_nltk,neutral_nltk,compound_nltk,tweets,sentiment
0,2021-06-19,0.003809,0.125489,0.870702,0.397559,94,positive
1,2021-06-20,0.014482,0.086386,0.899133,0.228433,83,positive
2,2021-06-21,0.033114,0.125025,0.841848,0.291434,79,positive
3,2021-06-22,0.07653,0.048902,0.874563,-0.156912,183,negative
4,2021-06-23,0.030115,0.127019,0.842846,0.300813,52,positive
5,2021-06-24,0.017112,0.092488,0.890388,0.277399,80,positive
6,2021-06-25,0.003588,0.068868,0.927544,0.220749,114,positive


# perfplot

In [68]:
import numpy as np
import pandas as pd
import perfplot

perfplot.save(
    "out.png",
    setup=lambda n: pd.DataFrame(np.arange(n * 3).reshape(n, 3)),
    n_range=[2**k for k in range(25)],
    kernels=[
        lambda df: len(df.index),
        lambda df: df.shape[0],
        lambda df: df[df.columns[0]].count(),
    ],
    labels=["len(df.index)", "df.shape[0]", "df[df.columns[0]].count()"],
    xlabel="Number of rows",
)

Output()