In [9]:
import os
import tweepy as tw
import pandas as pd
import regex as re
import config as config

In [5]:
# for later use
def remove_url(txt):
    """Replace URLs found in a text string with nothing 
    (i.e. it will remove the URL from the string).

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove urls.

    Returns
    -------
    The same txt string with url's removed.
    """
#     print(re.search("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", txt))
    
    print(re.search("(?P<url>https?://[^\s]+)", txt).group("url"))

    return " ".join(re.sub("(?P<url>https?://[^\s]+)", "", txt).split())

In [6]:
# Testing successful url removal 
print(remove_url('Coronavirus update: U.S. cases reach 2.9 million and death toll tops 130K, as 38 states see cases on the climb https://t.co/waZ1k6SfJK'))

https://t.co/waZ1k6SfJK
Coronavirus update: U.S. cases reach 2.9 million and death toll tops 130K, as 38 states see cases on the climb


In [10]:
api = config.create_api()

In [11]:
# Extract Data for a time range and particular handle
from datetime import date

def get_data(date_since="", search_words="", user_name="marketwatch", user_id="624413",total_items=200):
    """
    default screen_name is marketwatch or we want more of this thing to come into play

    """
    #if date is empty take today's date
    if date_since == "":
        today = date.today()
        date_since = today.strftime("%Y-%m-%d") # format the date to "2020-07-06"
    if search_words =="": # if not search word is given search based on screen name
        # Collect tweets
        tweets = tw.Cursor(
            api.user_timeline,
            screen_name=user_name,
            id=user_id,
            lang="en",
            since=date_since).items(total_items)
    else:
        tweets = tw.Cursor(
            api.search,
            q=search_words,
            lang="en",
            since=date_since).items(total_items)

    users_locs = [[tweet.id, tweet.created_at,tweet.user.screen_name, tweet.text] for tweet in tweets]
    users_locs

    pd.set_option('display.max_colwidth', -1)
    tweet_df2 = pd.DataFrame(data=users_locs, 
                        columns=['id','date','user',"tweets"])
    return tweet_df2


### Tweets by the handle or specific keywords (can't use keywords search with a user in twitter api :( )
---


In [32]:
#get tweets by a specific search words)

def getDataBySeach(date_since,list_of_search, filename):
    tweet_df = get_data(date_since=date_since,search_words=list_of_search, total_items=1000)
    
    tweet_df.to_csv(filename, mode="a+", header=False, index=False)
    return tweet_df

search_words=["Tesla","Elon Musk"]
filename="Data/tweetsbysearch.csv"
date_since="2020-01-01"
tweet_df = getDataBySeach(date_since, search_words,filename)
tweet_df

Unnamed: 0,id,date,user,tweets
0,1283137923991257088,2020-07-14 20:34:55,joestaiano,"RT @RudyHavenstein: BREAKING: Tesla shares jump after announcement by CEO Elon Musk of new ""Self-Walking Dog"" coming to market in Q1 2021 -…"
1,1283137876784549888,2020-07-14 20:34:44,investwithBSF,"RT @RudyHavenstein: BREAKING: Tesla shares jump after announcement by CEO Elon Musk of new ""Self-Walking Dog"" coming to market in Q1 2021 -…"
2,1283137315922116613,2020-07-14 20:32:30,biofourmis,"RT @maria_axente: During today's World Artificial Intelligence Conference, @AlibabaGroup founder #JackMa said the pandemic has accelerated…"
3,1283137246279999496,2020-07-14 20:32:13,dayarvsvau,"RT @Reuters: Tesla is ‘very close’ to achieving level 5 autonomous driving technology, Chief Executive Elon Musk said, referring to the cap…"
4,1283137171076128768,2020-07-14 20:31:55,jwoodiwiss,"RT @RudyHavenstein: BREAKING: Tesla shares jump after announcement by CEO Elon Musk of new ""Self-Walking Dog"" coming to market in Q1 2021 -…"
5,1283137136359690245,2020-07-14 20:31:47,tamyrausa,RT @GhostWick14: This has never added up for me. \n\nUnless......🤔\n\nTesla CEO Elon Musk now personally worth more than combined value of For…
6,1283137033985232899,2020-07-14 20:31:23,diablo4x9,RT @lorakolodny: Last thought today about FSD / Autopilot &amp; Tesla. Where's that cross country demo?! Tesla has been promising self-driving…
7,1283137007062069248,2020-07-14 20:31:16,tymcmahan,Good points from @seankouplen about how Oklahoma building a relationship with @Tesla creates credibility with other… https://t.co/3iqH0iLBjR
8,1283136748344745985,2020-07-14 20:30:14,0Deflation,"RT @RudyHavenstein: BREAKING: Tesla shares jump after announcement by CEO Elon Musk of new ""Self-Walking Dog"" coming to market in Q1 2021 -…"
9,1283136647274672129,2020-07-14 20:29:50,lanksinatra_,RT @brendohare: Tesla is sad to announce the passing of our CEO Elon Musk. Someone said the number “420” and Elon began laughing so violent…


In [28]:
#loop for getting data from specific twitter handles

def getDataFromHandle(list_of_details, filename):
    """ Loop through a list of tweets of a handle"""
    
    for search in list_of_details:
        date_since = search['date_since']
        user_name = search['user_name']
        user_id = search['user_id']
        tweet_df = get_data(date_since=date_since,user_name=user_name,user_id=user_name, total_items=2000)
        tweet_df.to_csv(filename, mode="a+", header=False, index=False)
        
# @TODO: "to not max out the request add a sleep timer if you wish"
#         sleep(1000)
    return tweet_df

    

In [26]:
# read the file with twitter ids lookup table

def read_twitter_source():    
    twitter_id_df = pd.read_csv('Data/twitter_id.csv', header=None)
    twitter_id_df.columns = ["screen_name","id"]
    return twitter_id_df

twitter_id_df = read_twitter_source()
twitter_id_df.head()

user_name='transcriptdaily'
user_id = twitter_id_df[twitter_id_df['screen_name']==user_name]['id']

user_id


34    852932000000000000
Name: id, dtype: int64

In [29]:
#
tweetsfrom = {
    "date_since" : "2020-07-10",
    "user_name" : user_name,
    "user_id" : user_id
}

filename = "Data/transcriptdaily_3.csv"

list_of_details = []
list_of_details.append(tweetsfrom)
tweet_df = getDataFromHandle(list_of_details, filename)
tweet_df