In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import tweepy
import spacy
import csv
import json
import pandas as pd
import os

In [2]:
# Twitter API credentials
# The directory is assembled with os.path.join so the notebook is not tied to
# Windows path separators.
credentialsPath = os.path.join('..', '0_data', 'credentials')
with open(os.path.join(credentialsPath, 'twitter_credentials.json')) as cred_data:
    info = json.load(cred_data)

# The JSON file is expected to hold these four keys; a missing key raises
# KeyError here rather than failing later inside a request.
consumer_key = info['CONSUMER_KEY']
consumer_secret = info['CONSUMER_SECRET']
access_key = info['ACCESS_KEY']
access_secret = info['ACCESS_SECRET']

# Create the api endpoint

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# Attach the user access token as well; previously it was read from the
# credentials file but never handed to the auth handler.
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [4]:
# Convert to data frame
def toDataFrame(tweets):
    """Convert an iterable of tweet status objects into a pandas DataFrame.

    Exactly one row is produced per tweet, with these columns:

    * ``Text``       - UTF-8 encoded bytes of the tweet's ``full_text``; for a
                       retweet the text of ``retweeted_status`` is used instead.
    * ``User``       - UTF-8 encoded bytes of ``tweet.user.name``.
    * ``Image Urls`` - the ``media_url`` of every entry in
                       ``tweet.entities['media']`` joined with ``';'``
                       (a single image yields just its URL, no media yields '').
    * ``Longitude``  - ``tweet.coordinates["coordinates"][0]`` or '' if the
                       tweet has no coordinates.
    * ``Latitude``   - ``tweet.coordinates["coordinates"][1]`` or '' if the
                       tweet has no coordinates.

    Parameters
    ----------
    tweets : iterable
        Objects exposing ``full_text``, ``user.name``, ``entities`` and
        ``coordinates`` (and optionally ``retweeted_status``). The iterable is
        consumed in a single pass, so generators are accepted.

    Returns
    -------
    pandas.DataFrame
        One row per tweet; an empty frame with the five columns if ``tweets``
        is empty.
    """
    columns = ['Text', 'User', 'Image Urls', 'Longitude', 'Latitude']
    tweetsText = []
    tweetsUsers = []
    tweetsImages = []
    tweetsLongitudes = []
    tweetsLatitudes = []

    for tweet in tweets:
        # Retweets carry the complete text on the nested original status.
        if hasattr(tweet, 'retweeted_status'):
            full_text = tweet.retweeted_status.full_text
        else:
            full_text = tweet.full_text
        # Bytes are kept on purpose: later cells call .decode('utf-8') on them.
        tweetsText.append(full_text.encode('utf-8'))
        tweetsUsers.append(tweet.user.name.encode('utf-8'))

        # Always append exactly one value per tweet so this column stays
        # aligned with the others, however many media items a tweet has.
        media_items = tweet.entities['media'] if 'media' in tweet.entities else []
        tweetsImages.append(';'.join(image['media_url'] for image in media_items))

        if tweet.coordinates is not None:
            # The pair is read as [longitude, latitude].
            tweetsLongitudes.append(tweet.coordinates["coordinates"][0])
            tweetsLatitudes.append(tweet.coordinates["coordinates"][1])
        else:
            tweetsLongitudes.append('')
            tweetsLatitudes.append('')

    # Build the frame once from complete, equal-length columns.
    DataSet = pd.DataFrame(
        {
            'Text': tweetsText,
            'User': tweetsUsers,
            'Image Urls': tweetsImages,
            'Longitude': tweetsLongitudes,
            'Latitude': tweetsLatitudes,
        },
        columns=columns,
    )
    return DataSet


# Specify the maximum number of tweets to request from each search query.

maximum_number_of_tweets_to_be_extracted = \
    int(input('Enter the number of tweets that you want to extract- '))

# Choose the prefix of the search term: a hashtag (#) or a mention (@)

tag_choice = input('Enter which you are searching for, a hashtag (#), or a mention (@)')

# added .lower() to ensure consistency
mention = input('Enter the term you want to scrape- ').lower()

results = []
seen_ids = set()

# Search the lower-case and the upper-case spelling of the term. The two
# queries may return the same tweet, so tweets are de-duplicated by id, and an
# identical second query (e.g. a purely numeric term) is not sent at all.
search_terms = [mention]
if mention.upper() != mention:
    search_terms.append(mention.upper())

for term in search_terms:
    cursor = tweepy.Cursor(api.search, q=str(tag_choice) + term,
                           tweet_mode='extended')
    for tweet_info in cursor.items(maximum_number_of_tweets_to_be_extracted):
        if tweet_info.id in seen_ids:
            continue
        seen_ids.add(tweet_info.id)
        results.append(tweet_info)

data = toDataFrame(results)
outputPath = os.path.join('..', '0_data', 'manual')
filePath = os.path.join(outputPath, 'tweets_with_mention_' + mention + '.csv')

# Create the file with a header on the first run; on later runs append rows
# only, so the header line is not repeated in the middle of the CSV.
file_exists = os.path.isfile(filePath)
data.to_csv(filePath,
            mode='a' if file_exists else 'w',
            header=not file_exists,
            index=False)

# Report the number of tweets actually collected, not the number requested.
print('Extracted ' + str(len(results))
      + ' tweets with ' + str(tag_choice) + mention)

Enter the number of tweets that you want to extract- 25
Enter which you are searching for, a hashtag (#), or a mention (@)#
Enter the term you want to scrape- cali
Extracted 25 tweets with #cali


In [5]:
# Load the spaCy language pipeline registered under the name 'en'.
# NOTE(review): which model 'en' resolves to depends on the local spaCy
# installation — confirm it is available in the target environment.
nlp = spacy.load('en')

# Show the loaded pipeline components; the recorded output lists a tagger,
# a parser and a named-entity recognizer.
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x11fff690>),
 ('parser', <spacy.pipeline.DependencyParser at 0x5ed11e0>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x12008f00>)]

In [21]:
# Build cleaned versions of each tweet's text and author name.
# For every row of `data` the text is tokenized with spaCy and stop words,
# punctuation and whitespace tokens are dropped; the remaining tokens are
# joined with single spaces.
clean_texts = []
clean_users = []

# Columns are read by name so the loop does not depend on column order.
for raw_text, raw_user in zip(data['Text'], data['User']):
    # The frame stores UTF-8 bytes; decode them, but pass through values that
    # are already str so the cell also works on text columns.
    text = raw_text.decode('utf-8') if isinstance(raw_text, bytes) else raw_text
    user = raw_user.decode('utf-8') if isinstance(raw_user, bytes) else raw_user

    # make_doc only runs the tokenizer, which is all the filters below need.
    text_doc = nlp.make_doc(text)
    # is_space drops every whitespace token (newlines, tabs, runs of spaces),
    # not just the single ' ' token, so no double spaces are left behind.
    kept_tokens = [
        token.text
        for token in text_doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    text = " ".join(kept_tokens).strip()
    # Kept as a safeguard in case a remaining token still contains a newline.
    text = text.replace('\n', '')

    clean_texts.append(text)
    clean_users.append(user)

In [29]:
# Assemble the cleaned texts and user names into a two-column frame.
clean_data = pd.DataFrame(
    dict(
        clean_texts=clean_texts,
        clean_users=clean_users,
    )
)

In [30]:
# Preview the first rows of the cleaned frame as the cell's displayed output.
clean_data.head()

Unnamed: 0,clean_texts,clean_users
0,MCCWear ’s Signature Adult Size Rainbow Shaggy...,🧶🧡Queen Millz🧡🧶
1,MCCWear ’s Signature Adult Size Rainbow Shaggy...,#〽️CCWear 🧶💜
2,About westcoast educator musicschool JesHudak ...,ETStudioProductions
3,Tu opinión es muy importante para mejorar nues...,Taxis Libres Los 4
4,Haganle pa delante trabajen decentemente mensa...,Santiago Ballesteros
