# Gathering Tweets from Twitter

This notebook gathers tweets from Twitter that match a specific search query and start date. The gathered text is cleaned with NLP techniques, and the resulting dataframe is finally exported to CSV format.

In [None]:
!pip install tweepy
!pip install nltk

In [None]:
from tqdm import tqdm, notebook
import tweepy as tw
import textblob
import os

import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import RegexpTokenizer

import datetime
import pandas as pd
import numpy as np

# WordNet is the corpus behind nltk.WordNetLemmatizer, used in the
# lemmatization step of this notebook.
nltk.download('wordnet')
# Some NLTK releases also open the Open Multilingual WordNet data when the
# WordNet reader is first used and raise LookupError if it is absent; fetching
# it up front is harmless on releases that do not need it.
nltk.download('omw-1.4')

# Lift every pandas display cap (column count, row count, cell width and
# console width) so that previewed frames are shown without truncation.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    'display.max_colwidth', None,
    'display.width', None,
)

Twitter API authentication

In [None]:
# Credentials are read from environment variables so that real keys are never
# saved inside the notebook. A variable that is not set falls back to the
# empty-string placeholder the notebook used before.
api_key = os.environ.get("TWITTER_API_KEY", "")
api_secret_key = os.environ.get("TWITTER_API_SECRET_KEY", "")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")
# Previously written as `Bearer_token: ""`, which is only an annotation and
# never bound the name; it is a real assignment now.
Bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "")

# OAuth 1a user-context client; wait_on_rate_limit makes tweepy wait instead of
# failing when the rate limit is reached.
auth = tw.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

try:
    # tweepy 3.x returns False (rather than raising) when Twitter rejects the
    # credentials, so the return value has to be checked as well.
    credentials_ok = bool(api.verify_credentials())
except Exception as exc:
    # Any other failure (network, malformed keys, ...) is reported with its
    # cause instead of being silently swallowed.
    print(f"Error during authentication: {exc}")
else:
    print("Authentication OK" if credentials_ok else "Error during authentication")

# Tweets query



Searching and obtaining tweets with a specific query

In [None]:
# query attributes
search_words = "#covid19 -filter:retweets"  # search text passed to the API as `q`
date_since = "2021-01-01"                   # passed to the search call as `since`
max_tweets = 1000                           # upper bound on how many tweets are pulled

# Collect tweets
# Only the paginated cursor is built here; the tweets themselves are consumed
# when `tweets` is iterated in the following cell.
tweets = tw.Cursor(api.search,
                   q=search_words,
                   lang="en",
                   since=date_since).items(max_tweets)

In [None]:
# Drain the cursor into a plain list (with a progress bar) so the results can
# be iterated again in later cells.
tweets_copy = list(tqdm(tweets))

print(f"new tweets retrieved: {len(tweets_copy)}")

1000it [00:35, 28.31it/s]

new tweets retrieved: 1000





Converting the dataset into pandas dataframe format

In [None]:
# Column order of the resulting frame. Passing it explicitly keeps the columns
# present even when no tweets were retrieved, so later cells that select
# columns by name still work on an empty result.
tweet_columns = ['user_name', 'user_location', 'user_description',
                 'user_created', 'user_followers', 'user_friends',
                 'user_favourites', 'user_verified', 'date', 'text',
                 'hashtags', 'source', 'is_retweet']

records = []
failed_lookups = 0
for tweet in tqdm(tweets_copy):
    # Hashtag texts attached to the tweet; None when there are none.
    entities = getattr(tweet, "entities", None) or {}
    hashtags = [hashtag["text"] for hashtag in entities.get("hashtags", [])]

    # Prefer text that is already untruncated; otherwise ask the API for the
    # extended form. On failure fall back to this tweet's own (possibly
    # truncated) text instead of reusing the previous tweet's text.
    if hasattr(tweet, "full_text"):
        text = tweet.full_text
    else:
        try:
            text = api.get_status(id=tweet.id, tweet_mode='extended').full_text
        except Exception:
            failed_lookups += 1
            text = getattr(tweet, "text", None)

    records.append({'user_name': tweet.user.name,
                    'user_location': tweet.user.location,
                    'user_description': tweet.user.description,
                    'user_created': tweet.user.created_at,
                    'user_followers': tweet.user.followers_count,
                    'user_friends': tweet.user.friends_count,
                    'user_favourites': tweet.user.favourites_count,
                    'user_verified': tweet.user.verified,
                    'date': tweet.created_at,
                    'text': text,
                    'hashtags': hashtags if hashtags else None,
                    'source': tweet.source,
                    'is_retweet': tweet.retweeted})

if failed_lookups:
    print(f"full text unavailable for {failed_lookups} tweets; short text used instead")

# Build the frame once from all records: growing a DataFrame row by row is
# quadratic, and DataFrame.append no longer exists in pandas 2.x. Rows get a
# unique 0..n-1 index (the row-by-row version labelled every row 0).
tweets_df = pd.DataFrame(records, columns=tweet_columns)

tweets_df.head()

Remove duplicate tweets, if there are any

In [None]:
# Rows count as the same tweet when author name, timestamp and text all match;
# only the first occurrence of each is kept.
duplicate_keys = ["user_name", "date", "text"]
tweets_df = tweets_df.drop_duplicates(subset=duplicate_keys)
print(f"all tweets: {tweets_df.shape}")

all tweets: (1000, 13)


# Data Preprocessing

In [None]:
# Only the tweet text is needed from here on. The explicit .copy() makes the
# result an independent frame, so the column assignment below does not raise
# pandas' SettingWithCopyWarning.
tweets_df = tweets_df[['text']].copy()
tweets_df['text'] = tweets_df['text'].str.lower()  # lowercase all tweets so later word matching is case-insensitive
print(tweets_df.head())

In [None]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations
# Deletion table for str.translate, built once instead of on every call.
_punctuation_table = str.maketrans('', '', punctuations_list)


def cleaning_punctuations(text):
    """Return ``text`` with every character of ``punctuations_list`` removed."""
    return text.translate(_punctuation_table)


def cleaning_email(data):
    """Replace each ``@``-prefixed run of non-space characters in ``data`` with a space."""
    return re.sub(r'@[^\s]+', ' ', data)


def cleaning_URLs(data):
    """Replace each ``www.``, ``http://`` or ``https://`` link in ``data`` with a space."""
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', data)


def cleaning_numbers(data):
    """Return ``data`` with every run of ASCII digits removed."""
    return re.sub(r'[0-9]+', '', data)


# Order matters: the mention and URL patterns rely on '@', '.', ':' and '/',
# all of which cleaning_punctuations deletes. Running punctuation removal first
# (as before) meant those two steps could never match, leaving handles and
# link fragments in the text.
for cleaning_step in (cleaning_email, cleaning_URLs,
                      cleaning_punctuations, cleaning_numbers):
    tweets_df['text'] = tweets_df['text'].apply(cleaning_step)

tweets_df['text'].head()

In [None]:
# Fetch NLTK's stop-word corpus and hold the English entries in a set, which
# is what the membership test below runs against.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def cleaning_stopwords(text):
    """Return ``text`` without its stop words, remaining tokens joined by single spaces."""
    kept_tokens = (token for token in str(text).split() if token not in stop_words)
    return " ".join(kept_tokens)


tweets_df['text'] = tweets_df['text'].apply(cleaning_stopwords)
tweets_df['text'].head()

In [None]:
# Stemming
st = nltk.PorterStemmer()
def stemming_on_text(data):
    """Stem every word of ``data`` with the Porter stemmer.

    A string is split on whitespace and returned as one space-joined string;
    any other iterable is treated as a sequence of tokens and a list of stems
    is returned.

    The previous version iterated over the characters of the string and then
    returned its input unchanged, so no stemming ever took place.
    """
    if isinstance(data, str):
        return " ".join(st.stem(word) for word in data.split())
    return [st.stem(word) for word in data]

tweets_df['text'] = tweets_df['text'].apply(stemming_on_text)

# Applying Lemmatizer
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    """Lemmatize every word of ``data`` with the WordNet lemmatizer.

    A string is split on whitespace and returned as one space-joined string;
    any other iterable is treated as a sequence of tokens and a list of lemmas
    is returned.

    The previous version discarded the lemmatized result and returned its
    input unchanged.
    """
    if isinstance(data, str):
        return " ".join(lm.lemmatize(word) for word in data.split())
    return [lm.lemmatize(word) for word in data]

tweets_df['text'] = tweets_df['text'].apply(lemmatizer_on_text)
tweets_df['text'].head()

Export the data

In [None]:
# Write the frame to test.csv (path relative to the working directory);
# index=False leaves the row index out of the file.
tweets_df.to_csv("test.csv", index=False)