In [40]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from tweepy import OAuthHandler
from tweepy import Cursor
from pprint import pprint

In [3]:
#create an app on https://developer.twitter.com/en/apply-for-access to generate the tokens

import tweepy
consumer_key = "<CHANGE_ME>"
consumer_secret = "<CHANGE_ME>"
access_token = "<CHANGE_ME>"
access_token_secret = "<CHANGE_ME>"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

In [55]:
from datetime import date, timedelta
from dateutil.rrule import rrule, DAILY
from tqdm import tqdm

def extract_tweets(text):
    df = pd.DataFrame()
    print("Extract tweets containting text: ",text)
    for i in tqdm(range(2)):
        day = date.today() + timedelta(days=i*-1)        
        tweets = api.search_tweets(text, tweet_mode="extended",lang="en",count=500,until = day) 
        #to do: check how many attributes can we get from tweet objects
        data_tweets = [
            [
                tweet.created_at, 
                tweet.full_text, 
                tweet.retweet_count,
                tweet.favorite_count,
                tweet.user.screen_name,
                tweet.user.followers_count,
                tweet.user.favourites_count,
                tweet.place,
                tweet.entities.get("hashtag"),
                tweet.entities.get("user_mentions"),
                tweet.retweeted,
                tweet.lang
            ] 
            for tweet in tweets]

        tweet_text_df = pd.DataFrame(data=data_tweets,
                                     columns=[
                                        "created_at",
                                        "full_text",
                                        "retweet_count",
                                        "likes",
                                        "user_screen_name",
                                        "user_follower_count",
                                        "user_favourites_count",
                                        "tweet_place",
                                        "tweet_hashtags",
                                        "tweets_user_mentions",
                                        "tweet_retweeted_by_auth_user",
                                        "tweet_lang"
                                        ]
                                    )        

        df = pd.concat([df, tweet_text_df],ignore_index=True)        
        
    return df

In [56]:
df = extract_tweets("@StarbucksIndia OR @Starbucks OR @StarbucksUK OR @frappuccino OR @StarbucksNews OR @StarbucksCanada OR @StarbucksMY OR #StarbucksRewards OR #Starbucks OR #Starbuckscoffee OR #Starbuckscoffe OR #starbucksreserve OR #RedCupDay OR #StarbucksNews OR #StarbucksMalaysia")


Extract tweets containting text:  @StarbucksIndia OR @Starbucks OR @StarbucksUK OR @frappuccino OR @StarbucksNews OR @StarbucksCanada OR @StarbucksMY OR #StarbucksRewards OR #Starbucks OR #Starbuckscoffee OR #Starbuckscoffe OR #starbucksreserve OR #RedCupDay OR #StarbucksNews OR #StarbucksMalaysia


100%|██████████| 2/2 [00:02<00:00,  1.18s/it]


In [None]:
df.shape

(165, 6)

In [None]:
#df.to_csv("DM_starbucks2.csv",index = False)

### Columns of Interest
- full_text - To extract the sentiment of the complaint
- created_at - Extracting weekday or weekend may give better insight on nature of review

In [None]:
# number of missing values in each variable.
df.isna().sum()

created_at       0
full_text        0
retweet_count    0
Likes            0
Language         0
cleaned_text     0
dtype: int64

In [None]:
# removing duplicate tweets

df.duplicated(subset='full_text', keep= 'first').sum()
print('Number of duplicate tweets :',df.duplicated(subset='full_text', keep= 'first').sum())

df = df.drop_duplicates(subset='full_text', keep= 'first')
print('Number of tweets after deleting duplicate tweets :',df.full_text.count())

Number of duplicate tweets : 0
Number of tweets after deleting duplicate tweets : 165


### Detecting the language of the tweets:

In [None]:
from langdetect import detect
df['Language'] = df.full_text.apply(lambda text: detect(text))

# Getting the count of Language
df['Language'].value_counts() 

en    163
id      1
so      1
Name: Language, dtype: int64

### Extracting the tweets that are in English:

In [None]:
df = df[df['Language']=="en"]
df.shape

(163, 6)

### Cleaning and Pre-Processing Dataset

In [None]:
#removing new lines and tabs
def remove_newlines_tabs(text):
    Formatted_text = text.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').\
    replace('\\', ' ').replace('. com', '.com')
    return Formatted_text
df['cleaned_text']=df['full_text'].apply(lambda text:remove_newlines_tabs(text))

In [None]:
# removing html tags
from bs4 import BeautifulSoup as bs

def rev_html_Tag(text):
    soup = bs(text, "html.parser")
    new_text=soup.get_text(separator=" ")
    return new_text
df['cleaned_text']=df['cleaned_text'].apply(lambda text:rev_html_Tag(text))

In [None]:
# removing any whitespace
import re

def rev_whitespace(text):
    pattern=re.compile(r'\s+')
    text_new=re.sub(pattern,' ',text)
    return text_new

df['cleaned_text']=df['cleaned_text'].apply(lambda text:rev_whitespace(text))

In [None]:
#removing addiotional accented characters from text
import unidecode
def rev_asc(text):
    text=unidecode.unidecode(text)
    return text
df['cleaned_text']=df['cleaned_text'].apply(lambda text:rev_asc(text))

In [None]:
#removing any links
def rev_link(text):
    rev_link=re.sub(r'http\S+','',text)
    rev_com=re.sub(r'\[A-Za-z]*\.com','',rev_link)
    return rev_com
df['cleaned_text']=df['cleaned_text'].apply(lambda text:rev_link(text))

In [None]:
#removing repeated charactor
def rev_rep(text):
    Pattern_alpha = re.compile(r"([A-Za-z])\1{1,}", re.DOTALL)
    Formatted_text = Pattern_alpha.sub(r"\1\1", text)
    Pattern_Punct = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}')
    Combined_Formatted = Pattern_Punct.sub(r'\1', Formatted_text)
    Final_Formatted = re.sub(' {2,}',' ', Combined_Formatted)
    return Final_Formatted
df['cleaned_text']=df['cleaned_text'].apply(lambda text:rev_rep(text))

In [None]:
#removing puncutations
def rev_puc(text):
    punc='''!"#$%&'()*+,-/:;<=>?@[\]^_`{|}~'''
    Formatted_text=re.sub(r"[^a-zA-Z0-9:$-,%.?!]+",' ',text)
    Formatted_text2=re.sub(r"[()]+",' ',Formatted_text)
    return Formatted_text2
df['cleaned_text']=df['cleaned_text'].apply(lambda text:rev_puc(text))

In [None]:
# Replacing contractions with their expanded 
import contractions
df['cleaned_text'] = df.cleaned_text.apply(lambda text: contractions.fix(text))

In [None]:
# Converting to lower case
df['cleaned_text'] = df.cleaned_text.apply(lambda text: text.lower())

In [None]:
df.full_text[3]

"RT @speaker_aman: Lulu Mall Lucknow \nIndia's Biggest Mall\n#lulumall #india #Lucknow #biggest #hypermarket #foodcourt #youtube #instagram #f…"

In [None]:
df.cleaned_text[3]

"rt speaker aman: lulu mall lucknow india's biggest mall lulumall india lucknow biggest hypermarket foodcourt youtube instagram f."

### Tokenizing and Visualizing the tweets
We want to see what are the tweets mostly about
A wordcloud helps visualize the same better

In [None]:
# load nltk's English stopwords as variable called 'stopwords'
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Saumaya
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['a','u','p','b','w','s','go','c','an', 'the', 'to', 'for','also','wold','rt',"'s",'us'])

def tokenize_only(text):
    # Tokenizing each sentence and then word
    tokens = [word.lower() 
              for sent in nltk.sent_tokenize(text) 
              for word in nltk.word_tokenize(sent) 
              if word.lower() not in stopwords]
    
    filtered_tokens = []
    
    # Filtering out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [None]:
tokenize_text= []
for i in range(len(df)):
    tokenize_text.extend(tokenize_only(df['cleaned_text'].iloc[i]))


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Saumaya Jain/nltk_data'
    - 'C:\\Users\\Saumaya Jain\\anaconda\\nltk_data'
    - 'C:\\Users\\Saumaya Jain\\anaconda\\share\\nltk_data'
    - 'C:\\Users\\Saumaya Jain\\anaconda\\lib\\nltk_data'
    - 'C:\\Users\\Saumaya Jain\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def create_word_cloud(final_data, title):
    wordcloud = WordCloud(width=1600, height=800, max_font_size=200, stopwords = stopwords,
                          background_color='white').generate(final_data)
    
    # plt the image generated by WordCloud class
    plt.figure(figsize=(12,10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title+"\n", fontsize = 16)
    plt.show()

create_word_cloud(' '.join(tokenize_text),"Optum Word Cloud")

In [None]:
# Barplot of top 20 used words in all the tweets
import seaborn as sns
df_tokens = pd.DataFrame(tokenize_text).value_counts().rename_axis('tokens').reset_index(name='count')

plt.figure(figsize=(10,5))
sns.barplot(data=df_tokens.head(20), y='tokens',x='count', color='grey');

### Calculating Sentiment Scores using Vader

In [None]:
# define unit func to process one doc
from nltk import sent_tokenize, word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def vader_unit_func(doc0,column_name):
    sents_list0 = sent_tokenize(doc0)
    vs_doc0 = []
    sent_ind = []
    for i in range(len(sents_list0)):
        vs_sent0 = analyzer.polarity_scores(sents_list0[i])
        vs_doc0.append(vs_sent0)
        sent_ind.append(i)
        
    # obtain output as DF    
    doc0_df = pd.DataFrame(vs_doc0)
    doc0_df.columns = [x+column_name for x in doc0_df.columns]
    doc0_df.insert(0, 'sent_index', sent_ind)  # insert sent index
    doc0_df.insert(doc0_df.shape[1], 'sentence', sents_list0)
    return(doc0_df)

# define wrapper func
def vader_wrap_func(corpus0,column_name):
    
    # use ifinstance() to check & convert input to DF
    if isinstance(corpus0, list):
        corpus0 = pd.DataFrame({'text':corpus0})
    
    # define empty DF to concat unit func output to
    vs_df = pd.DataFrame()    
    
    # apply unit-func to each doc & loop over all docs
    for i1 in range(len(corpus0)):
        doc0 = str(corpus0.iloc[i1])
        vs_doc_df = vader_unit_func(doc0,column_name)  # applying unit-func
        vs_doc_df.insert(0, 'doc_index', i1)  # inserting doc index
        vs_df = pd.concat([vs_df, vs_doc_df], axis=0)
        
    return(vs_df)

In [None]:
# Vader Sentiment scores
df['vader_score'] = vader_wrap_func(df.cleaned_text,"").groupby('doc_index')['compound'].sum()
df.head()

In [None]:
df['Isnegative'] = np.where(df['vader_score']<0,1,0)
df['Isnegative'].value_counts()

In [None]:
df_negative = df[df['Isnegative']==1]
df_negative.shape

In [None]:
tokenize_text= []
for i in range(len(df_negative)):
    tokenize_text.extend(tokenize_only(df_negative['cleaned_text'].iloc[i]))

create_word_cloud(' '.join(tokenize_text),"Optum Word Cloud for negative tweets")

In [None]:
# Barplot of top 20 used words in all the tweets
df_tokens = pd.DataFrame(tokenize_text).value_counts().rename_axis('tokens').reset_index(name='count')

plt.figure(figsize=(10,5))
sns.barplot(data=df_tokens.head(20), y='tokens',x='count', color='grey');

### bi-gram

In [None]:
# Bi-gram
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(max_df = 0.9,
                                  max_features=50,
                                  stop_words=stopwords,
                                  tokenizer=tokenize_only,
                                  strip_accents = 'unicode',
                                  ngram_range=(2,2),
                                 )

count_matrix = count_vectorizer.fit_transform(df_negative.cleaned_text)    
count_tokens = count_vectorizer.get_feature_names()

print(count_matrix.shape)  # Print the dimensions of the matrix

df_bigrams = pd.DataFrame(data = count_matrix.toarray(),columns = count_tokens)
df_bigrams.columns