### Data Collection and Preprocessing

In [None]:
import ssl

# WARNING: despite the original "secure context" wording, this line swaps the
# default HTTPS context factory for the *unverified* one, so TLS certificates
# are no longer checked for connections that use the ssl module's default.
# NOTE(review): presumably a workaround for local certificate errors — confirm
# it is still needed and remove it if not.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
import tweepy

# API configuration
# Credentials are read from the environment so that real keys never have to be
# stored in the notebook; the original placeholder values remain as fallbacks.
import os

app_api_key = os.environ.get('TWITTER_API_KEY', 'API_KEY')
app_api_secret_key = os.environ.get('TWITTER_API_SECRET', 'API_SECRET')

auth = tweepy.AppAuthHandler(app_api_key, app_api_secret_key)
api = tweepy.API(auth,
                 wait_on_rate_limit=True,
                 retry_count=5, # number of attempts
                 retry_delay=180) # number of seconds to wait between attempts

In [None]:
import pandas as pd

# Pandas configuration: never truncate cell contents or hide columns when a
# DataFrame is displayed.
pd.set_option('display.max_colwidth', None)
# Use the fully qualified option name. The bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'
# and raises OptionError there.
pd.set_option('display.max_columns', None)

#### Data Collection

In [None]:
# Function to search tweets
def search_tweets(query, limit=100):
  """Search tweets via ``api.search_tweets`` and return them as a flat DataFrame.

  The search is restricted to ``lang='en'``, uses ``tweet_mode='extended'`` and
  ``count=100``, and stops after ``limit`` tweets. Each tweet's raw JSON
  payload is flattened into columns with ``pd.json_normalize``.
  """
  cursor = tweepy.Cursor(
    api.search_tweets,
    q=query,
    tweet_mode='extended',
    lang='en',
    count=100,
  )
  payloads = []
  for status in cursor.items(limit):
    payloads.append(status._json)
  return pd.json_normalize(payloads)

In [None]:
def search_30_day(query, limit=100):
  """Search tweets via ``api.search_30_day`` and return them as a flat DataFrame.

  The request uses the label ``SentimentAnalysis`` and ``maxResults=100`` and
  stops after ``limit`` tweets. Each tweet's raw JSON payload is flattened
  into columns with ``pd.json_normalize``.
  """
  cursor = tweepy.Cursor(
    api.search_30_day,
    label='SentimentAnalysis',
    query=query,
    maxResults=100,
  )
  payloads = list(map(lambda status: status._json, cursor.items(limit)))
  return pd.json_normalize(payloads)

In [None]:
import sqlite3
from pathlib import Path
from datetime import datetime

# Function to create a current timestamp
def timestamp():
  """Return the current local date and time as a compact string, e.g. 20221107_123045."""
  now = datetime.now()
  return f"{now:%Y%m%d_%H%M%S}"

# Function to save DataFrame to SQLite database
def save_sql(df, filename, action="replace"):
  """Save a DataFrame to the SQLite file ``database/<filename>.db``.

  The table inside the database is also named ``filename``. The ``database``
  directory is created when it does not exist yet.

  Args:
    df: DataFrame to write; its index is not stored.
    filename: Base name used for both the database file and the table.
    action: What to do when the table already exists. Passed to
      ``DataFrame.to_sql`` as ``if_exists``, e.g. "replace" or "append".
  """
  db_name = Path(f'database/{filename}.db')
  db_name.parent.mkdir(parents=True, exist_ok=True)
  con = sqlite3.connect(db_name)
  try:
    df.to_sql(filename, con, index=False, if_exists=action)
  finally:
    # Always release the connection; the original never closed it, which kept
    # the database file open for as long as the kernel was running.
    con.close()

# Function to load SQL table to DataFrame
def load_sql(db_name, tbl_name):
  """Load a whole table from ``database/<db_name>.db`` into a DataFrame.

  Args:
    db_name: Base name of the SQLite file inside the ``database`` directory.
    tbl_name: Name of the table to read.

  Returns:
    DataFrame with every row and column of the table.
  """
  con = sqlite3.connect(f'database/{db_name}.db')
  try:
    # Table names cannot be bound as SQL parameters, so quote the identifier
    # (doubling embedded quotes) instead of pasting it into the query raw.
    quoted = '"' + str(tbl_name).replace('"', '""') + '"'
    return pd.read_sql(f"SELECT * FROM {quoted}", con)
  finally:
    # Close the connection even when the query fails (e.g. missing table)
    con.close()

# Function to save DataFrame into CSV file
def save_csv(df, filename):
  """Write ``df`` to ``datasets/<filename>_<timestamp>.csv``.

  The ``datasets`` directory is created when missing. The DataFrame index is
  written as the first CSV column (``to_csv`` default).
  """
  stamp = timestamp()
  target = Path(f'datasets/{filename}_{stamp}.csv')
  target.parent.mkdir(parents=True, exist_ok=True)
  df.to_csv(target)

# Function to load DataFrame from CSV file
def load_csv(filename):
  """Read ``datasets/<filename>.csv`` and return its contents as a DataFrame."""
  source = Path(f'datasets/{filename}.csv')
  frame = pd.read_csv(source)
  return frame

In [None]:
# Extract columns from the raw Twitter dataset
def extract_columns(df, method='tweets_search'):
  """Keep only the columns the analysis needs and give them standard names.

  With ``method='tweets_search'`` the tweet body is read from ``full_text``;
  with any other value it is read from ``text``. In both cases the result has
  the columns id, created_at, text, user_location, place_name, place_id,
  place_full_name and hashtags, in that order. A ``KeyError`` is raised when
  one of the source columns is missing from ``df``.
  """
  # Only the name of the tweet-body column differs between the two sources
  body_column = 'full_text' if method == 'tweets_search' else 'text'

  # Source column -> standardised name, listed in output order
  renames = {
    'id': 'id',
    'created_at': 'created_at',
    body_column: 'text',
    'user.location': 'user_location',
    'place.name': 'place_name',
    'place.id': 'place_id',
    'place.full_name': 'place_full_name',
    'entities.hashtags': 'hashtags',
  }

  selected = df[list(renames)]
  return selected.rename(columns=renames)

In [None]:
place_id = '6416b8512febefc9' # United Kingdom place_id

# Standard-search queries, one per topic. Every query combines a Ukraine term,
# a topic keyword group and the place filter, and excludes retweets.
# NOTE(review): some search terms look misspelled or duplicated ('unprovoked'
# appears twice, 'stringer' is presumably 'stinger', 'nothern ireland' is
# presumably 'northern ireland') — confirm before changing the strings, since
# that changes which tweets are collected.
queries = [
  # Military actions
  f"('ukraine' OR #ukraine OR @ukraine OR '🇺🇦') AND (military OR war OR warfare OR unprovoked OR border OR escalation OR conflict OR invasion OR attack OR tension OR force OR battalion OR unprovoked OR invade OR power OR offensive) AND place:{place_id} -filter:retweets",
  # Military equipment
  f"('ukraine' OR #ukraine OR @ukraine OR '🇺🇦') AND (weapon OR javelin OR tank OR aircraft OR armour OR munition OR arms OR jet OR lethal OR equipment OR fuel OR rocket OR stringer OR patriot OR helmet OR rifle OR goggle OR vest OR grenade OR gun OR nuclear OR missile) AND place:{place_id} -filter:retweets",
  # Financial aid
  f"('ukraine' OR #ukraine OR @ukraine OR '🇺🇦') AND (financial OR money OR economy OR economic OR donation OR subsidy OR loan OR budget OR commit OR pledge OR dollars OR euros OR pounds OR billion OR million OR grant OR fund OR cost  OR finance OR bank OR investment OR donor) AND place:{place_id} -filter:retweets",
  # Medical aid
  f"('ukraine' OR #ukraine OR @ukraine OR '🇺🇦') AND (medical OR supply OR supplies OR food OR emergency OR humanitarian OR medicine OR wounded OR victim OR hospital OR 'red cross' OR hygiene OR healthcare OR zeolite OR oxygen OR patient OR shipment OR doctor OR nurse) AND place:{place_id} -filter:retweets",
  # Misc.
  f"('ukraine' OR #ukraine OR @ukraine OR '🇺🇦') AND ('united kingdom' OR 'uk' OR '🇬🇧' OR england OR '🏴󠁧󠁢󠁥󠁮󠁧󠁿' OR scotland OR '🏴󠁧󠁢󠁳󠁣󠁴󠁿' OR wales OR '🏴󠁧󠁢󠁷󠁬󠁳󠁿' OR 'nothern ireland' OR 'boris johnson' OR 'liz truss' OR 'ben wallace' OR parliament) AND place:{place_id} -filter:retweets"]

In [None]:
# Queries for search_30_day, one per topic, all restricted with place_country:GB.
# None of these strings contains a placeholder, so the f prefix has no effect.
# NOTE(review): 'missle' and 'nothern ireland' look misspelled (presumably
# 'missile' and 'northern ireland') — confirm before changing the strings,
# since that changes which tweets are collected.
search_30_day_queries = [
  # Military actions
  f"(military OR war OR warfare OR unprovoked OR border OR escalation OR conflict OR invasion OR attack OR tension OR force OR battalion OR invade OR power OR offensive) (ukraine OR #ukraine OR @ukraine OR 🇺🇦) place_country:GB",
  # Military equipment
  f'(ukraine OR #ukraine OR @ukraine OR 🇺🇦) (weapon OR javelin OR tank OR aircraft OR armour OR munition OR arms OR jet OR lethal OR equipment OR fuel OR rocket OR stringer OR rifle OR goggle OR vest OR grenade OR gun OR nuclear OR missle) place_country:GB',
  # Financial aid
  f'(ukraine OR #ukraine OR @ukraine OR 🇺🇦) (financial OR money OR economy OR economic OR donation OR subsidy OR loan OR budget OR commit OR billion OR million OR grant OR fund OR cost OR finance OR bank OR investment OR donor) place_country:GB',
  # Medical aid
  f'(ukraine OR #ukraine OR @ukraine OR 🇺🇦) (medical OR supply OR supplies OR food OR emergency OR humanitarian OR medicine OR wounded OR victim OR hospital OR red cross OR hygiene OR healthcare OR zeolite OR oxygen OR shipment) place_country:GB',
  # Misc.
  f'(ukraine OR #ukraine OR @ukraine OR 🇺🇦) (united kingdom OR uk OR 🇬🇧 OR england OR 🏴󠁧󠁢󠁥󠁮󠁧󠁿 OR scotland OR 🏴󠁧󠁢󠁳󠁣󠁴󠁿 OR wales OR 🏴󠁧󠁢󠁷󠁬󠁳󠁿 OR nothern ireland OR boris johnson OR liz truss OR ben wallace OR parliament) place_country:GB']

In [None]:
# Extract entities from tweets
def extract_entities(entity_list):
  """Join the unique, lower-cased ``text`` values of a list of entities.

  Args:
    entity_list: List of entity dicts, e.g.
      ``[{'text': 'Ukraine', 'indices': [0, 8]}]``. ``None`` and float values
      (such as the NaN pandas uses for a missing cell) count as "no entities".
      A string is assumed to be an already extracted value and is returned
      unchanged, so applying the function twice to a column is harmless.

  Returns:
    Comma-separated string of the distinct values in first-seen order; an
    empty string when there are no entities.
  """
  if isinstance(entity_list, str):
    return entity_list
  if entity_list is None or isinstance(entity_list, float):
    return ""

  # dict keys keep insertion order, which gives an ordered de-duplication
  # without re-scanning a list for every entity
  seen = {}
  for item in entity_list:
    value = item.get('text')
    if value is not None:
      seen[str(value).lower()] = None
  return ",".join(seen)

In [None]:
# Collect the extracted results of every 30-day query, then concatenate once
frames = []

for query in search_30_day_queries:
  results = search_30_day(query, 1000)

  if not results.empty:
    results = extract_columns(results, method="search_30_days")
    frames.append(results)

if frames:
  last_30_days_tweets = pd.concat(frames, ignore_index=True)
  last_30_days_tweets['hashtags'] = last_30_days_tweets['hashtags'].apply(extract_entities)
  save_csv(last_30_days_tweets, "raw_tweets_30_days")
else:
  # No query returned anything. The original left the variable as None and
  # failed on the hashtag step; an empty frame keeps the cell runnable and
  # nothing is written to disk.
  last_30_days_tweets = pd.DataFrame()

print(f"Total number of tweets retrieved: {last_30_days_tweets.shape[0]}")

In [None]:
last_30_days_tweets.sample(1)

In [None]:
def get_tweets():
  """Run every standard-search query and return the combined raw tweets.

  Each entry of ``queries`` is searched for up to 1000 tweets. Non-empty
  results are reduced to the standard columns and concatenated, the hashtag
  entities are flattened to a comma-separated string, and the combined frame
  is saved to a timestamped ``raw_tweets`` CSV.

  Returns:
    DataFrame with one row per retrieved tweet. When no query returns
    anything, an empty DataFrame is returned and no CSV is written.
  """
  # Collect the per-query frames and concatenate once at the end
  frames = []

  # Execute search_tweets method for each query
  for query in queries:
    results = search_tweets(query, 1000)

    # Skip empty results entirely. The original only guarded extract_columns,
    # so an empty, un-extracted frame still reached the merge step.
    if not results.empty:
      frames.append(extract_columns(results))

  if not frames:
    print("Total number of tweets retrieved: 0")
    return pd.DataFrame()

  tweets = pd.concat(frames, ignore_index=True)

  # Extract hashtags from DataFrame
  tweets['hashtags'] = tweets['hashtags'].apply(extract_entities)

  # Save raw tweets
  save_csv(tweets, "raw_tweets")

  # Show number of tweets
  print(f"Total number of tweets retrieved: {tweets.shape[0]}")

  # Return tweets
  return tweets

#### Data Cleaning

##### Check impurity

In [None]:
import re

# Identify noise
def impurity(text, min_len=10):
  """Return the fraction of characters in ``text`` that look like markup noise.

  Suspicious characters are the ampersand, hash, angle, curly and square
  brackets and the backslash. ``None`` and texts shorter than ``min_len``
  characters score 0.
  """
  if text is None:
    return 0
  total = len(text)
  if total < min_len:
    return 0
  suspicious = re.findall(r'[&#<>{}\[\]\\]', text)
  return len(suspicious) / total

##### Normalise text with RegEx

In [None]:
import html

# Remove noise with regular expressions
def regex_normalise_text(text):
  """Strip markup residue from ``text`` with a fixed sequence of regex rules.

  HTML escapes are decoded first; then each rule is applied in order and the
  result is stripped of leading and trailing whitespace.
  """
  # (pattern, replacement) pairs, applied strictly in this order
  rules = (
    # tags like <tab>
    (r'<[^<>]*>', ' '),
    # markdown URLs like [Some text](https://....): keep the label only
    (r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1'),
    # text or code in brackets like [0]
    (r'\[[^\[\]]*\]', ' '),
    # standalone sequences of specials, matches &# but not #cool
    (r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' '),
    # standalone sequences of hyphens like --- or ==
    (r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' '),
    # sequences of white spaces
    (r'\s+', ' '),
  )

  # convert html escapes like &amp; to characters
  cleaned = html.unescape(text)
  for pattern, replacement in rules:
    cleaned = re.sub(pattern, replacement, cleaned)

  # return cleaned text
  return cleaned.strip()

##### Add spaces between emoji icons

In [None]:
import emoji

# Add spaces between emojis
def add_space_between_emojis(text):
  """Surround every emoji in ``text`` with single spaces.

  The text is converted to ``:shortcode:`` form, each shortcode is padded with
  spaces, runs of spaces are collapsed to one, and the shortcodes are turned
  back into emojis. Any other ``:word:`` sequence in the text is padded too,
  because it matches the same shortcode pattern.
  """
  text = emoji.demojize(text)
  # Pad each shortcode exactly once. The original looped over findall() and
  # called str.replace per hit, which padded a repeated emoji several times.
  text = re.sub(r":+[a-zA-Z0-9_]+:", r" \g<0> ", text)
  # The original called text.replace('  ', ' ') without keeping the result, so
  # the doubled spaces were never removed.
  text = re.sub(r" {2,}", " ", text)
  return emoji.emojize(text)

##### Normalise text with Textacy

In [None]:
import textacy.preprocessing as tprep

def normalise_text(text):
  """Run the Textacy replace, normalise and remove steps on ``text`` in a fixed order.

  The currency-symbol and number replacements are not applied (they were
  commented out in the original and stay disabled).
  """
  # Replace URLs, hashtags and user handles
  for replace in (tprep.replace.urls,
                  tprep.replace.hashtags,
                  tprep.replace.user_handles):
    text = replace(text)

  for normalise in (tprep.normalize.hyphenated_words,
                    tprep.normalize.quotation_marks,
                    tprep.normalize.unicode,
                    tprep.normalize.whitespace):
    text = normalise(text)

  # Repeated ?, ! and . are normalised one character at a time
  for char in '?!.':
    text = tprep.normalize.repeating_chars(text, chars=char)

  # Punctuation is removed one mark per call, in this order
  for mark in '.,:;':
    text = tprep.remove.punctuation(text, only=mark)

  for remove in (tprep.remove.brackets,
                 tprep.remove.html_tags,
                 tprep.remove.accents):
    text = remove(text)

  return text

##### Change flag emoji to text

In [None]:
import demoji

# Change flag emoji to text
def flag_to_text(text):
  """Replace each flag emoji in ``text`` with its demoji description.

  Only emojis whose description contains ``flag:`` are touched; that prefix
  is removed from the description before it is inserted.
  """
  for symbol, description in demoji.findall(text).items():
    if 'flag:' not in description:
      continue
    if symbol in text:
      text = text.replace(symbol, description.replace('flag:', ''))
  return text

##### Replace characters in text

In [None]:
def custom_clean_text(text):
  """Blank out placeholder tokens and stray symbols, then collapse whitespace runs to one space."""
  # Order matters: the _URL_/_USER_/_TAG_/_NUMBER_ placeholders are handled
  # before the bare underscore
  for junk in ('–', '+', '-', '/', '_URL_', '_USER_', '_TAG_', '_NUMBER_', '"', '*', '—', '_', '•'):
    if junk in text:
      text = text.replace(junk, ' ')
  squeezed = re.sub(r'\s+', ' ', text)
  return squeezed

##### Add spaces between potential words
Don't use it, as it adds spaces between abbreviations and apostrophes.

In [None]:
def add_space_between_potential_words(text):
  """Python3 code to demonstarte working of add space between potential words suing regex() + list comprehension."""
  # printing original list
  text = text.split(' ')
  # using regex() to perform task
  res = [re.sub(r"(\w)([A-Z])", r"\1 \2", ele) for ele in text]
  # printing result
  return ' '.join(res)

##### Case folding text (to lowercase)

In [None]:
from country_list import countries_for_language
list_of_countries = dict(countries_for_language('en'))

def casefolding(text):
  """Lower-case every space-separated word except a few known acronyms.

  The acronyms UK, US, EU and NATO are kept as they are when they appear as a
  whole word. Words are handled one by one, so lower-casing one word can never
  alter another (the original used ``str.replace`` on the whole text, which
  turned e.g. ``A NATO`` into ``a NaTO``).

  Args:
    text: Text to transform.

  Returns:
    The text with the same spacing and all non-acronym words lower-cased.
  """
  acronyms = {'UK', 'US', 'EU', 'NATO'}
  return ' '.join(word if word in acronyms else word.lower()
                  for word in text.split(' '))

##### Remove non-English words
Don't use it, buggy.

In [None]:
import nltk

def remove_non_english_words(text):
  """Drop alphabetic tokens that are not in NLTK's word list; keep all other tokens.

  The text is split with ``nltk.wordpunct_tokenize`` and the kept tokens are
  joined with single spaces.
  """
  vocabulary = set(nltk.corpus.words.words())
  kept = []
  for token in nltk.wordpunct_tokenize(text):
    if token.lower() in vocabulary or not token.isalpha():
      kept.append(token)
  return ' '.join(kept)

##### Standardise country names

In [None]:
def standardise_country_names(text):
  """Standardise country names, e.g., russias --> russia, united kingdom --> uk

  Two kinds of rewrite are applied, in the original order:

  * Fragments that may sit inside a longer token (``russias``,
    ``RussoUkraine``, ``russoukraine``, ``ukraines``) are replaced wherever
    they occur.
  * Whole names (``united kingdom``, ``U K``, ``united states``, ``USA``,
    ``usa``) are replaced only when no letter touches them on either side.
    The original replaced plain substrings, which corrupted unrelated words
    such as ``thousand`` (contains ``usa``) or ``EU Kremlin`` (contains
    ``U K``).

  Args:
    text: Text to rewrite.

  Returns:
    The text with the country names standardised.
  """
  fragments = (
    ('russias', 'russia'),
    ('RussoUkraine', 'Russia Ukraine'),
    ('russoukraine', 'russia ukraine'),
    ('ukraines', 'ukraine'),
  )
  for old, new in fragments:
    text = text.replace(old, new)

  whole_names = (
    ('united kingdom', 'uk'),
    ('U K', 'uk'),
    ('united states', 'us'),
    ('USA', 'us'),
    ('usa', 'us'),
  )
  for old, new in whole_names:
    # Letter-only lookarounds: digits and punctuation next to the name still
    # count as a boundary
    text = re.sub(rf'(?<![A-Za-z]){re.escape(old)}(?![A-Za-z])', new, text)
  return text

##### Normalise dates

In [None]:
def format_date(date):
  """Format a Twitter timestamp as YYYY-MM-DD.

  Args:
    date: Timestamp string such as ``Fri Nov 04 16:26:06 +0000 2022``.

  Returns:
    The date as ``YYYY-MM-DD``. Values that are not strings in that format
    (already formatted dates, ``None``, NaN, ...) are returned unchanged.
  """
  try:
    # Fri Nov 04 16:26:06 +0000 2022
    parsed = datetime.strptime(date, "%a %b %d %H:%M:%S %z %Y")
  except (TypeError, ValueError):
    # TypeError: not a string; ValueError: a string in another format.
    # Anything else is a real error and is no longer swallowed.
    return date
  return parsed.strftime("%Y-%m-%d")

##### Add spaces between words where they potentially should be

In [None]:
def add_space_between_numbers_and_text(text):
  """Add space between numbers and text, e.g., 100million --> 100 million

  Every integer or decimal number in ``text`` is surrounded with one space on
  each side; existing spaces are kept, so doubled spaces can result.
  """
  # Raw string: '\d' and '\.' in a normal string literal are invalid escape
  # sequences and trigger a warning on current Python versions
  return re.sub(r'(\d+(\.\d+)?)', r' \1 ', text)

In [None]:
def add_space_between_apostrophe_and_words(text):
  """Add a space before an apostrophe that is glued between two words.

  Examples: ``then'yes'`` --> ``then 'yes'``, ``then'okay'`` --> ``then 'okay'``.

  Only an apostrophe with at least two letters before it and at least three
  letters after it is treated this way.
  """
  # The word list is taken from the text as it was on entry; the text itself
  # is rewritten while looping, and the candidates are recomputed per word.
  for word in text.split(' '):
    # Candidates: 2+ letters, an apostrophe, then 3+ letters
    for match in re.findall(r"([A-Za-z]{2,}'[A-Za-z]{3,})", text):
      if match in word:
        # str.replace rewrites every occurrence of the matched substring
        text = text.replace(match, match.replace("'", " '"))
  return text

In [None]:
def add_space_between_punctuation_and_words(text, rpl='!?.,;:', steps=3):
  """Add a space after punctuation that is glued between two words.

  Example: ``Hello World!How are you?`` --> ``Hello World! How are you?``

  Args:
    text: Text to process.
    rpl: Punctuation characters to look for. They are inserted verbatim into
      a regex character class.
    steps: Number of full passes over the text.

  Returns:
    The text with a space inserted after each matched punctuation character.
  """
  # NOTE(review): presumably several passes are used so that chains such as
  # a.b.c end up fully separated — confirm.
  for _ in range(0, steps):
    # Words come from the text as it is at the start of this pass
    for word in text.split(' '):
      # Candidates: letters, one character from `rpl`, letters
      for match in re.findall(r"([A-Za-z]+[" + rpl + "][A-Za-z]+)", text):
        if match in word:
          for r in rpl:
            # Only the character actually present in `match` changes anything;
            # for the others match.replace() returns `match` unchanged
            text = text.replace(match, match.replace(r, f"{r} "))
  return text

##### Group all cleaning tasks into one function

In [None]:
def clean_data(tweets):
  """Return a cleaned copy of ``tweets``; the input frame is left untouched.

  The ``text`` column is passed through the cleaning steps in a fixed order,
  an ``impurity`` column is computed from the cleaned text, and ``created_at``
  is reformatted with ``format_date``. ``flag_to_text`` is not applied.
  """
  # Text cleaning steps, applied in this order
  text_steps = (
    add_space_between_emojis,
    regex_normalise_text,
    normalise_text,
    custom_clean_text,
    standardise_country_names,
    add_space_between_numbers_and_text,
    add_space_between_apostrophe_and_words,
    add_space_between_punctuation_and_words,
  )

  tweets_df = tweets.copy()
  for step in text_steps:
    tweets_df['text'] = tweets_df['text'].apply(step)

  # Check the impurity of the cleaned text
  tweets_df['impurity'] = tweets_df['text'].apply(impurity)

  # Additional cleaning of other columns
  tweets_df['created_at'] = tweets_df['created_at'].apply(format_date)

  return tweets_df

#### Data Collection and Cleaning Pipeline

In [None]:
# Collect tweets for all standard-search queries, then clean them.
# `tweets` is rebound to the cleaned copy returned by clean_data, so the raw
# frame is no longer reachable under this name afterwards.
tweets = get_tweets()
tweets = clean_data(tweets)

In [None]:
# Load the existing tweets table (table tweets_v2 in database/tweets_v2.db)
existing_tweets_df = load_sql("tweets_v2", "tweets_v2")
# existing_tweets_df[['text']].sample(3)
print(f"Number of existing tweets: {existing_tweets_df.shape[0]}")

In [None]:
# Merge new tweets with existing tweets.
# drop_duplicates() is called without a subset, so only rows that are
# identical in every column are removed; the concatenated index is kept as is.
# NOTE(review): presumably de-duplicating on the tweet `id` is what is wanted
# here — confirm.
new_tweets_df = pd.concat([existing_tweets_df, tweets]).drop_duplicates()
print(f"Total number of tweets: {new_tweets_df.shape[0]}")

In [None]:
# Save the merged tweets to SQLite. save_sql defaults to action="replace", so
# the existing tweets_v2 table is overwritten with the merged frame.
save_sql(new_tweets_df, "tweets_v2")

In [None]:
# Reload the saved table; `tweets_df` is the frame the feature-engineering
# cells below work on.
tweets_df = load_sql("tweets_v2", "tweets_v2")
print(f"Total number of tweets: {tweets_df.shape[0]}")

#### Feature Engineering

##### Lookup Geo Coordinates

In [None]:
import requests, json

def get_location(geo_id:str) -> "pd.DataFrame | None":
  """Get the location details based on the geo_id returned from Twitter API.

  The centroid of the place is read from ``api.geo_id`` and sent to
  postcodes.io, asking for a single postcode result for that position.

  Args:
    geo_id: Twitter place id (the ``place_id`` of a tweet).

  Returns:
    A one-row DataFrame with the columns geo_id, postcode, country, longitude,
    latitude, region, district and county, or ``None`` when the postcodes.io
    answer contains no usable result.
  """
  place = api.geo_id(geo_id)
  lon, lat = place.centroid[0], place.centroid[1]

  # A timeout keeps one stalled request from hanging the whole lookup loop
  resp = requests.get(f"https://api.postcodes.io/postcodes?lon={lon}&lat={lat}&limit=1", timeout=30)
  payload = json.loads(resp.text)

  try:
    nearest = payload['result'][0]
    # Named `record` instead of `dict` so the builtin is not shadowed
    record = {
      'geo_id': geo_id,
      'postcode': nearest['postcode'],
      'country': nearest['country'],
      'longitude': nearest['longitude'],
      'latitude': nearest['latitude'],
      'region': nearest['region'],
      'district': nearest['admin_district'],
      'county': nearest['admin_county']
    }
  except (KeyError, IndexError, TypeError):
    # 'result' (or one of its fields) is missing, empty or null: no location
    # for this place. Other errors are no longer silently swallowed.
    return None

  return pd.DataFrame.from_dict(record, orient='index').T

In [None]:
tweets_df.loc[tweets_df['place_id'] == '457b4814b4240d87']

In [None]:
# Look up a location for each tweet and collect the one-row results.
# NOTE(review): the 579 offset is presumably where an earlier, interrupted run
# stopped — confirm before re-running from a fresh kernel.
found = []

for key, tweet in tweets_df[579:].iterrows():
  location = get_location(tweet['place_id'])
  if location is not None:
    found.append(location)
  print(location)

# DataFrame.append was removed in pandas 2.0; build the frame once with
# pd.concat instead of growing it row by row.
locations = pd.concat(found, ignore_index=True) if found else pd.DataFrame()

tweets_df = tweets_df.merge(locations, left_on='place_id', right_on='geo_id')
tweets_df.drop('geo_id', axis=1, inplace=True)

In [None]:
# Save locations to a timestamped CSV file in the datasets folder (save_csv
# writes a CSV file, not a database table)
save_csv(locations, "locations")

In [None]:
# Reload a previously saved locations file. The timestamp in the name is
# hard-coded, so this cell needs that exact file to be present in datasets/.
locations = load_csv('locations_20221111_125500')
print(locations.shape)
locations.tail(3)

In [None]:
tweets_df.shape

In [None]:
# This first binding is only an alias of tweets_df (no copy is made); it is
# rebound two statements below.
tweets_df2 = tweets_df
print(tweets_df2.shape)

# reset_index() moves the current index into a new column on both frames.
# `locations` is overwritten in place, so running this cell again adds yet
# another index column to it.
tweets_df2 = tweets_df.reset_index()
locations = locations.reset_index()

In [None]:
# Attach location details to each tweet (inner join: tweets whose place_id has
# no row in `locations` are dropped), then keep one row per distinct text.
tweets_df3 = tweets_df2.merge(locations, how='inner', left_on='place_id', right_on='geo_id')
tweets_df3 = tweets_df3.drop_duplicates(subset=['text'])
# drop() raises KeyError if any listed column is absent.
# NOTE(review): which helper columns exist (index_x, index_y, id_y, level_0)
# presumably depends on how often the reset_index cell above was run — confirm.
tweets_df3.drop(['index_x', 'index_y', 'id_y', 'level_0', 'geo_id', 'impurity'], axis=1, inplace=True)

print(tweets_df3.shape)

In [None]:
# Compare the place_ids before (t1) and after (t2) the location merge
t1 = tweets_df[['place_id']]
t2 = tweets_df3[['place_id']]
print(t1.shape, t2.shape)

# keep=False removes every place_id that occurs more than once in the
# concatenation, leaving only ids that appear exactly once overall.
# NOTE(review): presumably meant to find place_ids still lacking a location;
# ids repeated within tweets_df alone are dropped as well — confirm.
t3 = pd.concat([t1,t2]).drop_duplicates(keep=False)
print(t3.shape)

In [None]:
# Look up a location for each remaining place_id and collect the results
found = []

for key, tweet in t3.iterrows():
  location = get_location(tweet['place_id'])
  if location is not None:
    found.append(location)
  print(location)

# DataFrame.append was removed in pandas 2.0; build the frame once with
# pd.concat instead of growing it row by row.
locations = pd.concat(found, ignore_index=True) if found else pd.DataFrame()

tweets_df4 = tweets_df.merge(locations, left_on='place_id', right_on='geo_id')
tweets_df4.drop('geo_id', axis=1, inplace=True)

In [None]:
# Save the second batch of locations to a timestamped CSV file in datasets/
save_csv(locations, 'locations2')

In [None]:
# Reload both saved location batches. The timestamps in the names are
# hard-coded, so these exact files must be present in datasets/.
locations = load_csv('locations_20221111_125500')
locations2 = load_csv('locations2_20221117_220719')

In [None]:
# Stack both batches; the original row indexes are kept (no ignore_index) and
# no de-duplication is done here.
locations3 = pd.concat([locations, locations2])
locations3.shape

In [None]:
# Attach the combined locations to the tweets (inner join: tweets whose
# place_id has no row in locations3 are dropped), then keep one row per
# distinct text and drop the helper columns.
tweets_df3 = tweets_df.merge(locations3, how='inner', left_on='place_id', right_on='geo_id')
tweets_df3 = tweets_df3.drop_duplicates(subset=['text'])
# drop() raises KeyError if any of these columns is absent
tweets_df3.drop(['id_y', 'geo_id', 'impurity'], axis=1, inplace=True)

print(tweets_df3.shape)
tweets_df3.sample(3)

In [None]:
# Persist the tweets with location details as table tweets_v3, replacing any
# existing table of that name
save_sql(tweets_df3, 'tweets_v3', 'replace')

### Linguistic Processing with spaCy

In [None]:
# Start the linguistic processing from the saved tweets_v3 table
tweets_df3 = load_sql('tweets_v3', 'tweets_v3')

In [None]:
# Example call for a quick manual check:
# casefolding("The war in Ukraine is getting on nerves in some EU countries. Volodymir is pushing for support. UK is not better. What about Papua New Guinea, do they even care?")

# Overwrites the text column in place with its case-folded version
tweets_df3['text'] = tweets_df3['text'].apply(casefolding)
tweets_df3.sample(3)

In [None]:
import spacy

# Load the small English spaCy model and take its default stop-word set,
# which remove_stop_words reads as a module-level name
en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words

def remove_stop_words(text):
  """Drop every whitespace-separated word whose lower-case form is in ``sw_spacy``.

  The remaining words are joined with single spaces.
  """
  kept = filter(lambda word: word.lower() not in sw_spacy, text.split())
  return ' '.join(kept)


In [None]:
print(sw_spacy)

In [None]:
# Overwrites the text column in place with the stop words removed
tweets_df3['text'] = tweets_df3['text'].apply(remove_stop_words)
tweets_df3.sample(3)

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
  """Lemmatise every token of ``text`` and join the results with single spaces.

  Tokens come from ``word_tokenize``; each one is passed to
  ``lemmatizer.lemmatize`` with its default arguments.
  """
  return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

In [None]:
# Overwrites the text column in place with its lemmatised version
tweets_df3['text'] = tweets_df3['text'].apply(lemmatize_words)
tweets_df3[['text']].sample(3)

In [None]:
# Persist the processed tweets as table tweets_v4 (save_sql replaces an
# existing table by default)
save_sql(tweets_df3, 'tweets_v4')

In [None]:
# NOTE(review): tweets_v5 is never written by this notebook (the cell above
# saves tweets_v4), so this load needs a database created elsewhere — confirm
# which version is intended.
tweets_df3 = load_sql('tweets_v5', 'tweets_v5')
save_csv(tweets_df3, 'tweets_v5')

In [None]:
from nltk.tokenize import word_tokenize

def unigram_tokens(text):
  """Tokenise ``text`` with ``word_tokenize`` and join the tokens with ' | '."""
  return ' | '.join(word_tokenize(text))

In [None]:
# New column holding the tokens of each text as one ' | '-separated string
tweets_df3['text_unigrams'] = tweets_df3['text'].apply(unigram_tokens)
tweets_df3[['text_unigrams']].sample(3)

In [None]:
from textacy.extract import token_matches

# Pipeline used by extract_noun_phrases. This loads the same model name as
# the earlier `en` object.
# NOTE(review): presumably `en` could be reused instead of loading twice — confirm.
nlp = spacy.load('en_core_web_sm')

# Complete function for noun phrase extraction based on PoS patterns
def extract_noun_phrases(text):
  """Return the spans of ``text`` that match a noun followed by one or more nouns.

  The text is parsed with ``nlp`` and matched with Textacy's
  ``token_matches``. Each matching span is returned as the lemmas of its
  tokens joined with underscores.
  """
  doc = nlp(text)
  # One pattern per allowed leading part of speech, followed by 1+ nouns
  patterns = [f"POS:{pos} POS:NOUN:+" for pos in ('NOUN',)]
  return ['_'.join(token.lemma_ for token in span)
          for span in token_matches(doc, patterns=patterns)]
  

In [None]:
# New column holding, per tweet, the list returned by extract_noun_phrases
tweets_df3['text_bigrams'] = tweets_df3['text'].apply(extract_noun_phrases)

In [None]:
tweets_df3[['text', 'text_bigrams']].sample(3)