# Preprocessing Data on Twitter

## 1. Importing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
tweets = pd.read_csv('data/data_collection_tweets.csv')

In [3]:
tweets

Unnamed: 0,date,content
0,2017-01-01 23:59:23+00:00,#solarenergy #solarpower #Solarroof #Teslaroof...
1,2017-01-01 23:55:25+00:00,@realDonaldTrump Do you believe in #climatechange
2,2017-01-01 23:54:39+00:00,"Belief in #ClimateChange, #IntelligenceAgencie..."
3,2017-01-01 23:54:19+00:00,US #Wisconsin—Dept Natural Resources' website ...
4,2017-01-01 23:54:11+00:00,The latest GreenerRob's Daily! https://t.co/Yy...
...,...,...
246987,2020-01-31 23:00:42+00:00,Don't let the Council on Environmental Quality...
246988,2020-01-31 23:00:00+00:00,"Without the National Environmental Policy Act,..."
246989,2020-01-31 22:57:11+00:00,@SierraClub @bruneski Environmental Policy Act...
246990,2020-01-31 22:52:30+00:00,The National Environmental Policy Act is under...


## 2. Preprocessing Data

### 2.1. Cleaning Data

In [4]:
# Convert datetime data type
tweets['date'] = pd.to_datetime(tweets['date']).dt.date

### 2.2. Preprocessing Text

In [5]:
# Set environment for text preprocessing
import re
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK stopwords and punkt
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcelzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Replace missing values with an empty string
tweets['content'] = tweets['content'].fillna("")

In [7]:
# define a function to preprocess text
def clean_text(text):
    # Remove all URLs
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'https\S+', ' ', text)
    # Remove all '@' and '#'
    text = re.sub(r'[@#]', ' ', text)
    # Remove all emojis
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove all non-Unicode characters
    #text = re.sub(r'[^\p{L}\s]', ' ', text)
    # Replace all underscores with spaces
    text = text.replace('_', ' ')
    # Remove all extra whitespaces
    text = re.sub(r'\s{2,}', ' ', text).strip()
    # Remove all stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split(' ') if word not in stop_words])
    return text

In [8]:
# apply the function to preprocess text
tweets['clean_text'] = tweets.content.apply(clean_text)

In [9]:
# remove duplicate based on clean_text
tweets.drop_duplicates(subset=['clean_text'], keep='first', inplace=True)

In [10]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 218104 entries, 0 to 246989
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        218104 non-null  object
 1   content     218104 non-null  object
 2   clean_text  218104 non-null  object
dtypes: object(3)
memory usage: 6.7+ MB


In [11]:
tweets.head(30)

Unnamed: 0,date,content,clean_text
0,2017-01-01,#solarenergy #solarpower #Solarroof #Teslaroof...,solarenergy solarpower Solarroof Teslaroof ren...
1,2017-01-01,@realDonaldTrump Do you believe in #climatechange,realDonaldTrump Do believe climatechange
2,2017-01-01,"Belief in #ClimateChange, #IntelligenceAgencie...","Belief ClimateChange, IntelligenceAgencies , E..."
3,2017-01-01,US #Wisconsin—Dept Natural Resources' website ...,US WisconsinDept Natural Resources' website de...
4,2017-01-01,The latest GreenerRob's Daily! https://t.co/Yy...,The latest GreenerRob's Daily! climatechange
5,2017-01-01,The #climatechange hoax has been doing the rou...,The climatechange hoax rounds years yet im sti...
6,2017-01-01,#nswpol #westconnex #climatechange https://t.c...,nswpol westconnex climatechange
7,2017-01-01,.@4e4_Network \nAt least we fixed #climatechan...,. 4e4 Network At least fixed climatechange ......
8,2017-01-01,My @journalsentinel column this wk on climate ...,My journalsentinel column wk climate change de...
9,2017-01-01,The Indian Express After Donald Trump blows to...,The Indian Express After Donald Trump blows cl...


### 2.3. Removing Short Text

In [12]:
# calculate length of each text
tweets['length'] = list(map(lambda x: len(str(x).split()), tweets['clean_text']))

In [13]:
tweets.describe()

Unnamed: 0,length
count,218104.0
mean,16.992109
std,8.842496
min,0.0
25%,11.0
50%,15.0
75%,23.0
max,99.0


In [14]:
# get the first quantile of the length column
q1 = tweets['length'].quantile(q=0.25)
q1

11.0

In [15]:
tweets = tweets[tweets['length'] > q1]
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150679 entries, 2 to 246989
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        150679 non-null  object
 1   content     150679 non-null  object
 2   clean_text  150679 non-null  object
 3   length      150679 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 5.7+ MB


In [16]:
tweets

Unnamed: 0,date,content,clean_text,length
2,2017-01-01,"Belief in #ClimateChange, #IntelligenceAgencie...","Belief ClimateChange, IntelligenceAgencies , E...",13
3,2017-01-01,US #Wisconsin—Dept Natural Resources' website ...,US WisconsinDept Natural Resources' website de...,12
5,2017-01-01,The #climatechange hoax has been doing the rou...,The climatechange hoax rounds years yet im sti...,14
8,2017-01-01,My @journalsentinel column this wk on climate ...,My journalsentinel column wk climate change de...,12
9,2017-01-01,The Indian Express After Donald Trump blows to...,The Indian Express After Donald Trump blows cl...,16
...,...,...,...,...
246983,2020-01-31,The National Environmental Policy Act ensures ...,The National Environmental Policy Act ensures ...,27
246984,2020-01-31,"@Amy_Siskind Objectively, if Dems are serious ...","Amy Siskind Objectively, Dems serious making c...",28
246986,2020-01-31,New #Job : Environmental Policy Associate - Fr...,New Job : Environmental Policy Associate - Fro...,13
246988,2020-01-31,"Without the National Environmental Policy Act,...","Without National Environmental Policy Act, can...",24


In [17]:
# sort the DataFrame tweets_df by date in ascending order
tweets = tweets.sort_values('date')

In [18]:
tweets.reset_index(drop = True)

Unnamed: 0,date,content,clean_text,length
0,2017-01-01,"Belief in #ClimateChange, #IntelligenceAgencie...","Belief ClimateChange, IntelligenceAgencies , E...",13
1,2017-01-01,Climate-Denying Trump's GOP Undermine The Cons...,Climate-Denying Trump's GOP Undermine The Cons...,18
2,2017-01-01,"'Instead of focusing on austerity alone, we se...","'Instead focusing austerity alone, see real ch...",14
3,2017-01-01,Totally agree. Are we looking to reduce carbon...,Totally agree. Are looking reduce carbon footp...,12
4,2017-01-01,@RachelNotley @shoffmanAB just want to make su...,RachelNotley shoffmanAB want make sure I freez...,12
...,...,...,...,...
150674,2020-01-31,Wetland biodiversity matters for life.\n2 Febr...,Wetland biodiversity matters life.\n2 February...,20
150675,2020-01-31,Please support @savewarrenfarm. Spread the wor...,Please support savewarrenfarm. Spread word don...,18
150676,2020-01-31,"150,000 Botanical and Animal Illustrations Ava...","150,000 Botanical Animal Illustrations Availab...",12
150677,2020-01-31,I thought it was all just a really bad dream.....,"I thought really bad dream... wars, al-Assad, ...",30


In [19]:
tweets.to_csv('data/tweets_clean.csv', index=False)