In [1]:
import numpy as np
import pandas as pd

In [2]:
tweets_df  = pd.read_csv('C:/Users/dengd/Documents/GitHub/Final_project/climate_tweets.csv')

In [3]:
tweets_df 

Unnamed: 0,date,content
0,2017-01-01 23:59:23+00:00,#solarenergy #solarpower #Solarroof #Teslaroof...
1,2017-01-01 23:55:25+00:00,@realDonaldTrump Do you believe in #climatechange
2,2017-01-01 23:54:39+00:00,"Belief in #ClimateChange, #IntelligenceAgencie..."
3,2017-01-01 23:54:19+00:00,US #Wisconsin—Dept Natural Resources' website ...
4,2017-01-01 23:54:11+00:00,The latest GreenerRob's Daily! https://t.co/Yy...
...,...,...
246987,2020-01-31 23:00:42+00:00,Don't let the Council on Environmental Quality...
246988,2020-01-31 23:00:00+00:00,"Without the National Environmental Policy Act,..."
246989,2020-01-31 22:57:11+00:00,@SierraClub @bruneski Environmental Policy Act...
246990,2020-01-31 22:52:30+00:00,The National Environmental Policy Act is under...


In [4]:
import re

def clean_text(text):

    # Remove all URLs
    text = re.sub(r'http\S+', '', text)
    # Remove all mentions
    text = re.sub(r'@\w+', '', text)
    # Remove all hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove all punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove all digits
    text = re.sub(r'\d+', '', text)
    # Remove all sentences that start with ">"
    text = re.sub(r'^>', '', text, flags=re.M)
    # Remove all emojis
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove all extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# Replace missing values with an empty string
tweets_df ['content'] = tweets_df ['content'].fillna("")

# Apply preprocess_text function to the DataFrame
tweets_df ['Cleaned_Text'] = tweets_df ['content'].apply(clean_text)

In [5]:
tweets_df 

Unnamed: 0,date,content,Cleaned_Text
0,2017-01-01 23:59:23+00:00,#solarenergy #solarpower #Solarroof #Teslaroof...,
1,2017-01-01 23:55:25+00:00,@realDonaldTrump Do you believe in #climatechange,Do you believe in
2,2017-01-01 23:54:39+00:00,"Belief in #ClimateChange, #IntelligenceAgencie...",Belief in if knows what it is But Tweets
3,2017-01-01 23:54:19+00:00,US #Wisconsin—Dept Natural Resources' website ...,US Dept Natural Resources website deletes info...
4,2017-01-01 23:54:11+00:00,The latest GreenerRob's Daily! https://t.co/Yy...,The latest GreenerRobs Daily
...,...,...,...
246987,2020-01-31 23:00:42+00:00,Don't let the Council on Environmental Quality...,Dont let the Council on Environmental Quality ...
246988,2020-01-31 23:00:00+00:00,"Without the National Environmental Policy Act,...",Without the National Environmental Policy Act ...
246989,2020-01-31 22:57:11+00:00,@SierraClub @bruneski Environmental Policy Act...,Environmental Policy Act is not democracy in a...
246990,2020-01-31 22:52:30+00:00,The National Environmental Policy Act is under...,The National Environmental Policy Act is under...


## removing text which is too short

In [6]:
tweets_df['length'] = list(map(lambda x: len(str(x).split()), tweets_df ['Cleaned_Text']))

m = tweets_df ['length'].median()
m

tweets_df  = tweets_df[tweets_df['length'] > m]
tweets_df.shape

(122746, 4)

In [7]:
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK stopwords and punkt
nltk.download('stopwords')
nltk.download('punkt')

# Define function to preprocess text
def preprocess_text(text):
    # Check if text is NaN
    if type(text) != str:
        return ""
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    # Remove duplicates
    words = list(set(words))
    # Join words back into a string
    text = ' '.join(words)
    return text

# Apply preprocess_text function to the DataFrame
tweets_df ['Preprocessed_Text'] = tweets_df ['Cleaned_Text'].apply(preprocess_text)

# Print the cleaned DataFrame
print(tweets_df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dengd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dengd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                         date  \
5   2017-01-01 23:50:22+00:00   
14  2017-01-01 23:44:32+00:00   
39  2017-01-02 23:49:48+00:00   
48  2017-01-03 23:52:58+00:00   
74  2017-01-04 23:48:43+00:00   

                                              content  \
5   The #climatechange hoax has been doing the rou...   
14  Let's all join defending non-human animal righ...   
39  . @deepgreendesign I guess some people just re...   
48  Learned about the huge impact of #climatechang...   
74  Ég hætti einu sinni að deita gæja þvi hann trú...   

                                         Cleaned_Text  length  \
5   The hoax has been doing the rounds for years y...      26   
14  Lets all join defending nonhuman animal rights...      18   
39  I guess some people just really want to be a p...      22   
48  Learned about the huge impact of on migration ...      20   
74  g htti einu sinni a deita gja vi hann tri ekki...      25   

                                    Preprocessed_Text  
5   years h

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df ['Preprocessed_Text'] = tweets_df ['Cleaned_Text'].apply(preprocess_text)


In [8]:
# drop duplicate rows 
tweets_df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df.drop_duplicates(inplace=True)


In [9]:
# sort the DataFrame tweets_df by date in ascending order
tweets_df = tweets_df.sort_values('date')

In [10]:
tweets_df.reset_index(drop = True)

Unnamed: 0,date,content,Cleaned_Text,length,Preprocessed_Text
0,2017-01-01 01:45:02+00:00,Climate Crisis Reading List &gt; How do we thi...,Climate Crisis Reading List gt How do we think...,18,crisis jan think climate catastrophe gt list b...
1,2017-01-01 03:00:06+00:00,"@realDonaldTrump #Donald, there's no climate c...",theres no climate crisis talk to Rohraquacker ...,19,iceberg b talk crisis hell rohraquacker climat...
2,2017-01-01 12:24:18+00:00,When Donald Trump takes office he will face a ...,When Donald Trump takes office he will face a ...,20,crisis takes face seriously world office trump...
3,2017-01-01 15:46:47+00:00,Except this bravado ignores that RW populists ...,Except this bravado ignores that RW populists ...,18,populists climate must oppose except threat rw...
4,2017-01-01 16:45:01+00:00,Climate Crisis Reading List &gt; How do we thi...,Climate Crisis Reading List gt How do we think...,18,crisis jan think climate catastrophe gt list b...
...,...,...,...,...,...
121234,2020-01-31 23:58:01+00:00,The amazing Greta Thunberg - an incredibly mat...,The amazing Greta Thunberg an incredibly matur...,31,board get impact mature greta amazing australi...
121235,2020-01-31 23:58:26+00:00,@BackupMister is to take exclusivity contracts...,is to take exclusivity contracts like Crossfir...,30,company projects worry contracts control witho...
121236,2020-01-31 23:59:00+00:00,@mattgallowaycbc You failed to challenge any a...,You failed to challenge any assumptions propos...,35,crisis position climate disservice failed envi...
121237,2020-01-31 23:59:34+00:00,The Clínica del Sur health center is located i...,The Clnica del Sur health center is located in...,23,sur sumac located del leed pursuing clnica med...


In [11]:
tweets_df .to_csv('tweets_df_cleaned.csv', index=False)