In [15]:
import pandas as pd

In [16]:
df = pd.read_csv('top100_climatechange.csv')

In [17]:
import re

def clean_text(text):

    # Remove all URLs
    text = re.sub(r'http\S+', '', text)
    # Remove all mentions
    text = re.sub(r'@\w+', '', text)
    # Remove all hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove all punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove all digits
    text = re.sub(r'\d+', '', text)
    # Remove all sentences that start with ">"
    text = re.sub(r'^>', '', text, flags=re.M)
    # Remove all emojis
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove all extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# Replace missing values with an empty string
df['Text'] = df['Text'].fillna("")

# Apply preprocess_text function to the DataFrame
df['Cleaned_Text'] = df['Text'].apply(clean_text)

In [18]:
df

Unnamed: 0,Title,Text,Cleaned_Text
0,I'm afraid climate change is going to kill me!...,Feeling scared? Have you been listening to or ...,Feeling scared Have you been listening to or r...
1,,Let's use this space to discuss some of the mo...,Lets use this space to discuss some of the mos...
2,,> Feeling scared? Have you been listening to o...,Feeling scared Have you been listening to or r...
3,,So I know this is a late response to this thre...,So I know this is a late response to this thre...
4,,climate change isn't just going to pop up and ...,climate change isnt just going to pop up and m...
...,...,...,...
5070,,Figure 30 has 100 MW installations at 0.94 per...,Figure has MW installations at per Watt fixed ...
5071,,There are lots of ways to reduce water lost to...,There are lots of ways to reduce water lost to...
5072,,"Cyanide CN, carbon dioxide CO2. We can refer t...",Cyanide CN carbon dioxide CO We can refer to b...
5073,,There was a study with a compound in both coff...,There was a study with a compound in both coff...


## removing text which is too short

In [20]:
df['length'] = list(map(lambda x: len(str(x).split()), df['Cleaned_Text']))

m = df['length'].median()
m

df = df[df['length'] > m]
df.shape

(2502, 4)

In [21]:
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK stopwords and punkt
nltk.download('stopwords')
nltk.download('punkt')

# Define function to preprocess text
def preprocess_text(text):
    # Check if text is NaN
    if type(text) != str:
        return ""
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    # Remove duplicates
    words = list(set(words))
    # Join words back into a string
    text = ' '.join(words)
    return text

# Apply preprocess_text function to the DataFrame
df['Preprocessed_Text'] = df['Cleaned_Text'].apply(preprocess_text)

# Print the cleaned DataFrame
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/SimoneBroggini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/SimoneBroggini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                               Title  \
0  I'm afraid climate change is going to kill me!...   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                                                Text  \
0  Feeling scared? Have you been listening to or ...   
1  Let's use this space to discuss some of the mo...   
2  > Feeling scared? Have you been listening to o...   
3  So I know this is a late response to this thre...   
4  climate change isn't just going to pop up and ...   

                                        Cleaned_Text  length  \
0  Feeling scared Have you been listening to or r...      93   
1  Lets use this space to discuss some of the mos...      54   
2  Feeling scared Have you been listening to or r...      33   
3  So I know this is a late response to this thre...     648   
4  cli

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Preprocessed_Text'] = df['Cleaned_Text'].apply(preprocess_text)


In [22]:
df

Unnamed: 0,Title,Text,Cleaned_Text,length,Preprocessed_Text
0,I'm afraid climate change is going to kill me!...,Feeling scared? Have you been listening to or ...,Feeling scared Have you been listening to or r...,93,related concerns like answers even kill thread...
1,,Let's use this space to discuss some of the mo...,Lets use this space to discuss some of the mos...,54,concerns pose free people structure well inclu...
2,,> Feeling scared? Have you been listening to o...,Feeling scared Have you been listening to or r...,33,one sources arent family vibes humanity infome...
3,,So I know this is a late response to this thre...,So I know this is a late response to this thre...,648,anxious scenario lot recap viewpoints like ass...
4,,climate change isn't just going to pop up and ...,climate change isnt just going to pop up and m...,876,economic scenario embracing needed periods lot...
...,...,...,...,...,...
5066,,"They are not floating. What do you mean bin ""...",They are not floating What do you mean bin ant...,51,find sensitivity ecosystems floating thing cal...
5068,,"According to Wikipedia, the free encyclopedia ...",According to Wikipedia the free encyclopedia t...,38,edit cocoa anyone removed cyanide ill wikipedi...
5070,,Figure 30 has 100 MW installations at 0.94 per...,Figure has MW installations at per Watt fixed ...,32,doesnt different representation even axis youv...
5071,,There are lots of ways to reduce water lost to...,There are lots of ways to reduce water lost to...,38,lots edit reduce lost water solar ethods reduc...


In [23]:
df.to_csv('test_3.csv', index=False)