In [2]:
#Load the packages
import pandas as pd
import string
import re
import nltk
import spacy
from nltk.corpus import stopwords

In [27]:
# Download necessary resources
nltk.download('stopwords')
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model could not be loaded, so install it and retry. The install goes
    # through spaCy's Python API so it targets the interpreter this kernel is
    # running; a bare `!python` is resolved via PATH and may point at a
    # different environment.
    from spacy.cli import download as download_spacy_model

    print("Downloading en_core_web_sm model...")
    download_spacy_model("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# Load dataset
file_path = "twitterdd.csv"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): the run still stops here, but with a
    # clear message and the original traceback attached.
    raise FileNotFoundError(
        f"Error: File not found at '{file_path}'. Please provide the correct path."
    ) from err

In [29]:
df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [30]:
# Let's check the text column
df.text.sample(10)

Unnamed: 0,text
1444,This is Trevith. He's a Swiss Mountain Roadwoo...
1635,Someone help the girl is being mugged. Several...
1720,Say hello to Kawhi. He was doing fine until hi...
1046,Say hello to Bentley and Millie. They do every...
26,This is Maya. She's very shy. Rarely leaves he...
1493,"""Hello yes could I get one pupper to go please..."
1241,This is Chester. He's clearly in charge of the...
625,This is Brody. He's trying to make the same fa...
251,PUPDATE: I'm proud to announce that Toby is 23...
266,RT @dog_rates: This is Ken. His cheeks are mag...


In [31]:
# Let's check the source column
df.source.sample(10)

Unnamed: 0,source
547,"<a href=""http://twitter.com/download/iphone"" r..."
106,"<a href=""http://twitter.com/download/iphone"" r..."
1636,"<a href=""http://twitter.com/download/iphone"" r..."
1630,"<a href=""http://twitter.com/download/iphone"" r..."
1560,"<a href=""http://twitter.com/download/iphone"" r..."
734,"<a href=""http://twitter.com/download/iphone"" r..."
1711,"<a href=""http://twitter.com/download/iphone"" r..."
1733,"<a href=""http://twitter.com/download/iphone"" r..."
800,"<a href=""http://twitter.com/download/iphone"" r..."
2320,"<a href=""http://twitter.com/download/iphone"" r..."


In [32]:
# Drop the columns that are not needed for the text analysis.
columns_to_remove = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'timestamp',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
    'expanded_urls',
]
# errors='ignore' skips any listed name that is absent from the frame.
df = df.drop(columns=columns_to_remove, errors='ignore')


# Names of the two columns that get cleaned.
text_column, source_column = "text", "source"

# Text preprocessing functions
def preprocess_text(text):
    """Run the full cleaning pipeline on a single piece of text.

    Steps, in order: lowercase, strip URLs, strip punctuation, drop
    stopwords, strip emojis, lemmatize.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text, or "" when ``text`` is not a string (for example
        ``None`` or a float NaN).
    """
    # Reject non-strings explicitly. A blanket ``except AttributeError`` around
    # the whole pipeline would also hide genuine bugs inside the helpers.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = remove_urls(text)
    # NOTE(review): punctuation (including apostrophes) is stripped before
    # stopword removal, so a form like "don't" reaches remove_stopwords as
    # "dont" - confirm this ordering is intended.
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = remove_emojis(text)
    text = lemmatize_text(text)
    return text


def remove_urls(text):
    """Strip web links from ``text``.

    A link is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character. Surrounding whitespace is
    left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are removed, not replaced with a space, so the text on either
    side of a punctuation mark is joined together.
    """
    unwanted = frozenset(string.punctuation)
    return ''.join(ch for ch in text if ch not in unwanted)

def remove_stopwords(text):
    """Drop English stopwords from whitespace-separated ``text``.

    The text is split on whitespace, every word found in NLTK's English
    stopword list is discarded, and the survivors are re-joined with single
    spaces. Matching is exact, so it is case-sensitive.

    The stopword list is loaded on the first call and cached on the function
    object, so it is not rebuilt on every call.
    """
    stop_words = getattr(remove_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

# Compiled once at definition time. The class is limited to emoji-related
# code points; it deliberately contains no catch-all range, because one wide
# range (e.g. U+24C2..U+1F251) would also delete CJK, Hangul and other
# non-emoji text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F000-\U0001FAFF"  # supplementary-plane pictographs, emoticons, transport, flags
    "\U000E0020-\U000E007F"  # tag characters used in subdivision-flag sequences
    "\u2500-\u2BEF"          # box drawing, geometric shapes, misc symbols, dingbats, arrows
    "\u231a\u23cf\u23e9"     # watch, eject symbol, fast-forward
    "\u24c2"                 # circled M
    "\u3030\u303d"           # wavy dash, part alternation mark
    "\u3297\u3299"           # circled ideograph "congratulations" / "secret"
    "\u200d"                 # zero-width joiner (glues emoji sequences)
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Remove emojis from text.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted with no
    replacement. Text outside those ranges, including non-Latin scripts, is
    returned unchanged.
    """
    return _EMOJI_PATTERN.sub(r'', text)

def lemmatize_text(text):
    """Return the space-joined lemmas of the non-stopword tokens in ``text``.

    ``text`` is parsed with the module-level spaCy pipeline ``nlp``; tokens
    flagged ``is_stop`` are skipped and the ``lemma_`` of each remaining token
    is kept.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue  # stop words are dropped here as well
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

def clean_source(source):
    """Extract the text between ``>`` and ``</a>`` in a source value.

    Args:
        source: A cell from the source column.

    Returns:
        The shortest text between the first ``>`` and a following ``</a>``.
        ``source`` is returned unchanged when it contains no such span or is
        not a string.
    """
    # re.search raises TypeError (not AttributeError) on non-string input, so
    # check the type explicitly instead of relying on an except clause.
    if not isinstance(source, str):
        return source
    match = re.search(r'>(.*?)</a>', source)
    return match.group(1) if match else source

# Apply text preprocessing. Missing values are replaced with "" first:
# casting NaN straight to str would produce the literal word "nan", which
# would then be cleaned and stored as if it were tweet text.
df[text_column] = df[text_column].fillna("").astype(str).apply(preprocess_text)

# Clean source column if present. Missing values stay missing instead of
# becoming the string "nan".
if source_column in df.columns:
    raw_source = df[source_column]
    df[source_column] = raw_source.where(
        raw_source.isna(),  # keep the original (missing) value where True
        raw_source.astype(str).apply(clean_source),
    )


In [33]:
# Write the cleaned frame to disk without the index, then confirm.
output_path = "cleaned_twitter.csv"
df.to_csv(output_path, index=False)
print(f"Preprocessing complete. Cleaned data saved as {output_path}")

Preprocessing complete. Cleaned data saved as cleaned_twitter.csv


In [34]:
# View the head of the cleaned dataset
clean_data = pd.read_csv("cleaned_twitter.csv")
clean_data.head()

Unnamed: 0,tweet_id,source,text,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,Twitter for iPhone,phinea s mystical boy appear hole donut 1310,13,10,Phineas,,,,
1,892177421306343426,Twitter for iPhone,tilly s check pup hope ok s available pat snug...,13,10,Tilly,,,,
2,891815181378084864,Twitter for iPhone,archie rare norwegian pounce corgo live tall g...,12,10,Archie,,,,
3,891689557279858688,Twitter for iPhone,darla commence snooze mid meal 1310 happen good,13,10,Darla,,,,
4,891327558926688256,Twitter for iPhone,franklin like stop call cute fierce shark resp...,12,10,Franklin,,,,


In [35]:
clean_data.text.sample(10)

Unnamed: 0,text
1490,fuck system 1010
1672,meet brody s downton abbey falsetto addict gra...
699,arnie s afraid bark 1210 comfort
2063,anthony finish master harvard unprofessional t...
1722,ozzy wake 2 minute ready christmas party 910 c...
1315,hello katie s mitsubishi hufflepuff curly af 1...
678,stella s happy 1010 trade life
1712,uncover entire battalion holiday pupper averag...
2337,concerned fellow dog trap computer 1010
1154,pup remove cuz not fair opposing team 1310 abs...


In [36]:
clean_data.source.sample(10)

Unnamed: 0,source
1934,Twitter for iPhone
1871,Twitter for iPhone
824,Twitter for iPhone
2239,Twitter for iPhone
976,TweetDeck
2213,Twitter for iPhone
1890,Twitter for iPhone
419,Twitter for iPhone
2234,Twitter for iPhone
242,Twitter for iPhone
