In [1]:
import pandas as pd

# Load the dataset from a hardcoded absolute path.
# NOTE(review): '/content/...' looks like a Colab runtime path — confirm before running elsewhere.
df = pd.read_csv('/content/sample.csv')

# Add a lowercased copy of the 'text' column as 'text_lower'; 'text' itself is left untouched
# and is read again by the URL / HTML cells further down.
df['text_lower'] = df['text'].str.lower()

# Preview the original and lowercased columns side by side.
print(df[['text', 'text_lower']].head())


                                                text  \
0  @AppleSupport causing the reply to be disregar...   
1  @105835 Your business means a lot to us. Pleas...   
2  @76328 I really hope you all change but I'm su...   
3  @105836 LiveChat is online at the moment - htt...   
4  @VirginTrains see attached error message. I've...   

                                          text_lower  
0  @applesupport causing the reply to be disregar...  
1  @105835 your business means a lot to us. pleas...  
2  @76328 i really hope you all change but i'm su...  
3  @105836 livechat is online at the moment - htt...  
4  @virgintrains see attached error message. i've...  


In [2]:
import string

# Deletion table for every ASCII punctuation character, built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with every character in `string.punctuation` removed.

    Args:
        text: The string to clean. Non-string values (for example the NaN that
            pandas uses for a missing cell) are accepted and passed through.

    Returns:
        The cleaned string, or `text` itself when it is not a string. Passing
        missing values through mirrors how the pandas `.str` accessor used to
        build 'text_lower' treats them, so one empty cell does not abort the
        whole `.apply`.
    """
    if not isinstance(text, str):
        return text
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from the lowercased text, row by row, into a new column.
# Note: this also removes '@' and the punctuation inside URLs, so mentions and links
# become plain tokens (visible in the preview below).
df['no_punctuation'] = df['text_lower'].apply(remove_punctuation)

# Preview lowercased vs. punctuation-free text.
print(df[['text_lower', 'no_punctuation']].head())


                                          text_lower  \
0  @applesupport causing the reply to be disregar...   
1  @105835 your business means a lot to us. pleas...   
2  @76328 i really hope you all change but i'm su...   
3  @105836 livechat is online at the moment - htt...   
4  @virgintrains see attached error message. i've...   

                                      no_punctuation  
0  applesupport causing the reply to be disregard...  
1  105835 your business means a lot to us please ...  
2  76328 i really hope you all change but im sure...  
3  105836 livechat is online at the moment  https...  
4  virgintrains see attached error message ive tr...  


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on, in a fixed order.
# 'punkt_tab' was added to resolve a LookupError.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# English stopwords held in a set for fast membership checks in remove_stopwords.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with every token found in `stop_words` dropped.

    The text is split with `word_tokenize`; each token is lowercased before
    being checked against the module-level `stop_words` set, and the tokens
    that survive are re-joined with single spaces.
    """
    return " ".join(
        token for token in word_tokenize(text) if token.lower() not in stop_words
    )

# Drop stopwords from the punctuation-free text into a new column.
# NOTE(review): punctuation was stripped first, so contractions arrive here as
# "im", "wont", "dont" and are not filtered out (see row 2 of the preview) —
# consider removing stopwords before punctuation.
df['no_stopwords'] = df['no_punctuation'].apply(remove_stopwords)

# Preview the text before and after stopword removal.
print(df[['no_punctuation', 'no_stopwords']].head())

                                      no_punctuation  \
0  applesupport causing the reply to be disregard...   
1  105835 your business means a lot to us please ...   
2  76328 i really hope you all change but im sure...   
3  105836 livechat is online at the moment  https...   
4  virgintrains see attached error message ive tr...   

                                        no_stopwords  
0  applesupport causing reply disregarded tapped ...  
1  105835 business means lot us please dm name zi...  
2         76328 really hope change im sure wont dont  
3  105836 livechat online moment httpstcosy94vtu8...  
4  virgintrains see attached error message ive tr...  


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [6]:
from nltk.stem import PorterStemmer

# Single Porter stemmer instance, created once and reused by stem_text.
ps = PorterStemmer()

def stem_text(text):
    """Tokenize `text` and return its Porter-stemmed tokens joined by single spaces.

    Relies on `word_tokenize` (imported in the stopword cell) and the
    module-level stemmer `ps`.
    """
    return " ".join(ps.stem(token) for token in word_tokenize(text))

# Small demo sentence with inflected forms; the lemmatization cell reuses it.
sample_sentence = "running runs easily studies studying"
print("Original:", sample_sentence)
# Stemming truncates to a root form that need not be a real word (e.g. "easili", "studi").
print("Stemmed:", stem_text(sample_sentence))


Original: running runs easily studies studying
Stemmed: run run easili studi studi


In [7]:
import spacy

# Load the spaCy pipeline once; spacy_lemmatize reuses this object on every call.
# NOTE(review): no install/download step for "en_core_web_sm" is visible in this
# notebook — the model presumably has to be present in the environment already.
nlp = spacy.load("en_core_web_sm")

def spacy_lemmatize(text):
    """Run `text` through the `nlp` pipeline and return the token lemmas, space-joined."""
    lemmas = [tok.lemma_ for tok in nlp(text)]
    return " ".join(lemmas)

# Lemmatize the same sample used in the stemming demo.
# `sample_sentence` is defined in that earlier cell, so it must have been run first.
print("SpaCy Lemmatized:", spacy_lemmatize(sample_sentence))


SpaCy Lemmatized: run run easily study study


In [8]:
import re

# Matches scheme-qualified links (http:// or https://) and bare "www." hosts.
# Requiring "://" or "www." at a word boundary keeps ordinary words such as
# "httpd" or "awwwesome" intact. Compiled once so it is not re-parsed for every row.
_URL_PATTERN = re.compile(r'\bhttps?://\S+|\bwww\.\S+', flags=re.IGNORECASE)


def remove_urls(text):
    """Remove URLs from `text`.

    A URL is anything starting with `http://`, `https://` or `www.` (matched
    case-insensitively) and running up to the next whitespace character. The
    whitespace around a removed URL is left as-is.

    Args:
        text: The string to clean. Non-string values (for example the NaN that
            pandas uses for a missing cell) are accepted and passed through.

    Returns:
        The string without URLs, or `text` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return _URL_PATTERN.sub('', text)

# Strip URLs from the raw 'text' column (not from the cleaned columns built earlier).
# NOTE(review): the punctuation step already ran on text that still contained URLs,
# leaving fused tokens such as "httpstcosy94vtu8..." in 'no_stopwords' — URL removal
# likely belongs before punctuation removal.
df['no_urls'] = df['text'].apply(remove_urls)

# Preview the text before and after URL removal.
print(df[['text', 'no_urls']].head())


                                                text  \
0  @AppleSupport causing the reply to be disregar...   
1  @105835 Your business means a lot to us. Pleas...   
2  @76328 I really hope you all change but I'm su...   
3  @105836 LiveChat is online at the moment - htt...   
4  @VirginTrains see attached error message. I've...   

                                             no_urls  
0  @AppleSupport causing the reply to be disregar...  
1  @105835 Your business means a lot to us. Pleas...  
2  @76328 I really hope you all change but I'm su...  
3  @105836 LiveChat is online at the moment -  or...  
4  @VirginTrains see attached error message. I've...  


In [9]:
# `[^>]*` matches the same spans as a lazy `.*?` but also crosses newlines, so a
# tag broken over several lines is still removed. Compiled once for reuse.
_HTML_TAG_PATTERN = re.compile(r'<[^>]*>')


def remove_html_tags(text):
    """Remove every `<...>` span from `text`.

    This is a plain pattern strip, not an HTML parser: anything between a `<`
    and the next `>` is deleted, and the text between tags is kept.

    Args:
        text: The string to clean. Non-string values (for example the NaN that
            pandas uses for a missing cell) are accepted and passed through.

    Returns:
        The string without tags, or `text` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return _HTML_TAG_PATTERN.sub('', text)

# Strip HTML tags from the raw 'text' column into 'no_html'.
df['no_html'] = df['text'].apply(remove_html_tags)

# Preview before/after; in the (truncated) rows shown, the two columns look the same.
print(df[['text', 'no_html']].head())


                                                text  \
0  @AppleSupport causing the reply to be disregar...   
1  @105835 Your business means a lot to us. Pleas...   
2  @76328 I really hope you all change but I'm su...   
3  @105836 LiveChat is online at the moment - htt...   
4  @VirginTrains see attached error message. I've...   

                                             no_html  
0  @AppleSupport causing the reply to be disregar...  
1  @105835 Your business means a lot to us. Pleas...  
2  @76328 I really hope you all change but I'm su...  
3  @105836 LiveChat is online at the moment - htt...  
4  @VirginTrains see attached error message. I've...  


In [11]:
pip install emoji


Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m608.4/608.4 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0


In [12]:
import emoji

def convert_emojis(text):
    """Replace each emoji in `text` with its `:name:` text form via `emoji.demojize`."""
    demojized = emoji.demojize(text)
    return demojized

# Demo input ending in two emoji: smiling face with heart-eyes, then fire.
# The literal must contain the real emoji characters (valid UTF-8) for demojize
# to recognise them and produce the ":smiling_face_with_heart-eyes::fire:" output.
sample_emoji_text = "I love NLP 😍🔥"
print("Original:", sample_emoji_text)
print("Converted:", convert_emojis(sample_emoji_text))


Original: I love NLP 😍🔥
Converted: I love NLP :smiling_face_with_heart-eyes::fire:
