In [1]:
import pandas as pd
import string

In [2]:
df = pd.read_csv("Twitter Sentiments.csv")

df = df.drop(columns=['id', 'label'], axis=1)
df.head()

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty
3,#model i love u take with u all the time in ...
4,factsguide: society now #motivation


### Convert a LowerCase

In [3]:
df['clean_text'] = df['tweet'].str.lower()
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,factsguide: society now #motivation,factsguide: society now #motivation


### Remove Punctuations

In [4]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))

In [6]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuations(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so...
1,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in u...
4,factsguide: society now #motivation,factsguide society now motivation


### Remove StopWords

In [7]:
import nltk

In [8]:
from nltk.corpus import stopwords
",".join(stopwords.words('english'))

"a,about,above,after,again,against,ain,all,am,an,and,any,are,aren,aren't,as,at,be,because,been,before,being,below,between,both,but,by,can,couldn,couldn't,d,did,didn,didn't,do,does,doesn,doesn't,doing,don,don't,down,during,each,few,for,from,further,had,hadn,hadn't,has,hasn,hasn't,have,haven,haven't,having,he,he'd,he'll,her,here,hers,herself,he's,him,himself,his,how,i,i'd,if,i'll,i'm,in,into,is,isn,isn't,it,it'd,it'll,it's,its,itself,i've,just,ll,m,ma,me,mightn,mightn't,more,most,mustn,mustn't,my,myself,needn,needn't,no,nor,not,now,o,of,off,on,once,only,or,other,our,ours,ourselves,out,over,own,re,s,same,shan,shan't,she,she'd,she'll,she's,should,shouldn,shouldn't,should've,so,some,such,t,than,that,that'll,the,their,theirs,them,themselves,then,there,these,they,they'd,they'll,they're,they've,this,those,through,to,too,under,until,up,ve,very,was,wasn,wasn't,we,we'd,we'll,we're,were,weren,weren't,we've,what,when,where,which,while,who,whom,why,will,with,won,won't,wouldn,wouldn't,y,you,you'd,you

In [9]:
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [10]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time urð± ðððð...
4,factsguide: society now #motivation,factsguide society motivation


### Remove Frequent Words

In [11]:
from collections import Counter
word_counter = Counter()
for text in df['clean_text']:
    for word in text.split():
        word_counter[word] +=1
word_counter.most_common(10)

[('user', 17473),
 ('love', 2647),
 ('day', 2198),
 ('happy', 1663),
 ('amp', 1582),
 ('im', 1139),
 ('u', 1136),
 ('time', 1110),
 ('life', 1086),
 ('like', 1042)]

In [13]:
FREQUENT_WORDS = set(word for (word, wc) in word_counter.most_common(3))
def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

In [14]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_freq_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


### Remove Rare Words

In [20]:
RARE_WORDS = set(word for (word, wc) in word_counter.most_common()[: -10: -1])
def remove_least_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])
RARE_WORDS

{'airwaves',
 'carnt',
 'chisolm',
 'ibizabringitonmallorcaholidayssummer',
 'isz',
 'mantle',
 'shirley',
 'youuuð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dâ\x9d¤ï¸\x8f',
 'ð\x9f\x99\x8fð\x9f\x8f¼ð\x9f\x8d¹ð\x9f\x98\x8eð\x9f\x8eµ'}

In [19]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_least_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


### Removal Special Chracters

In [22]:
import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', " " , text)
    text = re.sub('\\s+', ' ', text)
    return text

In [23]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


### Remove URLs

In [24]:
text = "https://www.hackersrealm.net/ is the url of the channel hackers Realm"

In [39]:
def remove_url(text):
    return re.sub(r"https?://\S+|www\.\S+", '', text)

In [40]:
remove_url(text)

' is the url of the channel hackers Realm'

### Remove HTMLs

In [42]:
html_example = "<p>This is <b>HTML</b> content with <a href='https://example.com'>links</a> and <br> tags.</p>"

In [47]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

In [46]:
remove_html_tags(html_example)

''

Num2Words

In [3]:
import spacy
import re
from num2words import num2words

nlp = spacy.blank("tr")

def preprocess_text(text):
    # Sayıları kelimeye çevir
    def replace_numbers(match):
        num = int(match.group(0))
        return num2words(num, lang='tr')
    
    # Regex ile sayıları bul ve değiştir
    text = re.sub(r'\d+', replace_numbers, text)
    # Büyük-küçük harf normalizasyonu
    text = text.lower()
    # Tokenizasyon
    doc = nlp(text)
    # Stop words kaldırma (opsiyonel)
    stop_words = ["ve", "ile", "bir", "da", "de"]
    tokens = [token.text for token in doc if token.text not in stop_words]
    return tokens

# Test
text = "Ali 23 yaşında ve Veli 45 yaşında."
print(preprocess_text(text))
# Çıktı: ['ali', 'yirmi', 'üç', 'yaşında', 'veli', 'kırk', 'beş', 'yaşında']

['ali', 'yirmiüç', 'yaşında', 'veli', 'kırkbeş', 'yaşında', '.']
