In [1]:
import pandas as pd
import pyarrow.parquet as pq
import re
import emoji
from nltk.corpus import stopwords
from symspellpy import SymSpell
import pkg_resources
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

In [2]:
# Filesystem path of the English frequency dictionary file located inside the
# installed `symspellpy` package; it is passed to SymSpell.load_dictionary
# by correct_spellings().
DICT_PATH = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")

In [3]:
# Fetch every NLTK resource the preprocessing functions in this notebook use:
#   - 'punkt'                      -> nltk.word_tokenize (lemmatize_text)
#   - 'stopwords'                  -> stopwords.words('english') (remove_stopwords)
#   - 'wordnet'                    -> WordNetLemmatizer (lemmatize_text)
#   - 'averaged_perceptron_tagger' -> nltk.pos_tag (lemmatize_text)
# Packages that are already up to date are reported as such, so the cell is
# safe to re-run.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bora\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Bora\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

---

In [2]:
# Input CSV files: one "musics" file and one "tweets" file for each of the
# control and disorder groups. Paths are relative to the working directory.
ANON_CONTROL_MUSICS = 'data/anon_control_musics.csv'
ANON_CONTROL_TWEETS = 'data/anon_control_tweets.csv'
ANON_DISORDER_MUSICS = 'data/anon_disorder_musics.csv'
ANON_DISORDER_TWEETS = 'data/anon_disorder_tweets.csv'

In [3]:
def _read_csv_without_first_column(csv_path):
    """Load ``csv_path`` with every column typed as ``str`` and drop its first column.

    The first column is presumably a row index written out when the CSV was
    exported (TODO confirm); it is removed by label, exactly as read.
    """
    frame = pd.read_csv(csv_path, dtype=str)
    leading_column = frame.columns[0]
    return frame.drop(leading_column, axis=1)


control_music = _read_csv_without_first_column(ANON_CONTROL_MUSICS)
control_tweets = _read_csv_without_first_column(ANON_CONTROL_TWEETS)
disorder_music = _read_csv_without_first_column(ANON_DISORDER_MUSICS)
disorder_tweets = _read_csv_without_first_column(ANON_DISORDER_TWEETS)

In [4]:
# Persist each loaded frame as parquet, in the same order as it was loaded.
for _frame, _parquet_path in (
    (control_music, 'data/anon_control_musics.parquet'),
    (control_tweets, 'data/anon_control_tweets.parquet'),
    (disorder_music, 'data/anon_disorder_musics.parquet'),
    (disorder_tweets, 'data/anon_disorder_tweets.parquet'),
):
    _frame.to_parquet(_parquet_path)

---

In [4]:
# Parquet inputs: one "musics" file and one "tweets" file for each of the
# control and disorder groups. Paths are relative to the working directory.
ANON_CONTROL_MUSICS = 'data/anon_control_musics.parquet'
ANON_CONTROL_TWEETS = 'data/anon_control_tweets.parquet'
ANON_DISORDER_MUSICS = 'data/anon_disorder_musics.parquet'
ANON_DISORDER_TWEETS = 'data/anon_disorder_tweets.parquet'

In [5]:
# Read each parquet file through pyarrow and convert it to a pandas DataFrame.
control_music, control_tweets, disorder_music, disorder_tweets = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in (
        ANON_CONTROL_MUSICS,
        ANON_CONTROL_TWEETS,
        ANON_DISORDER_MUSICS,
        ANON_DISORDER_TWEETS,
    )
)

In [6]:
# Discard every row that has a missing value in any column. Each frame is
# rebound one statement at a time so the previous object can be released
# before the next frame is processed.
control_music = control_music.dropna(axis=0, how='any')
control_tweets = control_tweets.dropna(axis=0, how='any')
disorder_music = disorder_music.dropna(axis=0, how='any')
disorder_tweets = disorder_tweets.dropna(axis=0, how='any')

In [7]:
# Display the control tweets frame; the rendered output is abbreviated to the
# first and last rows.
control_tweets

Unnamed: 0,user_id,text,disorder
0,54c6d3e322,"My greatest year month so far, since Monday bo...",control
1,54c6d3e322,🙏🙏🙏 https://t.co/t07nMOuV3P,control
2,54c6d3e322,@real_mercyeke Classic women 💧💧,control
3,54c6d3e322,One day can change your whole life. ☝🏾 \n\nWha...,control
4,54c6d3e322,Happy birthday to me!!!! My bone straight na d...,control
...,...,...,...
32399489,ace96f1f11,@GoddessLeah10 @indebt2Leah Good,control
32399490,ace96f1f11,@GoddessLeah10 @indebt2Leah Too,control
32399491,ace96f1f11,@GoddessLeah10 @indebt2Leah Is,control
32399492,ace96f1f11,@GoddessLeah10 @indebt2Leah Unlimited,control


In [8]:
# Draw a small random sample for trying out the preprocessing functions.
# A fixed random_state makes the same 10 rows come back on every run, so the
# before/after comparison further down stays reproducible.
sample = control_tweets.sample(10, random_state=42)

In [13]:
# Display the sampled rows.
sample

Unnamed: 0,user_id,text,disorder
12103244,8de516754d,whats up twitter dot com the funny person is back,control
23591119,4c0a0981dd,@OfficialMonstaX omg cutie,control
13200692,92ff1eed3f,I knew I was washed when I fell asleep in the ...,control
106612,b8af0c69e1,santa bf is on his way to boys love (bimboland...,control
15250249,ffce1165aa,So we gotta fight https://t.co/9wqcKfDsQA,control
2405346,502a353ced,@diamondsigma ugh i’m so sorry at least u don’...,control
23120641,c0cd8ae189,#RIPBrodieLee 😭😭😭 https://t.co/wndeRHzXCy,control
26303054,d57d2791dd,@tetegalway Hello @bts_bighit @BigHitEnt and ...,control
14798059,2b68c6bc0e,@felipeprior 1M DO PRIOR NO TT\n1M DO PRIOR NO...,control
24561657,851d711041,Just vibin rn https://t.co/UGNG1mkTtN,control


In [14]:
def remove_mentions(text):
    """Delete ``@handle`` mentions from ``text``.

    Every ``@`` followed by one or more word characters is removed; whitespace
    around the mention is left as it was. Values that are not ``str`` are
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)

In [15]:
def transform_emojis(text):
    """Replace emoji characters in ``text`` with their textual ``:name:`` codes.

    The conversion is delegated to ``emoji.demojize``. Values that are not
    ``str`` are returned unchanged.
    """
    return emoji.demojize(text) if isinstance(text, str) else text

In [16]:
def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    ``text`` is split on whitespace, every token found in NLTK's English
    stopword list is discarded, and the remaining tokens are re-joined with
    single spaces. Membership is an exact, case-sensitive string comparison.
    Values that are not ``str`` are returned unchanged.

    The stopword list is read once and kept on the function as a frozenset.
    Previously ``stopwords.words('english')`` was re-evaluated for every
    single word and each check was a linear scan of a list.
    """
    if not isinstance(text, str):
        return text

    stop_set = getattr(remove_stopwords, '_english_stopwords', None)
    if stop_set is None:
        stop_set = frozenset(stopwords.words('english'))
        remove_stopwords._english_stopwords = stop_set

    return ' '.join(word for word in text.split() if word not in stop_set)

In [17]:
def normalize_text(text):
    """Return ``text`` converted to lower case; non-``str`` values pass through unchanged."""
    if not isinstance(text, str):
        return text
    lowered = text.lower()
    return lowered

In [18]:
def remove_urls(text):
    """Delete URL-like tokens from ``text``.

    Any run of non-whitespace characters that begins with ``http`` or ``www``
    is removed (this also covers ``https``). Whitespace around the removed
    token is kept. Values that are not ``str`` are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    return url_pattern.sub('', text)

In [19]:
def correct_spellings(text):
    """Spell-correct ``text`` with SymSpell's compound lookup.

    Returns the top suggestion's term, or ``text`` itself when SymSpell gives
    no suggestion. Values that are not ``str`` are returned unchanged, matching
    the other cleaning functions (previously this path fell off the end of the
    function and returned ``None``).

    The SymSpell instance is built and its dictionary loaded from ``DICT_PATH``
    only on first use, then kept on the function. Previously the dictionary
    was re-read from disk on every call.
    """
    if not isinstance(text, str):
        return text

    sym_spell = getattr(correct_spellings, '_sym_spell', None)
    if sym_spell is None:
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        loaded = sym_spell.load_dictionary(DICT_PATH, term_index=0, count_index=1)
        # Only keep the instance when the dictionary was actually loaded, so a
        # failed load is retried on the next call instead of being cached.
        if loaded:
            correct_spellings._sym_spell = sym_spell

    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    return suggestions[0].term if suggestions else text

In [20]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_to_constant_name = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant_name:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return wordnet.NOUN  # default
    

def lemmatize_text(text):
    """Lemmatize every token of ``text`` using its part-of-speech tag.

    The text is tokenized with ``nltk.word_tokenize``, tagged with
    ``nltk.pos_tag``, and each token is lemmatized with the WordNet POS that
    ``get_wordnet_pos`` derives from its tag. The lemmas are joined with
    single spaces. Values that are not ``str`` are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    wnl = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(wnl.lemmatize(token, get_wordnet_pos(tag)))
    return ' '.join(lemmas)

In [24]:
def process_text(df):
    """Return a copy of ``df`` whose ``'text'`` column has been cleaned.

    The cleaning steps run in this order, each applied element-wise to the
    whole column before the next one starts: URL removal, mention removal,
    emoji-to-text conversion, lower-casing, stopword removal, lemmatization.
    The input frame itself is not modified.
    """
    pipeline = (
        remove_urls,
        remove_mentions,
        transform_emojis,
        normalize_text,
        remove_stopwords,
        lemmatize_text,
    )

    processed = df.copy()
    text_column = processed['text']
    for step in pipeline:
        text_column = text_column.apply(step)
    processed['text'] = text_column

    return processed

In [25]:
# Show the raw sample again, immediately before it is processed, for comparison.
sample

Unnamed: 0,user_id,text,disorder
12103244,8de516754d,whats up twitter dot com the funny person is back,control
23591119,4c0a0981dd,@OfficialMonstaX omg cutie,control
13200692,92ff1eed3f,I knew I was washed when I fell asleep in the ...,control
106612,b8af0c69e1,santa bf is on his way to boys love (bimboland...,control
15250249,ffce1165aa,So we gotta fight https://t.co/9wqcKfDsQA,control
2405346,502a353ced,@diamondsigma ugh i’m so sorry at least u don’...,control
23120641,c0cd8ae189,#RIPBrodieLee 😭😭😭 https://t.co/wndeRHzXCy,control
26303054,d57d2791dd,@tetegalway Hello @bts_bighit @BigHitEnt and ...,control
14798059,2b68c6bc0e,@felipeprior 1M DO PRIOR NO TT\n1M DO PRIOR NO...,control
24561657,851d711041,Just vibin rn https://t.co/UGNG1mkTtN,control


In [26]:
# Run the cleaning pipeline on the 10-row sample and display the result.
processed_sample = process_text(sample)
processed_sample

Unnamed: 0,user_id,text,disorder
12103244,8de516754d,whats twitter dot com funny person back,control
23591119,4c0a0981dd,omg cutie,control
13200692,92ff1eed3f,knew wash fell asleep club atlanta .,control
106612,b8af0c69e1,santa bf way boys love ( bimboland ),control
15250249,ffce1165aa,get ta fight,control
2405346,502a353ced,ugh i ’ m sorry least u don ’ t b-boys ask bru...,control
23120641,c0cd8ae189,# ripbrodielee : loudly_crying_face : :loudly_...,control
26303054,d57d2791dd,hello everyone concern . tire always report ma...,control
14798059,2b68c6bc0e,1m prior tt 1m prior tt 1m prior tt 1m prior t...,control
24561657,851d711041,vibin rn,control
