# Preproccesing the data

## Importing needed libraries

In [112]:
import pandas as pd
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

## Loading data

In [91]:
train_data = pd.read_csv("../../data/train_tweet.csv")
test_data = pd.read_csv("../../data/test_tweets.csv")

Take a first look at the data

In [92]:
train_data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [93]:
test_data.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


## Deal with user mentions


In [94]:
def count_user_mentions(text:str) ->int:
    return text.count("@user")
    

In [95]:
test_data["n_mentions"] = test_data["tweet"].apply(lambda x: count_user_mentions(x))
train_data["n_mentions"] = train_data["tweet"].apply(lambda x: count_user_mentions(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions
0,31963,#studiolife #aislife #requires #passion #dedic...,0
1,31964,@user #white #supremacists want everyone to s...,1
2,31965,safe ways to heal your #acne!! #altwaystohe...,0
3,31966,is the hp and the cursed child book up for res...,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0


## Deal with hashtags

In [96]:
def identify_hashtags(text:str) -> list:
    pattern = re.compile(r"#(\w+)")
    return pattern.findall(text)

In [97]:
test_data["hashtags"] = test_data["tweet"].apply(lambda x: identify_hashtags(x))
train_data["hashtags"] = train_data["tweet"].apply(lambda x: identify_hashtags(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]"


## Punctuation Removal

Create helper function

In [98]:
def remove_punctioation(text:str) -> str:
    return "".join([i for i in text if i not in punctuation])

In [99]:
test_data["without_puctioation"] = test_data["tweet"].apply(lambda x: remove_punctioation(x))
train_data["without_puctioation"] = train_data["tweet"].apply(lambda x: remove_punctioation(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...


In [100]:
train_data.head(10)

Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation
0,1,0,@user when a father is dysfunctional and is s...,1,[run],user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...,2,"[lyft, disapointed, getthanked]",user user thanks for lyft credit i cant use ca...
2,3,0,bihday your majesty,0,[],bihday your majesty
3,4,0,#model i love u take with u all the time in ...,0,[model],model i love u take with u all the time in u...
4,5,0,factsguide: society now #motivation,0,[motivation],factsguide society now motivation
5,6,0,[2/2] huge fan fare and big talking before the...,0,[allshowandnogo],22 huge fan fare and big talking before they l...
6,7,0,@user camping tomorrow @user @user @user @use...,8,[],user camping tomorrow user user user user use...
7,8,0,the next school year is the year for exams.ð...,0,"[school, exams, hate, imagine, actorslife, rev...",the next school year is the year for examsð¯...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,0,"[allin, cavs, champions, cleveland, clevelandc...",we won love the land allin cavs champions clev...
9,10,0,@user @user welcome here ! i'm it's so #gr...,2,[gr8],user user welcome here im its so gr8


## Lowering text 

In [102]:
test_data["tweet_lower"] = test_data["without_puctioation"].apply(lambda x: x.lower())
train_data["tweet_lower"] = train_data["without_puctioation"].apply(lambda x: x.lower())
train_data.head()

Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation,tweet_lower
0,1,0,@user when a father is dysfunctional and is s...,1,[run],user when a father is dysfunctional and is so...,user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...,2,"[lyft, disapointed, getthanked]",user user thanks for lyft credit i cant use ca...,user user thanks for lyft credit i cant use ca...
2,3,0,bihday your majesty,0,[],bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,0,[model],model i love u take with u all the time in u...,model i love u take with u all the time in u...
4,5,0,factsguide: society now #motivation,0,[motivation],factsguide society now motivation,factsguide society now motivation


## Tokenization

In [120]:
def tokenization(text:str) -> list:
    return nltk.word_tokenize(text)

In [123]:
test_data["tweet_token"] = test_data["tweet_lower"].apply(lambda x: tokenization(x))
train_data["tweet_token"] = train_data["tweet_lower"].apply(lambda x: tokenization(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[user, white, supremacists, want, everyone, to..."
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[safe, ways, to, heal, your, acne, altwaystohe..."
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[is, the, hp, and, the, cursed, child, book, u..."
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[3rd, bihday, to, my, amazing, hilarious, neph..."


## Remove Stopwords

In [131]:
def remove_stopwords(tokens) ->list:
    stopwords_list = stopwords.words("english")
    return [token for token in tokens if token not in stopwords_list]

In [133]:
test_data["clean_token"] = test_data["tweet_token"].apply(lambda x: remove_stopwords(x))
train_data["clean_token"] = train_data["tweet_token"].apply(lambda x: remove_stopwords(x))
test_data["clean_hashtags"] = test_data["hashtags"].apply(lambda x: remove_stopwords(x))
train_data["clean_hashtags"] = train_data["hashtags"].apply(lambda x: remove_stopwords(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[aislife, passion, willpower, find]","[aislife, passion, willpower, find]","[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[white, want, to, the, â, movie, and, why]","[white, want, â, movie]","[white, supremacists, birdsâ, movie]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[ways, heal, acne, healthy]","[ways, heal, acne, healthy]","[acne, altwaystoheal, healthy, healing]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[and, the, cursed, book, for, already, yes, if...","[cursed, book, already, yes, harrypotter, favo...","[harrypotter, pottermore, favorite]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[bihday, my, hilarious, eli, uncle, loves, and]","[bihday, hilarious, eli, uncle, loves]","[bihday, nephew]"


## Stemming

In [134]:
porter_stemmer = PorterStemmer()

def stemming(text:list) -> list:
    return [porter_stemmer.stem(word) for word in text]


In [135]:
test_data["stemmed_tokens"] = test_data["clean_token"].apply(lambda x: stemming(x))
train_data["stemmed_tokens"] = train_data["clean_token"].apply(lambda x: stemming(x))
test_data["stemmed_hashtags"] = test_data["clean_hashtags"].apply(lambda x: stemming(x))
train_data["stemmed_hashtags"] = train_data["clean_hashtags"].apply(lambda x: stemming(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[aislife, passion, willpower, find]","[aislife, passion, willpower, find]","[studiolife, aislife, requires, passion, dedic...","[aislif, passion, willpow, find]","[studiolif, aislif, requir, passion, dedic, wi..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[white, want, to, the, â, movie, and, why]","[white, want, â, movie]","[white, supremacists, birdsâ, movie]","[white, want, â, movi]","[white, supremacist, birdsâ, movi]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[ways, heal, acne, healthy]","[ways, heal, acne, healthy]","[acne, altwaystoheal, healthy, healing]","[way, heal, acn, healthi]","[acn, altwaystoh, healthi, heal]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[and, the, cursed, book, for, already, yes, if...","[cursed, book, already, yes, harrypotter, favo...","[harrypotter, pottermore, favorite]","[curs, book, alreadi, ye, harrypott, favorit]","[harrypott, pottermor, favorit]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[bihday, my, hilarious, eli, uncle, loves, and]","[bihday, hilarious, eli, uncle, loves]","[bihday, nephew]","[bihday, hilari, eli, uncl, love]","[bihday, nephew]"


Result does not look great (e.g. movie -> movi)

## Lemmatization

In [137]:
word_lemmatizer = WordNetLemmatizer()
def lemmatizer(text: list) -> list:
    return [word_lemmatizer.lemmatize(word) for word in text]

In [140]:
test_data["lemmatized_tokens"] = test_data["clean_token"].apply(lambda x: lemmatizer(x))
train_data["lemmatized_tokens"] = train_data["clean_token"].apply(lambda x: lemmatizer(x))
test_data["lemmatized_hashtags"] = test_data["clean_hashtags"].apply(lambda x: lemmatizer(x))
train_data["lemmatized_hashtags"] = train_data["clean_hashtags"].apply(lambda x: lemmatizer(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[aislife, passion, willpower, find]","[aislife, passion, willpower, find]","[studiolife, aislife, requires, passion, dedic...","[aislif, passion, willpow, find]","[studiolif, aislif, requir, passion, dedic, wi...","[aislife, passion, willpower, find]","[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[white, want, to, the, â, movie, and, why]","[white, want, â, movie]","[white, supremacists, birdsâ, movie]","[white, want, â, movi]","[white, supremacist, birdsâ, movi]","[white, want, â, movie]","[white, supremacist, birdsâ, movie]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[ways, heal, acne, healthy]","[ways, heal, acne, healthy]","[acne, altwaystoheal, healthy, healing]","[way, heal, acn, healthi]","[acn, altwaystoh, healthi, heal]","[way, heal, acne, healthy]","[acne, altwaystoheal, healthy, healing]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[and, the, cursed, book, for, already, yes, if...","[cursed, book, already, yes, harrypotter, favo...","[harrypotter, pottermore, favorite]","[curs, book, alreadi, ye, harrypott, favorit]","[harrypott, pottermor, favorit]","[cursed, book, already, yes, harrypotter, favo...","[harrypotter, pottermore, favorite]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[bihday, my, hilarious, eli, uncle, loves, and]","[bihday, hilarious, eli, uncle, loves]","[bihday, nephew]","[bihday, hilari, eli, uncl, love]","[bihday, nephew]","[bihday, hilarious, eli, uncle, love]","[bihday, nephew]"


## Export


In [145]:
test_data.to_csv("../../data/220502_test_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)
train_data.to_csv("../../data/220502_train_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)

# Work in progress

## Work on emojis 
Convert emojis to their corresponding text

In [63]:
text = "#model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  "
test = b'#model   i love u take with u all the time in ur\xc3\xb0\xc2\x9f\xc2\x93\xc2\xb1!!! \xc3\xb0\xc2\x9f\xc2\x98\xc2\x99\xc3\xb0\xc2\x9f\xc2\x98\xc2\x8e\xc3\xb0\xc2\x9f\xc2\x91\xc2\x84\xc3\xb0\xc2\x9f\xc2\x91\xc2\x85\xc3\xb0\xc2\x9f\xc2\x92\xc2\xa6\xc3\xb0\xc2\x9f\xc2\x92\xc2\xa6\xc3\xb0\xc2\x9f\xc2\x92\xc2\xa6  '

test.decode('utf-8')

  tokens_by_line = make_tokens_by_line(lines)


'#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  '

In [32]:
test = "#model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  "
print(emot.emoji(test))

print(test)

  tokens_by_line = make_tokens_by_line(lines)


AttributeError: module 'emot' has no attribute 'emoji'