# Preproccesing the data

## Importing needed libraries

In [1]:
import pandas as pd
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

## Loading data

In [91]:
train_data = pd.read_csv("../../data/train_tweet.csv")
test_data = pd.read_csv("../../data/test_tweets.csv")

Take a first look at the data

In [92]:
train_data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [93]:
test_data.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


## Deal with user mentions


In [94]:
def count_user_mentions(text:str) ->int:
    return text.count("@user")
    

In [95]:
test_data["n_mentions"] = test_data["tweet"].apply(lambda x: count_user_mentions(x))
train_data["n_mentions"] = train_data["tweet"].apply(lambda x: count_user_mentions(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions
0,31963,#studiolife #aislife #requires #passion #dedic...,0
1,31964,@user #white #supremacists want everyone to s...,1
2,31965,safe ways to heal your #acne!! #altwaystohe...,0
3,31966,is the hp and the cursed child book up for res...,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0


## Deal with hashtags

In [96]:
def identify_hashtags(text:str) -> list:
    pattern = re.compile(r"#(\w+)")
    return pattern.findall(text)

In [97]:
test_data["hashtags"] = test_data["tweet"].apply(lambda x: identify_hashtags(x))
train_data["hashtags"] = train_data["tweet"].apply(lambda x: identify_hashtags(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]"


## Punctuation Removal

Create helper function

In [98]:
def remove_punctioation(text:str) -> str:
    return "".join([i for i in text if i not in punctuation])

In [99]:
test_data["without_puctioation"] = test_data["tweet"].apply(lambda x: remove_punctioation(x))
train_data["without_puctioation"] = train_data["tweet"].apply(lambda x: remove_punctioation(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...


In [100]:
train_data.head(10)

Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation
0,1,0,@user when a father is dysfunctional and is s...,1,[run],user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...,2,"[lyft, disapointed, getthanked]",user user thanks for lyft credit i cant use ca...
2,3,0,bihday your majesty,0,[],bihday your majesty
3,4,0,#model i love u take with u all the time in ...,0,[model],model i love u take with u all the time in u...
4,5,0,factsguide: society now #motivation,0,[motivation],factsguide society now motivation
5,6,0,[2/2] huge fan fare and big talking before the...,0,[allshowandnogo],22 huge fan fare and big talking before they l...
6,7,0,@user camping tomorrow @user @user @user @use...,8,[],user camping tomorrow user user user user use...
7,8,0,the next school year is the year for exams.ð...,0,"[school, exams, hate, imagine, actorslife, rev...",the next school year is the year for examsð¯...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,0,"[allin, cavs, champions, cleveland, clevelandc...",we won love the land allin cavs champions clev...
9,10,0,@user @user welcome here ! i'm it's so #gr...,2,[gr8],user user welcome here im its so gr8


## Lowering text 

In [102]:
test_data["tweet_lower"] = test_data["without_puctioation"].apply(lambda x: x.lower())
train_data["tweet_lower"] = train_data["without_puctioation"].apply(lambda x: x.lower())
train_data.head()

Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation,tweet_lower
0,1,0,@user when a father is dysfunctional and is s...,1,[run],user when a father is dysfunctional and is so...,user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...,2,"[lyft, disapointed, getthanked]",user user thanks for lyft credit i cant use ca...,user user thanks for lyft credit i cant use ca...
2,3,0,bihday your majesty,0,[],bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,0,[model],model i love u take with u all the time in u...,model i love u take with u all the time in u...
4,5,0,factsguide: society now #motivation,0,[motivation],factsguide society now motivation,factsguide society now motivation


## Tokenization

In [120]:
def tokenization(text:str) -> list:
    return nltk.word_tokenize(text)

In [123]:
test_data["tweet_token"] = test_data["tweet_lower"].apply(lambda x: tokenization(x))
train_data["tweet_token"] = train_data["tweet_lower"].apply(lambda x: tokenization(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[user, white, supremacists, want, everyone, to..."
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[safe, ways, to, heal, your, acne, altwaystohe..."
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[is, the, hp, and, the, cursed, child, book, u..."
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[3rd, bihday, to, my, amazing, hilarious, neph..."


## Remove Stopwords

In [131]:
def remove_stopwords(tokens) ->list:
    stopwords_list = stopwords.words("english")
    return [token for token in tokens if token not in stopwords_list]

In [133]:
test_data["clean_token"] = test_data["tweet_token"].apply(lambda x: remove_stopwords(x))
train_data["clean_token"] = train_data["tweet_token"].apply(lambda x: remove_stopwords(x))
test_data["clean_hashtags"] = test_data["hashtags"].apply(lambda x: remove_stopwords(x))
train_data["clean_hashtags"] = train_data["hashtags"].apply(lambda x: remove_stopwords(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[aislife, passion, willpower, find]","[aislife, passion, willpower, find]","[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[white, want, to, the, â, movie, and, why]","[white, want, â, movie]","[white, supremacists, birdsâ, movie]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[ways, heal, acne, healthy]","[ways, heal, acne, healthy]","[acne, altwaystoheal, healthy, healing]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[and, the, cursed, book, for, already, yes, if...","[cursed, book, already, yes, harrypotter, favo...","[harrypotter, pottermore, favorite]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[bihday, my, hilarious, eli, uncle, loves, and]","[bihday, hilarious, eli, uncle, loves]","[bihday, nephew]"


## Stemming

In [134]:
porter_stemmer = PorterStemmer()

def stemming(text:list) -> list:
    return [porter_stemmer.stem(word) for word in text]


In [135]:
test_data["stemmed_tokens"] = test_data["clean_token"].apply(lambda x: stemming(x))
train_data["stemmed_tokens"] = train_data["clean_token"].apply(lambda x: stemming(x))
test_data["stemmed_hashtags"] = test_data["clean_hashtags"].apply(lambda x: stemming(x))
train_data["stemmed_hashtags"] = train_data["clean_hashtags"].apply(lambda x: stemming(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[aislife, passion, willpower, find]","[aislife, passion, willpower, find]","[studiolife, aislife, requires, passion, dedic...","[aislif, passion, willpow, find]","[studiolif, aislif, requir, passion, dedic, wi..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[white, want, to, the, â, movie, and, why]","[white, want, â, movie]","[white, supremacists, birdsâ, movie]","[white, want, â, movi]","[white, supremacist, birdsâ, movi]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[ways, heal, acne, healthy]","[ways, heal, acne, healthy]","[acne, altwaystoheal, healthy, healing]","[way, heal, acn, healthi]","[acn, altwaystoh, healthi, heal]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[and, the, cursed, book, for, already, yes, if...","[cursed, book, already, yes, harrypotter, favo...","[harrypotter, pottermore, favorite]","[curs, book, alreadi, ye, harrypott, favorit]","[harrypott, pottermor, favorit]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[bihday, my, hilarious, eli, uncle, loves, and]","[bihday, hilarious, eli, uncle, loves]","[bihday, nephew]","[bihday, hilari, eli, uncl, love]","[bihday, nephew]"


Result does not look great (e.g. movie -> movi)

## Lemmatization

In [137]:
word_lemmatizer = WordNetLemmatizer()
def lemmatizer(text: list) -> list:
    return [word_lemmatizer.lemmatize(word) for word in text]

In [140]:
test_data["lemmatized_tokens"] = test_data["clean_token"].apply(lambda x: lemmatizer(x))
train_data["lemmatized_tokens"] = train_data["clean_token"].apply(lambda x: lemmatizer(x))
test_data["lemmatized_hashtags"] = test_data["clean_hashtags"].apply(lambda x: lemmatizer(x))
train_data["lemmatized_hashtags"] = train_data["clean_hashtags"].apply(lambda x: lemmatizer(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[aislife, passion, willpower, find]","[aislife, passion, willpower, find]","[studiolife, aislife, requires, passion, dedic...","[aislif, passion, willpow, find]","[studiolif, aislif, requir, passion, dedic, wi...","[aislife, passion, willpower, find]","[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[white, want, to, the, â, movie, and, why]","[white, want, â, movie]","[white, supremacists, birdsâ, movie]","[white, want, â, movi]","[white, supremacist, birdsâ, movi]","[white, want, â, movie]","[white, supremacist, birdsâ, movie]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[ways, heal, acne, healthy]","[ways, heal, acne, healthy]","[acne, altwaystoheal, healthy, healing]","[way, heal, acn, healthi]","[acn, altwaystoh, healthi, heal]","[way, heal, acne, healthy]","[acne, altwaystoheal, healthy, healing]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[and, the, cursed, book, for, already, yes, if...","[cursed, book, already, yes, harrypotter, favo...","[harrypotter, pottermore, favorite]","[curs, book, alreadi, ye, harrypott, favorit]","[harrypott, pottermor, favorit]","[cursed, book, already, yes, harrypotter, favo...","[harrypotter, pottermore, favorite]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[bihday, my, hilarious, eli, uncle, loves, and]","[bihday, hilarious, eli, uncle, loves]","[bihday, nephew]","[bihday, hilari, eli, uncl, love]","[bihday, nephew]","[bihday, hilarious, eli, uncle, love]","[bihday, nephew]"


## Export


In [14]:
test_data.to_csv("../../data/220502_test_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)
train_data.to_csv("../../data/220502_train_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)

NameError: name 'test_data' is not defined

## Tfidf

In [2]:
import texthero as hero

train_data = pd.read_csv("../../data/220502_train_data_preprocessed.csv", sep=';')
train_data.head()
train_data["tfidf_stemmed_tokens"] = (hero.tfidf(train_data["stemmed_tokens"], max_features=8000))
train_data["tfidf_stemmed_tokens"] = (hero.tsne(train_data["tfidf_stemmed_tokens"]))

In [3]:
train_data["tfidf_stemmed_hashtags"] = (hero.tfidf(train_data["stemmed_hashtags"], max_features=8000))
train_data["tfidf_stemmed_hashtags"] = (hero.tsne(train_data["tfidf_stemmed_hashtags"]))

In [4]:
train_data["tfidf_lemmatized_tokens"] = (hero.tfidf(train_data["lemmatized_tokens"], max_features=8000))
train_data["tfidf_lemmatized_tokens"] = (hero.tsne(train_data["tfidf_lemmatized_tokens"]))

In [5]:
train_data["tfidf_lemmatized_hashtags"] = (hero.tfidf(train_data["lemmatized_hashtags"], max_features=8000))
train_data["tfidf_lemmatized_hashtags"] = (hero.tsne(train_data["tfidf_lemmatized_hashtags"]))

In [6]:
train_data


Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags,tfidf_stemmed_tokens,tfidf_stemmed_hashtags,tfidf_lemmatized_tokens,tfidf_lemmatized_hashtags
0,1,0,@user when a father is dysfunctional and is s...,1,['run'],user when a father is dysfunctional and is so...,user when a father is dysfunctional and is so...,"['when', 'father', 'dysfunctional', 'is', 'sel...","['father', 'dysfunctional', 'selfish', 'drags'...",['run'],"['father', 'dysfunct', 'selfish', 'drag', 'kid...",['run'],"['father', 'dysfunctional', 'selfish', 'drag',...",['run'],"[46.213226318359375, 40.86911392211914]","[14.551663398742676, -24.269515991210938]","[-78.26954650878906, 28.642547607421875]","[9.827388763427734, -1.2510194778442383]"
1,2,0,@user @user thanks for #lyft credit i can't us...,2,"['lyft', 'disapointed', 'getthanked']",user user thanks for lyft credit i cant use ca...,user user thanks for lyft credit i cant use ca...,"['user', 'for', 'credit', 'cant', 'cause', 'do...","['user', 'credit', 'cant', 'cause', 'dont', 'w...","['lyft', 'disapointed', 'getthanked']","['user', 'credit', 'cant', 'caus', 'dont', 'wh...","['lyft', 'disapoint', 'getthank']","['user', 'credit', 'cant', 'cause', 'dont', 'w...","['lyft', 'disapointed', 'getthanked']","[-55.80070495605469, -38.61825942993164]","[2.428388833999634, -6.956750869750977]","[-23.937767028808594, -82.70685577392578]","[-2.348551034927368, -3.923227071762085]"
2,3,0,bihday your majesty,0,[],bihday your majesty,bihday your majesty,['your'],[],[],[],[],[],[],"[-26.814523696899414, -47.92416763305664]","[-18.723024368286133, 16.274951934814453]","[-16.838232040405273, -55.85787582397461]","[-0.12042795121669769, 11.407605171203613]"
3,4,0,#model i love u take with u all the time in ...,0,['model'],model i love u take with u all the time in u...,model i love u take with u all the time in u...,"['i', 'with', 'u', 'all', 'time', 'urð\x9f\x93...","['u', 'time', 'urð\x9f\x93±', 'ð\x9f\x92¦ð\x9f...",['model'],"['u', 'time', 'urð\x9f\x93±', 'ð\x9f\x92¦ð\x9f...",['model'],"['u', 'time', 'urð\x9f\x93±', 'ð\x9f\x92¦ð\x9f...",['model'],"[18.343618392944336, 52.06433868408203]","[-44.3891716003418, -0.5640219449996948]","[21.397464752197266, -56.25738525390625]","[-2.5464980602264404, 29.163206100463867]"
4,5,0,factsguide: society now #motivation,0,['motivation'],factsguide society now motivation,factsguide society now motivation,"['society', 'motivation']","['society', 'motivation']",['motivation'],"['societi', 'motiv']",['motiv'],"['society', 'motivation']",['motivation'],"[18.323801040649414, -18.519189834594727]","[-35.41141128540039, 38.93525314331055]","[11.912168502807617, 15.544480323791504]","[22.646465301513672, 20.2937068939209]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...,1,[],ate user isz that youuuðððððð...,ate user isz that youuuðððððð...,"['user', 'that']",['user'],[],['user'],[],['user'],[],"[58.34431838989258, 15.119406700134277]","[-18.723024368286133, 16.274951934814453]","[52.27455520629883, -32.16963577270508]","[-0.12042795121669769, 11.407605171203613]"
31958,31959,0,to see nina turner on the airwaves trying to...,0,"['shame', 'imwithher']",to see nina turner on the airwaves trying to...,to see nina turner on the airwaves trying to...,"['see', 'turner', 'trying', 'wrap', 'in', 'the...","['see', 'turner', 'trying', 'wrap', 'mantle', ...","['shame', 'imwithher']","['see', 'turner', 'tri', 'wrap', 'mantl', 'her...","['shame', 'imwithh']","['see', 'turner', 'trying', 'wrap', 'mantle', ...","['shame', 'imwithher']","[-76.02751922607422, 11.04662799835205]","[5.823639392852783, -36.403419494628906]","[22.713640213012695, -4.954850196838379]","[9.027178764343262, 5.0646653175354]"
31959,31960,0,listening to sad songs on a monday morning otw...,0,[],listening to sad songs on a monday morning otw...,listening to sad songs on a monday morning otw...,"['to', 'songs', 'a', 'morning', 'to', 'is']","['songs', 'morning']",[],"['song', 'morn']",[],"['song', 'morning']",[],"[-5.372385501861572, -38.51631164550781]","[-18.723024368286133, 16.274951934814453]","[-8.667880058288574, 29.219118118286133]","[-0.12042795121669769, 11.407605171203613]"
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...",1,"['sikh', 'temple', 'calgary', 'wso']",user sikh temple vandalised in in calgary wso ...,user sikh temple vandalised in in calgary wso ...,"['sikh', 'vandalised', 'in', 'wso', 'act']","['sikh', 'vandalised', 'wso', 'act']","['sikh', 'temple', 'calgary', 'wso']","['sikh', 'vandalis', 'wso', 'act']","['sikh', 'templ', 'calgari', 'wso']","['sikh', 'vandalised', 'wso', 'act']","['sikh', 'temple', 'calgary', 'wso']","[-34.529258728027344, -22.818838119506836]","[-34.69069290161133, -22.710052490234375]","[-23.177236557006836, -36.8807258605957]","[-24.578514099121094, -8.35556411743164]"


## Split Data

In [11]:
X = train_data.loc[:, train_data.columns != "label"]
Y = train_data.loc[train_data.label]
X


Unnamed: 0,id,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags,tfidf_stemmed_tokens,tfidf_stemmed_hashtags,tfidf_lemmatized_tokens,tfidf_lemmatized_hashtags
0,1,@user when a father is dysfunctional and is s...,1,['run'],user when a father is dysfunctional and is so...,user when a father is dysfunctional and is so...,"['when', 'father', 'dysfunctional', 'is', 'sel...","['father', 'dysfunctional', 'selfish', 'drags'...",['run'],"['father', 'dysfunct', 'selfish', 'drag', 'kid...",['run'],"['father', 'dysfunctional', 'selfish', 'drag',...",['run'],"[46.213226318359375, 40.86911392211914]","[14.551663398742676, -24.269515991210938]","[-78.26954650878906, 28.642547607421875]","[9.827388763427734, -1.2510194778442383]"
1,2,@user @user thanks for #lyft credit i can't us...,2,"['lyft', 'disapointed', 'getthanked']",user user thanks for lyft credit i cant use ca...,user user thanks for lyft credit i cant use ca...,"['user', 'for', 'credit', 'cant', 'cause', 'do...","['user', 'credit', 'cant', 'cause', 'dont', 'w...","['lyft', 'disapointed', 'getthanked']","['user', 'credit', 'cant', 'caus', 'dont', 'wh...","['lyft', 'disapoint', 'getthank']","['user', 'credit', 'cant', 'cause', 'dont', 'w...","['lyft', 'disapointed', 'getthanked']","[-55.80070495605469, -38.61825942993164]","[2.428388833999634, -6.956750869750977]","[-23.937767028808594, -82.70685577392578]","[-2.348551034927368, -3.923227071762085]"
2,3,bihday your majesty,0,[],bihday your majesty,bihday your majesty,['your'],[],[],[],[],[],[],"[-26.814523696899414, -47.92416763305664]","[-18.723024368286133, 16.274951934814453]","[-16.838232040405273, -55.85787582397461]","[-0.12042795121669769, 11.407605171203613]"
3,4,#model i love u take with u all the time in ...,0,['model'],model i love u take with u all the time in u...,model i love u take with u all the time in u...,"['i', 'with', 'u', 'all', 'time', 'urð\x9f\x93...","['u', 'time', 'urð\x9f\x93±', 'ð\x9f\x92¦ð\x9f...",['model'],"['u', 'time', 'urð\x9f\x93±', 'ð\x9f\x92¦ð\x9f...",['model'],"['u', 'time', 'urð\x9f\x93±', 'ð\x9f\x92¦ð\x9f...",['model'],"[18.343618392944336, 52.06433868408203]","[-44.3891716003418, -0.5640219449996948]","[21.397464752197266, -56.25738525390625]","[-2.5464980602264404, 29.163206100463867]"
4,5,factsguide: society now #motivation,0,['motivation'],factsguide society now motivation,factsguide society now motivation,"['society', 'motivation']","['society', 'motivation']",['motivation'],"['societi', 'motiv']",['motiv'],"['society', 'motivation']",['motivation'],"[18.323801040649414, -18.519189834594727]","[-35.41141128540039, 38.93525314331055]","[11.912168502807617, 15.544480323791504]","[22.646465301513672, 20.2937068939209]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31957,31958,ate @user isz that youuu?ðððððð...,1,[],ate user isz that youuuðððððð...,ate user isz that youuuðððððð...,"['user', 'that']",['user'],[],['user'],[],['user'],[],"[58.34431838989258, 15.119406700134277]","[-18.723024368286133, 16.274951934814453]","[52.27455520629883, -32.16963577270508]","[-0.12042795121669769, 11.407605171203613]"
31958,31959,to see nina turner on the airwaves trying to...,0,"['shame', 'imwithher']",to see nina turner on the airwaves trying to...,to see nina turner on the airwaves trying to...,"['see', 'turner', 'trying', 'wrap', 'in', 'the...","['see', 'turner', 'trying', 'wrap', 'mantle', ...","['shame', 'imwithher']","['see', 'turner', 'tri', 'wrap', 'mantl', 'her...","['shame', 'imwithh']","['see', 'turner', 'trying', 'wrap', 'mantle', ...","['shame', 'imwithher']","[-76.02751922607422, 11.04662799835205]","[5.823639392852783, -36.403419494628906]","[22.713640213012695, -4.954850196838379]","[9.027178764343262, 5.0646653175354]"
31959,31960,listening to sad songs on a monday morning otw...,0,[],listening to sad songs on a monday morning otw...,listening to sad songs on a monday morning otw...,"['to', 'songs', 'a', 'morning', 'to', 'is']","['songs', 'morning']",[],"['song', 'morn']",[],"['song', 'morning']",[],"[-5.372385501861572, -38.51631164550781]","[-18.723024368286133, 16.274951934814453]","[-8.667880058288574, 29.219118118286133]","[-0.12042795121669769, 11.407605171203613]"
31960,31961,"@user #sikh #temple vandalised in in #calgary,...",1,"['sikh', 'temple', 'calgary', 'wso']",user sikh temple vandalised in in calgary wso ...,user sikh temple vandalised in in calgary wso ...,"['sikh', 'vandalised', 'in', 'wso', 'act']","['sikh', 'vandalised', 'wso', 'act']","['sikh', 'temple', 'calgary', 'wso']","['sikh', 'vandalis', 'wso', 'act']","['sikh', 'templ', 'calgari', 'wso']","['sikh', 'vandalised', 'wso', 'act']","['sikh', 'temple', 'calgary', 'wso']","[-34.529258728027344, -22.818838119506836]","[-34.69069290161133, -22.710052490234375]","[-23.177236557006836, -36.8807258605957]","[-24.578514099121094, -8.35556411743164]"


In [12]:
X_train, X_test, y_train, y_test = train_test_split(train_data, Y, test_size=0.2, random_state=55)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=55)

In [14]:
X_train["label"] = y_train.label
X_test["label"] = y_test.label
X_val["label"] = y_val.label

  X_train["label"] = y_train.label


ValueError: cannot reindex on an axis with duplicate labels

In [16]:
X_test

Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags,tfidf_stemmed_tokens,tfidf_stemmed_hashtags,tfidf_lemmatized_tokens,tfidf_lemmatized_hashtags
24089,24090,0,best #lawofattraction #resources for #healing!...,0,"['lawofattraction', 'resources', 'healing', 'a...",best lawofattraction resources for healing ...,best lawofattraction resources for healing ...,"['lawofattraction', 'for', 'altwaystoheal', 'is']","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resources', 'healing', 'a...","['lawofattract', 'altwaystoh']","['lawofattract', 'resourc', 'heal', 'altwaysto...","['lawofattraction', 'altwaystoheal']","['lawofattraction', 'resource', 'healing', 'al...","[-54.797821044921875, 19.326093673706055]","[50.279502868652344, -0.368119478225708]","[30.39127540588379, -49.107627868652344]","[8.619098663330078, -19.02434539794922]"
15263,15264,0,remembering to focus on the simplest happy mom...,0,"['blogger', 'blog', 'life']",remembering to focus on the simplest happy mom...,remembering to focus on the simplest happy mom...,"['to', 'on', 'simplest', 'moments', 'life', 'b...","['simplest', 'moments', 'life', 'blogger', 'li...","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","['simplest', 'moment', 'life', 'blogger', 'life']","['blogger', 'blog', 'life']","[12.769611358642578, 38.40631866455078]","[-0.20247356593608856, -12.449955940246582]","[-25.304161071777344, -11.912176132202148]","[-5.459761619567871, -11.88219928741455]"
19309,19310,0,when you get as happy as your boyfriend to be ...,0,['silvia'],when you get as happy as your boyfriend to be ...,when you get as happy as your boyfriend to be ...,"['you', 'as', 'as', 'boyfriend', 'be', 'with',...","['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"['boyfriend', 'car']",['silvia'],"[15.18979263305664, -9.596672058105469]","[10.67345905303955, -4.716570854187012]","[80.47900390625, 3.222534656524658]","[-2.534292459487915, -6.132740020751953]"
27243,27244,0,why do you always try to make me happy? i don...,0,"['love', 'devotion']",why do you always try to make me happy i dont...,why do you always try to make me happy i dont...,"['do', 'always', 'to', 'me', 'i', 'know', 'to'...","['always', 'know', 'love']","['love', 'devotion']","['alway', 'know', 'love']","['love', 'devot']","['always', 'know', 'love']","['love', 'devotion']","[-46.13848876953125, -18.032955169677734]","[11.31847858428955, -16.98657989501953]","[40.34811019897461, -24.527305603027344]","[8.227179527282715, -13.818502426147461]"
6632,6633,0,omg is finally here!!! #ps4 #farcry4 #gtav #un...,0,"['ps4', 'farcry4', 'gtav', 'unchaed4']",omg is finally here ps4 farcry4 gtav unchaed4,omg is finally here ps4 farcry4 gtav unchaed4,"['is', 'here', 'farcry4', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","['farcry4', 'unchaed4']","['ps4', 'farcry4', 'gtav', 'unchaed4']","[3.866750478744507, -5.09706449508667]","[21.51004409790039, 15.360187530517578]","[8.554491996765137, -20.749971389770508]","[-6.128843307495117, -11.832077980041504]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9039,9040,0,#cat #kitty select knowledge:,0,"['cat', 'kitty']",cat kitty select knowledge,cat kitty select knowledge,"['kitty', 'knowledge']","['kitty', 'knowledge']","['cat', 'kitty']","['kitti', 'knowledg']","['cat', 'kitti']","['kitty', 'knowledge']","['cat', 'kitty']","[-48.76616668701172, 1.9193642139434814]","[8.789885520935059, 21.543291091918945]","[17.61883544921875, -40.230289459228516]","[3.1132328510284424, -5.80885124206543]"
10615,10616,0,@user shays first camp @user #clawosseum #c...,2,"['clawosseum', 'clawsout']",user shays first camp user clawosseum claws...,user shays first camp user clawosseum claws...,"['shays', 'camp', 'clawosseum']","['shays', 'camp', 'clawosseum']","['clawosseum', 'clawsout']","['shay', 'camp', 'clawosseum']","['clawosseum', 'clawsout']","['shay', 'camp', 'clawosseum']","['clawosseum', 'clawsout']","[22.903425216674805, -5.662053108215332]","[-0.957913339138031, 13.914173126220703]","[-13.517252922058105, 6.790152072906494]","[1.1501950025558472, 0.8506528735160828]"
12034,12035,0,@user @user yes yes yes!,2,[],user user yes yes yes,user user yes yes yes,"['user', 'yes']","['user', 'yes']",[],"['user', 'ye']",[],"['user', 'yes']",[],"[-11.248979568481445, 19.911727905273438]","[-18.723024368286133, 16.274951934814453]","[12.155923843383789, -3.0542140007019043]","[-0.12042795121669769, 11.407605171203613]"
10800,10801,0,i'm so and #grateful now that - #affirmations,0,"['grateful', 'affirmations']",im so and grateful now that affirmations,im so and grateful now that affirmations,"['so', 'grateful', 'that']",['grateful'],"['grateful', 'affirmations']",['grate'],"['grate', 'affirm']",['grateful'],"['grateful', 'affirmation']","[16.90058708190918, -54.99842834472656]","[-27.510984420776367, -36.945865631103516]","[56.24428939819336, 16.980188369750977]","[-10.509261131286621, 5.6819562911987305]"


In [None]:
X_train.to_csv("../../data/220505_test_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)
X_test.to_csv("../../data/220505_train_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)
X_val.to_csv("../../data/220505_validation_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)

# Work in progress

## Work on emojis 
Convert emojis to their corresponding text

In [63]:
text = "#model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  "
test = b'#model   i love u take with u all the time in ur\xc3\xb0\xc2\x9f\xc2\x93\xc2\xb1!!! \xc3\xb0\xc2\x9f\xc2\x98\xc2\x99\xc3\xb0\xc2\x9f\xc2\x98\xc2\x8e\xc3\xb0\xc2\x9f\xc2\x91\xc2\x84\xc3\xb0\xc2\x9f\xc2\x91\xc2\x85\xc3\xb0\xc2\x9f\xc2\x92\xc2\xa6\xc3\xb0\xc2\x9f\xc2\x92\xc2\xa6\xc3\xb0\xc2\x9f\xc2\x92\xc2\xa6  '

test.decode('utf-8')

  tokens_by_line = make_tokens_by_line(lines)


'#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  '

In [32]:
test = "#model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  "
print(emot.emoji(test))

print(test)

  tokens_by_line = make_tokens_by_line(lines)


AttributeError: module 'emot' has no attribute 'emoji'