In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/DataMining/Data_mining/src/data

/content/drive/MyDrive/DataMining/Data_mining/src/data


# Preproccesing the data

## Importing needed libraries

In [72]:
import pandas as pd
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import demoji


demoji.download_codes()

  demoji.download_codes()


## Loading data

In [2]:
train_data = pd.read_csv("../../data/train_tweet.csv")
test_data = pd.read_csv("../../data/test_tweets.csv")

Take a first look at the data

In [4]:
train_data.head(10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [None]:
test_data.head()

## Work on emojis 
Convert emojis to their corresponding text

In [49]:
def convert_emoji(text:str) -> str:
    #convert string to binary representation
    binary = ' '.join(format(ord(x), 'b') for x in text)

    #convert binary representation to utf8 representation
    listRes = list(binary.split(" "))
    try:
        text_with_emoji = bytes([int(x,2) for x in listRes]).decode('utf-8')
    except UnicodeDecodeError:
        return text
        
    #get all emojis
    dictionary = demoji.findall(text_with_emoji)

    #replace emojis with text representation
    for key in dictionary.keys():
        text_with_emoji = text_with_emoji.replace(key, dictionary[key] + " ")

    
    return text_with_emoji
    
    

In [50]:
train_data["tweet_converted_emojis"] = train_data["tweet"].apply(lambda x: convert_emoji(x))

In [51]:
train_data.head(10)

Unnamed: 0,id,label,tweet,tweet_converted_emojis
0,1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams.hus...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...,@user @user welcome here ! i'm it's so #gr...


## Deal with user mentions


In [52]:
def count_user_mentions(text:str) ->int:
    return text.count("@user")
    

In [53]:
test_data["n_mentions"] = test_data["tweet"].apply(lambda x: count_user_mentions(x))
train_data["n_mentions"] = train_data["tweet"].apply(lambda x: count_user_mentions(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions
0,31963,#studiolife #aislife #requires #passion #dedic...,0
1,31964,@user #white #supremacists want everyone to s...,1
2,31965,safe ways to heal your #acne!! #altwaystohe...,0
3,31966,is the hp and the cursed child book up for res...,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0


## Deal with hashtags

In [54]:
def identify_hashtags(text:str) -> list:
    pattern = re.compile(r"#(\w+)")
    return pattern.findall(text)

In [55]:
test_data["hashtags"] = test_data["tweet"].apply(lambda x: identify_hashtags(x))
train_data["hashtags"] = train_data["tweet"].apply(lambda x: identify_hashtags(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]"


## Punctuation Removal

Create helper function

In [56]:
def remove_punctuation(text:str) -> str:
    return "".join([i for i in text if i not in punctuation])

In [57]:
test_data["without_punctuation"] = test_data["tweet"].apply(lambda x: remove_punctuation(x))
train_data["without_punctuation"] = train_data["tweet"].apply(lambda x: remove_punctuation(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_punctuation
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...


In [58]:
train_data.head(10)

Unnamed: 0,id,label,tweet,tweet_converted_emojis,n_mentions,hashtags,without_punctuation
0,1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...,1,[run],user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...,2,"[lyft, disapointed, getthanked]",user user thanks for lyft credit i cant use ca...
2,3,0,bihday your majesty,bihday your majesty,0,[],bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...,0,[model],model i love u take with u all the time in u...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation,0,[motivation],factsguide society now motivation
5,6,0,[2/2] huge fan fare and big talking before the...,[2/2] huge fan fare and big talking before the...,0,[allshowandnogo],22 huge fan fare and big talking before they l...
6,7,0,@user camping tomorrow @user @user @user @use...,@user camping tomorrow @user @user @user @use...,8,[],user camping tomorrow user user user user use...
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams.hus...,0,"[school, exams, hate, imagine, actorslife, rev...",the next school year is the year for examsð¯...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won!!! love the land!!! #allin #cavs #champ...,0,"[allin, cavs, champions, cleveland, clevelandc...",we won love the land allin cavs champions clev...
9,10,0,@user @user welcome here ! i'm it's so #gr...,@user @user welcome here ! i'm it's so #gr...,2,[gr8],user user welcome here im its so gr8


## Lowering text 

In [59]:
test_data["tweet_lower"] = test_data["without_punctuation"].apply(lambda x: x.lower())
train_data["tweet_lower"] = train_data["without_punctuation"].apply(lambda x: x.lower())
train_data.head()

Unnamed: 0,id,label,tweet,tweet_converted_emojis,n_mentions,hashtags,without_punctuation,tweet_lower
0,1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...,1,[run],user when a father is dysfunctional and is so...,user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...,2,"[lyft, disapointed, getthanked]",user user thanks for lyft credit i cant use ca...,user user thanks for lyft credit i cant use ca...
2,3,0,bihday your majesty,bihday your majesty,0,[],bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...,0,[model],model i love u take with u all the time in u...,model i love u take with u all the time in u...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation,0,[motivation],factsguide society now motivation,factsguide society now motivation


## Tokenization

In [60]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aaronsteiner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
def tokenization(text:str) -> list:
    return nltk.word_tokenize(text)

In [62]:
test_data["tweet_token"] = test_data["tweet_lower"].apply(lambda x: tokenization(x))
train_data["tweet_token"] = train_data["tweet_lower"].apply(lambda x: tokenization(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_punctuation,tweet_lower,tweet_token
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[user, white, supremacists, want, everyone, to..."
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[safe, ways, to, heal, your, acne, altwaystohe..."
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[is, the, hp, and, the, cursed, child, book, u..."
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[3rd, bihday, to, my, amazing, hilarious, neph..."


## Remove Stopwords

In [63]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aaronsteiner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [64]:
def remove_stopwords(tokens) ->list:
    stopwords_list = stopwords.words("english")
    return [token for token in tokens if token not in stopwords_list]

In [65]:
test_data["clean_token"] = test_data["tweet_token"].apply(lambda x: remove_stopwords(x))
train_data["clean_token"] = train_data["tweet_token"].apply(lambda x: remove_stopwords(x))
test_data["clean_hashtags"] = test_data["hashtags"].apply(lambda x: remove_stopwords(x))
train_data["clean_hashtags"] = train_data["hashtags"].apply(lambda x: remove_stopwords(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_punctuation,tweet_lower,tweet_token,clean_token,clean_hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[studiolife, aislife, requires, passion, dedic...","[studiolife, aislife, requires, passion, dedic...","[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[user, white, supremacists, want, everyone, to...","[user, white, supremacists, want, everyone, se...","[white, supremacists, birdsâ, movie]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[safe, ways, to, heal, your, acne, altwaystohe...","[safe, ways, heal, acne, altwaystoheal, health...","[acne, altwaystoheal, healthy, healing]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[is, the, hp, and, the, cursed, child, book, u...","[hp, cursed, child, book, reservations, alread...","[harrypotter, pottermore, favorite]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[3rd, bihday, to, my, amazing, hilarious, neph...","[3rd, bihday, amazing, hilarious, nephew, eli,...","[bihday, nephew]"


## Stemming

In [66]:
porter_stemmer = PorterStemmer()

def stemming(text:list) -> list:
    return [porter_stemmer.stem(word) for word in text]


In [67]:
test_data["stemmed_tokens"] = test_data["clean_token"].apply(lambda x: stemming(x))
train_data["stemmed_tokens"] = train_data["clean_token"].apply(lambda x: stemming(x))
test_data["stemmed_hashtags"] = test_data["clean_hashtags"].apply(lambda x: stemming(x))
train_data["stemmed_hashtags"] = train_data["clean_hashtags"].apply(lambda x: stemming(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_punctuation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[studiolife, aislife, requires, passion, dedic...","[studiolife, aislife, requires, passion, dedic...","[studiolife, aislife, requires, passion, dedic...","[studiolif, aislif, requir, passion, dedic, wi...","[studiolif, aislif, requir, passion, dedic, wi..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[user, white, supremacists, want, everyone, to...","[user, white, supremacists, want, everyone, se...","[white, supremacists, birdsâ, movie]","[user, white, supremacist, want, everyon, see,...","[white, supremacist, birdsâ, movi]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[safe, ways, to, heal, your, acne, altwaystohe...","[safe, ways, heal, acne, altwaystoheal, health...","[acne, altwaystoheal, healthy, healing]","[safe, way, heal, acn, altwaystoh, healthi, heal]","[acn, altwaystoh, healthi, heal]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[is, the, hp, and, the, cursed, child, book, u...","[hp, cursed, child, book, reservations, alread...","[harrypotter, pottermore, favorite]","[hp, curs, child, book, reserv, alreadi, ye, ð...","[harrypott, pottermor, favorit]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[3rd, bihday, to, my, amazing, hilarious, neph...","[3rd, bihday, amazing, hilarious, nephew, eli,...","[bihday, nephew]","[3rd, bihday, amaz, hilari, nephew, eli, ahmir...","[bihday, nephew]"


Result does not look great (e.g. movie -> movi)

## Lemmatization

In [68]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aaronsteiner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [69]:
word_lemmatizer = WordNetLemmatizer()
def lemmatizer(text: list) -> list:
    return [word_lemmatizer.lemmatize(word) for word in text]

In [70]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aaronsteiner/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [71]:
test_data["lemmatized_tokens"] = test_data["clean_token"].apply(lambda x: lemmatizer(x))
train_data["lemmatized_tokens"] = train_data["clean_token"].apply(lambda x: lemmatizer(x))
test_data["lemmatized_hashtags"] = test_data["clean_hashtags"].apply(lambda x: lemmatizer(x))
train_data["lemmatized_hashtags"] = train_data["clean_hashtags"].apply(lambda x: lemmatizer(x))
test_data.head()

Unnamed: 0,id,tweet,n_mentions,hashtags,without_punctuation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags
0,31963,#studiolife #aislife #requires #passion #dedic...,0,"[studiolife, aislife, requires, passion, dedic...",studiolife aislife requires passion dedication...,studiolife aislife requires passion dedication...,"[studiolife, aislife, requires, passion, dedic...","[studiolife, aislife, requires, passion, dedic...","[studiolife, aislife, requires, passion, dedic...","[studiolif, aislif, requir, passion, dedic, wi...","[studiolif, aislif, requir, passion, dedic, wi...","[studiolife, aislife, requires, passion, dedic...","[studiolife, aislife, requires, passion, dedic..."
1,31964,@user #white #supremacists want everyone to s...,1,"[white, supremacists, birdsâ, movie]",user white supremacists want everyone to see ...,user white supremacists want everyone to see ...,"[user, white, supremacists, want, everyone, to...","[user, white, supremacists, want, everyone, se...","[white, supremacists, birdsâ, movie]","[user, white, supremacist, want, everyon, see,...","[white, supremacist, birdsâ, movi]","[user, white, supremacist, want, everyone, see...","[white, supremacist, birdsâ, movie]"
2,31965,safe ways to heal your #acne!! #altwaystohe...,0,"[acne, altwaystoheal, healthy, healing]",safe ways to heal your acne altwaystoheal h...,safe ways to heal your acne altwaystoheal h...,"[safe, ways, to, heal, your, acne, altwaystohe...","[safe, ways, heal, acne, altwaystoheal, health...","[acne, altwaystoheal, healthy, healing]","[safe, way, heal, acn, altwaystoh, healthi, heal]","[acn, altwaystoh, healthi, heal]","[safe, way, heal, acne, altwaystoheal, healthy...","[acne, altwaystoheal, healthy, healing]"
3,31966,is the hp and the cursed child book up for res...,0,"[harrypotter, pottermore, favorite]",is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"[is, the, hp, and, the, cursed, child, book, u...","[hp, cursed, child, book, reservations, alread...","[harrypotter, pottermore, favorite]","[hp, curs, child, book, reserv, alreadi, ye, ð...","[harrypott, pottermor, favorit]","[hp, cursed, child, book, reservation, already...","[harrypotter, pottermore, favorite]"
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0,"[bihday, nephew]",3rd bihday to my amazing hilarious nephew el...,3rd bihday to my amazing hilarious nephew el...,"[3rd, bihday, to, my, amazing, hilarious, neph...","[3rd, bihday, amazing, hilarious, nephew, eli,...","[bihday, nephew]","[3rd, bihday, amaz, hilari, nephew, eli, ahmir...","[bihday, nephew]","[3rd, bihday, amazing, hilarious, nephew, eli,...","[bihday, nephew]"


## Tfidf

In [77]:
tf = TfidfVectorizer()

X_vec = tf.fit(train_data[["lemmatized_tokens", "lemmatized_hashtags"]])
train_data = X_vec.transform(train_data[["lemmatized_tokens", "lemmatized_hashtags"]])

In [81]:
print(type(train_data))

<class 'scipy.sparse._csr.csr_matrix'>


## Split Data

In [None]:
X = train_data.loc[:, train_data.columns != "label"]
Y = train_data.loc[train_data.label]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_data, Y, test_size=0.2, random_state=55)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=55)

In [None]:
X_test.to_csv("../../data/220505_test_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)
X_train.to_csv("../../data/220505_train_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)
X_val.to_csv("../../data/220505_validation_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)

# Work in progress

In [None]:
import texthero as hero
import pandas as pd

train_data = pd.read_csv("../../data/220502_train_data_preprocessed.csv", sep=';')
train_data["tfidf_stemmed_tokens"] = (hero.tfidf(train_data["stemmed_tokens"], max_features=8000))
train_data["tfidf_stemmed_tokens"] = (hero.pca(train_data["tfidf_stemmed_tokens"], n_components=500))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
train_data["tfidf_stemmed_hashtags"] = (hero.tfidf(train_data["stemmed_hashtags"], max_features=8000))
train_data["tfidf_stemmed_hashtags"] = (hero.pca(train_data["tfidf_stemmed_hashtags"], n_components=200))

In [None]:
train_data["tfidf_lemmatized_tokens"] = (hero.tfidf(train_data["lemmatized_tokens"], max_features=8000))
train_data["tfidf_lemmatized_tokens"] = (hero.pca(train_data["tfidf_lemmatized_tokens"], n_components=500))

In [None]:
train_data["tfidf_lemmatized_hashtags"] = (hero.tfidf(train_data["lemmatized_hashtags"], max_features=8000))
train_data["tfidf_lemmatized_hashtags"] = (hero.pca(train_data["tfidf_lemmatized_hashtags"], n_components=200))