In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/MyDrive/DataMining/Data_mining/src/data

/content/drive/MyDrive/DataMining/Data_mining/src/data


In [56]:
!pip install texthero

In [3]:
!pip install demoji



# Preproccesing the data

## Importing needed libraries

In [3]:
import pandas as pd
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import demoji


demoji.download_codes()

  del sys.path[0]


## Loading data

In [59]:
train_data = pd.read_csv("../../data/train_tweet.csv")
test_data = pd.read_csv("../../data/test_tweets.csv")

## Work on emojis 
Convert emojis to their corresponding text

In [60]:
def convert_emoji(text:str, mode:str = "parse_emojis") -> str:
    """Modes: "parse_emojis", "extract_emojis" """
    #convert string to binary representation
    binary = ' '.join(format(ord(x), 'b') for x in text)

    #convert binary representation to utf8 representation
    listRes = list(binary.split(" "))
    try:
        text_with_emoji = bytes([int(x,2) for x in listRes]).decode('utf-8')
    except UnicodeDecodeError:
        return text
        
    #get all emojis
    dictionary = demoji.findall(text_with_emoji)

    #replace emojis with text representation
    text_without_emoji = text_with_emoji
    emojis = []
    for key in dictionary.keys():
        if key in text_with_emoji: emojis.append(dictionary[key])
        text_with_emoji = text_with_emoji.replace(key, dictionary[key] + " ")        
    if mode == "parse_emojis":
        return text_with_emoji
    elif mode == "extract_emojis":
        return emojis
    
    

In [61]:
train_data["tweet_converted_emojis"] = train_data["tweet"].apply(lambda x: convert_emoji(x, "parse_emojis"))
test_data["tweet_converted_emojis"] = test_data["tweet"].apply(lambda x: convert_emoji(x, "parse_emojis"))

In [62]:
train_data["emojis"] = train_data["tweet"].apply(lambda x: convert_emoji(x, "extract_emojis"))
test_data["emojis"] = test_data["tweet"].apply(lambda x: convert_emoji(x, "extract_emojis"))

## Deal with user mentions


In [63]:
def count_user_mentions(text:str) ->int:
    return text.count("@user")
    

In [64]:
test_data["n_mentions"] = test_data["tweet_converted_emojis"].apply(lambda x: count_user_mentions(x))
train_data["n_mentions"] = train_data["tweet_converted_emojis"].apply(lambda x: count_user_mentions(x))

## Deal with hashtags

In [65]:
def identify_hashtags(text:str) -> list:
    pattern = re.compile(r"#(\w+)")
    return pattern.findall(text)

In [66]:
test_data["hashtags"] = test_data["tweet_converted_emojis"].apply(lambda x: identify_hashtags(x))
train_data["hashtags"] = train_data["tweet_converted_emojis"].apply(lambda x: identify_hashtags(x))

## Punctuation Removal

Create helper function

In [67]:
def remove_punctuation(text:str) -> str:
    return "".join([i for i in text if i not in punctuation])

In [68]:
test_data["without_punctuation"] = test_data["tweet_converted_emojis"].apply(lambda x: remove_punctuation(x))
train_data["without_punctuation"] = train_data["tweet_converted_emojis"].apply(lambda x: remove_punctuation(x))

## Lowering text 

In [69]:
test_data["tweet_lower"] = test_data["without_punctuation"].apply(lambda x: x.lower())
train_data["tweet_lower"] = train_data["without_punctuation"].apply(lambda x: x.lower())

## Tokenization

In [70]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [71]:
def tokenization(text:str) -> list:
    return nltk.word_tokenize(text)

In [72]:
test_data["tweet_token"] = test_data["tweet_lower"].apply(lambda x: tokenization(x))
train_data["tweet_token"] = train_data["tweet_lower"].apply(lambda x: tokenization(x))

## Remove Stopwords

In [73]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
def remove_stopwords(tokens) ->list:
    stopwords_list = stopwords.words("english")
    return [token for token in tokens if token not in stopwords_list]

In [75]:
test_data["clean_token"] = test_data["tweet_token"].apply(lambda x: remove_stopwords(x))
train_data["clean_token"] = train_data["tweet_token"].apply(lambda x: remove_stopwords(x))
test_data["clean_hashtags"] = test_data["hashtags"].apply(lambda x: remove_stopwords(x))
train_data["clean_hashtags"] = train_data["hashtags"].apply(lambda x: remove_stopwords(x))

## Stemming

In [76]:
porter_stemmer = PorterStemmer()

def stemming(text:list) -> list:
    return [porter_stemmer.stem(word) for word in text]


In [77]:
test_data["stemmed_tokens"] = test_data["clean_token"].apply(lambda x: stemming(x))
train_data["stemmed_tokens"] = train_data["clean_token"].apply(lambda x: stemming(x))
test_data["stemmed_hashtags"] = test_data["clean_hashtags"].apply(lambda x: stemming(x))
train_data["stemmed_hashtags"] = train_data["clean_hashtags"].apply(lambda x: stemming(x))

Result does not look great (e.g. movie -> movi)

## Lemmatization

In [78]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [79]:
word_lemmatizer = WordNetLemmatizer()
def lemmatizer(text: list) -> list:
    return [word_lemmatizer.lemmatize(word) for word in text]

In [80]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [81]:
test_data["lemmatized_tokens"] = test_data["clean_token"].apply(lambda x: lemmatizer(x))
train_data["lemmatized_tokens"] = train_data["clean_token"].apply(lambda x: lemmatizer(x))
test_data["lemmatized_hashtags"] = test_data["clean_hashtags"].apply(lambda x: lemmatizer(x))
train_data["lemmatized_hashtags"] = train_data["clean_hashtags"].apply(lambda x: lemmatizer(x))

## Temp Export


In [82]:
#test_data.to_csv("../../data/220510_test_data_preprocessed_temp.csv", sep=";", encoding="utf-8", index=False)
train_data.to_csv("../../data/220510_train_data_preprocessed_temp.csv", sep=";", encoding="utf-8", index=False)

In [279]:
train_data.head()

Unnamed: 0,id,label,tweet,tweet_converted_emojis,emojis,n_mentions,hashtags,without_punctuation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags
0,1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...,[],1,['run'],user when a father is dysfunctional and is so...,user when a father is dysfunctional and is so...,"['user', 'when', 'a', 'father', 'is', 'dysfunc...","['user', 'father', 'dysfunctional', 'selfish',...",['run'],"['user', 'father', 'dysfunct', 'selfish', 'dra...",['run'],"['user', 'father', 'dysfunctional', 'selfish',...",['run']
1,2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...,[],2,"['lyft', 'disapointed', 'getthanked']",user user thanks for lyft credit i cant use ca...,user user thanks for lyft credit i cant use ca...,"['user', 'user', 'thanks', 'for', 'lyft', 'cre...","['user', 'user', 'thanks', 'lyft', 'credit', '...","['lyft', 'disapointed', 'getthanked']","['user', 'user', 'thank', 'lyft', 'credit', 'c...","['lyft', 'disapoint', 'getthank']","['user', 'user', 'thanks', 'lyft', 'credit', '...","['lyft', 'disapointed', 'getthanked']"
2,3,0,bihday your majesty,bihday your majesty,[],0,[],bihday your majesty,bihday your majesty,"['bihday', 'your', 'majesty']","['bihday', 'majesty']",[],"['bihday', 'majesti']",[],"['bihday', 'majesty']",[]
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...,"['sweat droplets', 'mobile phone', 'mouth', 't...",0,['model'],model i love u take with u all the time in u...,model i love u take with u all the time in u...,"['model', 'i', 'love', 'u', 'take', 'with', 'u...","['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model'],"['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model'],"['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model']
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation,[],0,['motivation'],factsguide society now motivation,factsguide society now motivation,"['factsguide', 'society', 'now', 'motivation']","['factsguide', 'society', 'motivation']",['motivation'],"['factsguid', 'societi', 'motiv']",['motiv'],"['factsguide', 'society', 'motivation']",['motivation']


## Split Data

In [17]:
train_data = pd.read_csv("../../data/220510_train_data_preprocessed_temp.csv", sep=';')

In [18]:
train_data, test_data = train_test_split(train_data, test_size=0.2, random_state=17, stratify=train_data.label)
train_data, val_data = train_test_split(train_data, test_size=0.125, random_state=17, stratify=train_data.label)

In [19]:
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# Work in progress

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import math
import numpy as np

def transform_reduce(transformer, reducer, data):
    result = reducer.transform(transformer.transform(data.values.tolist()))
    #result = transformer.transform(data.values.tolist()).toarray() #Only 600 words in vocabulary... reduction not really necessary ..
    return pd.Series(list(result))

def tfidf_transform_data(source_column, target_column, train_data, val_data, test_data):

    #Fit Count to training data:
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x, max_features=8000, preprocessor=lambda x: x)
    vectorizer.fit(train_data[source_column].values.tolist())
    print(len(vectorizer.vocabulary_))

    #Fit SVD to training data
    tfidf_train_data = vectorizer.transform(train_data[source_column].values.tolist())
    try:
        svd = TruncatedSVD(n_components=300)
        svd.fit(tfidf_train_data)
    except ValueError:
        svd = TruncatedSVD(n_components=20)
        svd.fit(tfidf_train_data)

    #Transform Data:
    train_data[target_column] = transform_reduce(vectorizer, svd, train_data[source_column])
    val_data[target_column] = transform_reduce(vectorizer, svd, val_data[source_column])
    test_data[target_column] = transform_reduce(vectorizer, svd, test_data[source_column])

    return train_data, val_data, test_data

In [20]:
train_data, val_data, test_data = tfidf_transform_data("stemmed_tokens", "tfidf_stemmed_tokens", train_data, val_data, test_data)
train_data, val_data, test_data = tfidf_transform_data("stemmed_hashtags", "tfidf_stemmed_hashtags", train_data, val_data, test_data)
train_data, val_data, test_data = tfidf_transform_data("lemmatized_tokens", "tfidf_lemmatized_tokens", train_data, val_data, test_data)
train_data, val_data, test_data = tfidf_transform_data("lemmatized_hashtags", "tfidf_lemmatized_hashtags", train_data, val_data, test_data)
train_data, val_data, test_data = tfidf_transform_data("emojis", "tfidf_emojis", train_data, val_data, test_data)

  "The parameter 'token_pattern' will not be used"


610


  "The parameter 'token_pattern' will not be used"


443


  "The parameter 'token_pattern' will not be used"


610


  "The parameter 'token_pattern' will not be used"


448


  "The parameter 'token_pattern' will not be used"


66


In [56]:
def emb_data(source_column, data, nlp):

    #Fit Count to training data:
    tweets = data[source_column].values.tolist()
    nlp.disable_pipes("parser", "ner", "lemmatizer") #remove pipe we do not need
    embeddings = [sum([word.vector for word in item])/len(item) for item in nlp.pipe(tweets)]
    return pd.Series(embeddings)

In [None]:
!python -m spacy download en_core_web_lg


In [51]:
nlp = spacy.load("en_core_web_lg")


In [59]:
import spacy

test_data["emb"] = emb_data("tweet", test_data, nlp)
val_data["emb"] = emb_data("tweet", val_data, nlp)
train_data["emb"] = emb_data("tweet", train_data, nlp)

In [62]:
len(test_data["emb"][0])


300

#Save Data

In [243]:
test_data.to_csv("../../data/220510_test_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)
train_data.to_csv("../../data/220510_train_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)
val_data.to_csv("../../data/220510_validation_data_preprocessed.csv", sep=";", encoding="utf-8", index=False)

In [60]:
import pickle
pickle.dump(test_data, open("../../data/220510_test_data_preprocessed.pickle", "wb"))
pickle.dump(train_data, open("../../data/220510_train_data_preprocessed.pickle", "wb"))
pickle.dump(val_data, open("../../data/220510_validation_data_preprocessed.pickle", "wb"))