In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# project folder used by the following cells is reachable from the file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/DataMining/Data_mining/src/data

/content/drive/MyDrive/DataMining/Data_mining/src/data


# Preprocessing the data

## Importing needed libraries

In [None]:
import pandas as pd
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

## Loading data

In [None]:
# Read the raw training and test tweets (paths are relative to the working
# directory selected above).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../../data/train_tweet.csv", "../../data/test_tweets.csv")
)

Take a first look at the data

In [None]:
# Show the first five rows of the training frame.
train_data.head(n=5)

In [None]:
# Show the first five rows of the test frame.
test_data.head(n=5)

## Deal with user mentions


In [None]:
def count_user_mentions(text:str) ->int:
    """Count the occurrences of the literal substring ``@user`` in ``text``.

    Args:
        text: The tweet text to inspect.

    Returns:
        The number of non-overlapping ``@user`` matches (0 if there are none).
    """
    mention_marker = "@user"
    n_mentions = text.count(mention_marker)
    return n_mentions
    

In [None]:
# Add the per-tweet mention count to both frames (test first, then train).
for _frame in (test_data, train_data):
    _frame["n_mentions"] = _frame["tweet"].apply(count_user_mentions)

test_data.head()

## Deal with hashtags

In [None]:
def identify_hashtags(text:str) -> list:
    """Extract every hashtag from ``text``.

    A hashtag is a ``#`` immediately followed by one or more word characters.

    Args:
        text: The tweet text to scan.

    Returns:
        The hashtag bodies (without the leading ``#``) in order of appearance;
        an empty list when the text contains none.
    """
    return re.findall(r"#(\w+)", text)

In [None]:
# Store the list of hashtags found in each tweet (test first, then train).
for _frame in (test_data, train_data):
    _frame["hashtags"] = _frame["tweet"].apply(identify_hashtags)

test_data.head()

## Punctuation Removal

Create helper function

In [None]:
def remove_punctuation(text:str) -> str:
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Only the ASCII punctuation characters listed in ``string.punctuation`` are
    dropped; all other characters (letters, digits, whitespace, non-ASCII
    symbols) are kept in their original order.
    """
    drop_table = str.maketrans("", "", punctuation)
    return text.translate(drop_table)

In [None]:
# Strip punctuation from the raw tweet text in both frames (test first, then train).
for _frame in (test_data, train_data):
    _frame["without_punctuation"] = _frame["tweet"].apply(remove_punctuation)

test_data.head()

In [None]:
# Show the first ten rows of the training frame.
train_data.head(n=10)

## Lowercasing text

In [None]:
# Lower-case the punctuation-free text in both frames (test first, then train).
for _frame in (test_data, train_data):
    _frame["tweet_lower"] = _frame["without_punctuation"].apply(str.lower)

train_data.head()

## Tokenization

In [None]:
# Download the NLTK 'punkt' resource ahead of the nltk.word_tokenize calls
# made in this section.
nltk.download('punkt')

In [None]:
def tokenization(text:str) -> list:
    """Split ``text`` into tokens using ``nltk.word_tokenize``.

    Args:
        text: The text to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` produces for ``text``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

In [None]:
# Tokenize the lower-cased text in both frames (test first, then train).
for _frame in (test_data, train_data):
    _frame["tweet_token"] = _frame["tweet_lower"].apply(tokenization)

test_data.head()

## Remove Stopwords

In [None]:
# Download the NLTK 'stopwords' corpus that stopwords.words("english") reads.
nltk.download('stopwords')

In [None]:
# Cache of stop-word sets keyed by language, filled lazily on first use so the
# NLTK corpus is read once instead of once per row.
_STOPWORDS_BY_LANGUAGE = {}


def _load_stopwords(language: str = "english") -> frozenset:
    """Return the NLTK stop words for ``language`` as a cached frozenset."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def remove_stopwords(tokens, stopword_set=None) ->list:
    """Drop stop words from a sequence of tokens.

    Args:
        tokens: Iterable of string tokens.
        stopword_set: Optional collection of words to remove. When omitted,
            the NLTK English stop-word list is used.

    Returns:
        A new list with the tokens that are not stop words, in their original
        order. Matching is exact and case-sensitive.
    """
    if stopword_set is None:
        # Set membership is O(1) per token; the former per-call list made every
        # lookup a linear scan and reloaded the corpus for every row.
        stopword_set = _load_stopwords("english")
    return [token for token in tokens if token not in stopword_set]

In [None]:
# Remove stop words from the tweet tokens and from the hashtag lists.
# Column pairs are (source, target); for each pair the test frame is
# processed before the train frame.
for _source, _target in (("tweet_token", "clean_token"), ("hashtags", "clean_hashtags")):
    for _frame in (test_data, train_data):
        _frame[_target] = _frame[_source].apply(remove_stopwords)

test_data.head()

## Stemming

In [None]:
# Single stemmer instance shared by every call to stemming().
porter_stemmer = PorterStemmer()

def stemming(text:list) -> list:
    """Stem every word of ``text`` with the shared Porter stemmer.

    Args:
        text: Iterable of word tokens.

    Returns:
        A list with the stemmed form of each word, in the same order.
    """
    return list(map(porter_stemmer.stem, text))


In [None]:
# Stem the cleaned tokens and the cleaned hashtags.
# Column pairs are (source, target); for each pair the test frame is
# processed before the train frame.
for _source, _target in (("clean_token", "stemmed_tokens"), ("clean_hashtags", "stemmed_hashtags")):
    for _frame in (test_data, train_data):
        _frame[_target] = _frame[_source].apply(stemming)

test_data.head()

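The stemmed result does not look great: the Porter stemmer cuts words down to stems that are not real words (e.g. movie -> movi). Lemmatization is therefore tried as an alternative in the next section.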

## Lemmatization

In [None]:
# Download the NLTK 'wordnet' resource ahead of using WordNetLemmatizer.
nltk.download("wordnet")

In [None]:
# Single lemmatizer instance shared by every call to lemmatizer().
word_lemmatizer = WordNetLemmatizer()

def lemmatizer(text: list) -> list:
    """Lemmatize every word of ``text`` with the shared WordNet lemmatizer.

    Each word is passed to ``lemmatize`` with its default arguments.

    Args:
        text: Iterable of word tokens.

    Returns:
        A list with the lemma of each word, in the same order.
    """
    return list(map(word_lemmatizer.lemmatize, text))

In [None]:
# Download the NLTK 'omw-1.4' resource before the lemmatizer is applied.
# NOTE(review): presumably required by WordNetLemmatizer on the installed NLTK
# version — confirm.
nltk.download('omw-1.4')

In [None]:
# Lemmatize the cleaned tokens and the cleaned hashtags.
# Column pairs are (source, target); for each pair the test frame is
# processed before the train frame.
for _source, _target in (("clean_token", "lemmatized_tokens"), ("clean_hashtags", "lemmatized_hashtags")):
    for _frame in (test_data, train_data):
        _frame[_target] = _frame[_source].apply(lemmatizer)

test_data.head()

## Export


In [None]:
# Persist both preprocessed frames as ';'-separated UTF-8 CSV files without
# the index column (test first, then train).
for _frame, _csv_path in (
    (test_data, "../../data/220502_test_data_preprocessed.csv"),
    (train_data, "../../data/220502_train_data_preprocessed.csv"),
):
    _frame.to_csv(_csv_path, sep=";", encoding="utf-8", index=False)

## TF-IDF

In [None]:
!pip install texthero

Collecting texthero
  Downloading texthero-1.1.0-py3-none-any.whl (24 kB)
Collecting unidecode>=1.1.1
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 5.6 MB/s 
[?25hCollecting nltk>=3.3
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 21.6 MB/s 
Collecting regex>=2021.8.3
  Downloading regex-2022.4.24-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 55.6 MB/s 
Installing collected packages: regex, unidecode, nltk, texthero
  Attempting uninstall: regex
    Found existing installation: regex 2019.12.20
    Uninstalling regex-2019.12.20:
      Successfully uninstalled regex-2019.12.20
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.7 regex-2022.4.24 texthero-1.1.0 unidecode-1.3.4


In [None]:
import ast

import texthero as hero
import pandas as pd

train_data = pd.read_csv("../../data/220502_train_data_preprocessed.csv", sep=';')

# The token and hashtag columns were written to CSV as the text form of Python
# lists (e.g. "['love', 'time']"), so read_csv returns plain strings. Parse
# them back into real lists; otherwise TF-IDF is computed on the string form
# (brackets, quotes and commas included) instead of on the tokens.
# The isinstance guard keeps the cell safe to re-run on already-parsed data.
for _list_column in ("stemmed_tokens", "stemmed_hashtags", "lemmatized_tokens", "lemmatized_hashtags"):
    train_data[_list_column] = train_data[_list_column].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

# TF-IDF over the stemmed tokens (vocabulary capped at 8000 terms), then
# reduced to 500 components with PCA.
train_data["tfidf_stemmed_tokens"] = hero.tfidf(train_data["stemmed_tokens"], max_features=8000)
train_data["tfidf_stemmed_tokens"] = hero.pca(train_data["tfidf_stemmed_tokens"], n_components=500)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# TF-IDF over the stemmed hashtags (vocabulary capped at 8000 terms), then
# reduced to 200 components with PCA.
train_data["tfidf_stemmed_hashtags"] = hero.tfidf(
    train_data["stemmed_hashtags"],
    max_features=8000,
)
train_data["tfidf_stemmed_hashtags"] = hero.pca(
    train_data["tfidf_stemmed_hashtags"],
    n_components=200,
)

In [None]:
# TF-IDF over the lemmatized tokens (vocabulary capped at 8000 terms), then
# reduced to 500 components with PCA.
train_data["tfidf_lemmatized_tokens"] = hero.tfidf(
    train_data["lemmatized_tokens"],
    max_features=8000,
)
train_data["tfidf_lemmatized_tokens"] = hero.pca(
    train_data["tfidf_lemmatized_tokens"],
    n_components=500,
)

In [None]:
# TF-IDF over the lemmatized hashtags (vocabulary capped at 8000 terms), then
# reduced to 200 components with PCA.
train_data["tfidf_lemmatized_hashtags"] = hero.tfidf(
    train_data["lemmatized_hashtags"],
    max_features=8000,
)
train_data["tfidf_lemmatized_hashtags"] = hero.pca(
    train_data["tfidf_lemmatized_hashtags"],
    n_components=200,
)

## Split Data

In [None]:
# Feature frame: every column except the target.
X = train_data.loc[:, train_data.columns != "label"]

# Target: the "label" column itself. The previous `train_data.loc[train_data.label]`
# used the label *values* as row labels and therefore returned rows of the
# frame rather than the target column.
Y = train_data["label"]

In [None]:
# Hold out 20% of the rows as the test split (fixed seed for reproducibility).
# NOTE(review): the full `train_data` frame (label column included) is split
# here rather than `X`; presumably deliberate so that the CSV files exported
# afterwards still contain the label — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    Y,
    test_size=0.2,
    random_state=55,
)

In [None]:
# Carve the validation split out of the remaining training rows:
# 12.5% of the 80% left after the test split, i.e. 10% of all rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.125,
    random_state=55,
)

In [None]:
# Write the three splits as ';'-separated UTF-8 CSV files without the index
# column (order: test, train, validation).
for _split, _csv_path in (
    (X_test, "../../data/220505_test_data_preprocessed.csv"),
    (X_train, "../../data/220505_train_data_preprocessed.csv"),
    (X_val, "../../data/220505_validation_data_preprocessed.csv"),
):
    _split.to_csv(_csv_path, sep=";", encoding="utf-8", index=False)

# Work in progress

## Work on emojis 
Convert emojis to their corresponding text

In [None]:
text = "#model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  "
test = b'#model   i love u take with u all the time in ur\xc3\xb0\xc2\x9f\xc2\x93\xc2\xb1!!! \xc3\xb0\xc2\x9f\xc2\x98\xc2\x99\xc3\xb0\xc2\x9f\xc2\x98\xc2\x8e\xc3\xb0\xc2\x9f\xc2\x91\xc2\x84\xc3\xb0\xc2\x9f\xc2\x91\xc2\x85\xc3\xb0\xc2\x9f\xc2\x92\xc2\xa6\xc3\xb0\xc2\x9f\xc2\x92\xc2\xa6\xc3\xb0\xc2\x9f\xc2\x92\xc2\xa6  '

# These bytes are double-encoded UTF-8: the original emoji bytes (F0 9F ..)
# were read as Latin-1 characters and then encoded as UTF-8 a second time.
# A single decode therefore only yields mojibake, not emojis.
mojibake = test.decode('utf-8')

# Undo the extra round trip: Latin-1 maps every code point below 256 back to
# the byte with the same value, which restores the original UTF-8 bytes; those
# are then decoded into the real emoji characters.
repaired = mojibake.encode('latin-1').decode('utf-8')
repaired

In [None]:
test = "#model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  "
# NOTE(review): `emot` is not imported anywhere in the visible part of this
# notebook, so this line raises NameError on a fresh kernel — add the import
# (and confirm that `emot.emoji` is the right call for the installed version).
print(emot.emoji(test))

# Print the input string unchanged for comparison with the result above.
print(test)