In [None]:
import pandas as pd
import numpy as np 

### Load the data

In [None]:
# Read the training set, the test set and the sample submission file.
df_train, df_test, df_target = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Count the missing values in each column of the training frame.
df_train.isna().sum()


In [None]:
# Count the missing values in each column of the test frame.
df_test.isna().sum()


In [None]:
# Count the missing values in each column of the sample-submission frame.
df_target.isna().sum()


In [None]:
# Training frame: replace missing keywords with the most frequent keyword,
# then drop every row that still has a missing value in any column.
df_train["keyword"] = df_train["keyword"].fillna(df_train["keyword"].mode()[0])
df_train = df_train.dropna()

# Test frame: same treatment, using the test frame's own most frequent keyword.
df_test["keyword"] = df_test["keyword"].fillna(df_test["keyword"].mode()[0])
df_test = df_test.dropna()

In [None]:

# Display the training frame after the missing-value handling.
df_train

### Text cleaning

In [None]:
# Libraries and packages for text (pre-)processing 
import string
import re
import nltk

In [None]:
# Start the cleaned column from a lower-cased copy of the raw tweet text.
df_train["clean_text"] = df_train["text"].apply(str.lower)

In [None]:
!pip install contractions

In [None]:
import contractions

In [None]:
# Run contractions.fix on every lower-cased tweet.
df_train["clean_text"] = df_train["clean_text"].apply(contractions.fix)

In [None]:
# Display the cleaned-text column as it stands after contraction handling.
df_train["clean_text"]

In [None]:
# Print the tweet with index label 67 before and after the cleaning so far.
for column in ("text", "clean_text"):
    print(df_train[column][67])

In [None]:
def remove_URL(text):
    """
        Strip web links (starting with http://, https:// or www.) from a string
    """
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub("", text)



def remove_html(text):
    """
        Strip HTML tags and character entities (named, decimal or hex) from a string
    """
    return re.sub(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", "", text)


def remove_non_ascii(text):
    """
        Keep only the characters whose code point is in the ASCII range (0-127)
    """
    return "".join(char for char in text if ord(char) <= 0x7f)

def remove_punct(text):
    """
        Delete every character listed in string.punctuation
    """
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

In [None]:
# Run the cleaning helpers one after another, in this fixed order:
# URLs, HTML, non-ASCII characters, punctuation.
for cleaner in (remove_URL, remove_html, remove_non_ascii, remove_punct):
    df_train["clean_text"] = df_train["clean_text"].apply(cleaner)

### Text preprocessing

In [None]:
%time
# Tokenizing the tweet base texts.
from nltk.tokenize import word_tokenize

df_train['tokenized'] = df_train['clean_text'].apply(word_tokenize)
df_train.head()

In [None]:
# Filter the English stop words out of every token list.
nltk.download("stopwords")
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
df_train['stopwords_removed'] = df_train['tokenized'].apply(
    lambda tokens: [token for token in tokens if token not in stop]
)
df_train.head()

In [None]:
from nltk.stem import SnowballStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import PorterStemmer


def porter_stemmer(text):
    """
        Return the PorterStemmer stem of every token in a list of tokens
    """
    porter = nltk.PorterStemmer()
    return [porter.stem(token) for token in text]

def lancaster_stemmer(text):
    """
        Return the LancasterStemmer stem of every token in a list of tokens
    """
    lancaster = nltk.LancasterStemmer()
    return [lancaster.stem(token) for token in text]

def snowball_stemmer(text):
    """
        Return the English SnowballStemmer stem of every token in a list of tokens
    """
    snowball = nltk.SnowballStemmer("english")
    return [snowball.stem(token) for token in text]




In [None]:
%time 

df_train['porter_stemmer'] = df_train['stopwords_removed'].apply(lambda x: porter_stemmer(x))

df_train['lancaster_stemmer'] = df_train['stopwords_removed'].apply(lambda x: lancaster_stemmer(x))

df_train['snowball_stemmer'] = df_train['stopwords_removed'].apply(lambda x: snowball_stemmer(x))
df_train.head()

In [None]:
# Fetch the corpora this cell needs so it also runs on a fresh environment
# (same pattern as the stopwords download); nltk.download reports packages
# that are already up to date instead of fetching them again.
nltk.download('brown')    # training sentences for the taggers below
nltk.download('wordnet')  # loaded as soon as wordnet.NOUN etc. are accessed

from nltk.corpus import wordnet
from nltk.corpus import brown

# First letter of a POS tag -> WordNet part-of-speech constant.
wordnet_map = {"N":wordnet.NOUN, 
               "V":wordnet.VERB, 
               "J":wordnet.ADJ, 
               "R":wordnet.ADV
              }

# Backoff chain trained on the Brown "news" sentences:
# bigram tagger -> unigram tagger -> default tag 'NN'.
train_sents = brown.tagged_sents(categories='news')
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

def pos_tag_wordnet(text, pos_tag_type="pos_tag"):
    """
        Tag a list of tokens and express each tag as a WordNet POS constant

        text: list of tokens
        pos_tag_type: not used by this function
        Returns a list of (word, wordnet_pos) tuples
    """
    tagged_tokens = t2.tag(text)

    # The first letter of the tag selects the WordNet POS; anything that is
    # not in wordnet_map is treated as a noun.
    return [
        (word, wordnet_map.get(tag[0], wordnet.NOUN))
        for word, tag in tagged_tokens
    ]

In [None]:
%time 

df_train['combined_postag_wnet'] = df_train['stopwords_removed'].apply(lambda x: pos_tag_wordnet(x))

In [None]:
from nltk.stem import WordNetLemmatizer

def lemmatize_word(text):
    """
        Lemmatize a list of (word, wordnet_pos) pairs and return the lemmas
    """
    wn_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in text:
        lemmas.append(wn_lemmatizer.lemmatize(word, tag))
    return lemmas

In [None]:
# Baseline: lemmatize each token without passing a POS tag, then filter the
# stop words again on the resulting lemmas.
lemmatizer = WordNetLemmatizer()

df_train['lemmatize_word_wo_pos'] = df_train['stopwords_removed'].apply(
    lambda tokens: [
        lemma
        for lemma in map(lemmatizer.lemmatize, tokens)
        if lemma not in stop
    ]
)
df_train.head()

In [None]:
%time 

# Test with POS Tagging
lemmatizer = WordNetLemmatizer()

df_train['lemmatize_word_w_pos'] = df_train['combined_postag_wnet'].apply(lambda x: lemmatize_word(x))
df_train['lemmatize_word_w_pos'] = df_train['lemmatize_word_w_pos'].apply(lambda x: [word for word in x if word not in stop]) # double check to remove stop words
df_train['lemmatize_text'] = [' '.join(map(str, l)) for l in df_train['lemmatize_word_w_pos']] # join back to text

df_train.head()

In [None]:


# Print the tweet with index label 31 at four stages: raw text, POS-tagged
# tokens, lemmas without POS and lemmas with POS.
for column in (
    "text",
    "combined_postag_wnet",
    "lemmatize_word_wo_pos",
    "lemmatize_word_w_pos",
):
    print(df_train[column][31])



In [None]:
# Display the training frame with all the columns added by the cells above.
df_train

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def cv(data, ngram = 1, MAX_NB_WORDS = 75000):
    """
        Fit a CountVectorizer on data and return the dense count matrix

        data: iterable of documents (strings)
        ngram: n-gram size, used as both ends of ngram_range
        MAX_NB_WORDS: passed to CountVectorizer as max_features
        Returns (dense count array, fitted CountVectorizer)
    """
    vectorizer = CountVectorizer(
        ngram_range=(ngram, ngram),
        max_features=MAX_NB_WORDS,
    )
    emb = vectorizer.fit_transform(data).toarray()
    n_features = np.array(emb).shape[1]
    print("count vectorize with", str(n_features), "features")
    return emb, vectorizer

In [None]:
def print_out(emb, feat, ngram, compared_sentence=0):
    """
        Print the vocabulary, the bag-of-words matrix and one example vector

        emb: 2-D count array as returned by cv (one row per document)
        feat: the fitted CountVectorizer that produced emb
        ngram: label printed in front of each section (e.g. "Uni-gram")
        compared_sentence: 0-based position of the document to show
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); keep the old call as a fallback for old versions.
    if hasattr(feat, "get_feature_names_out"):
        feature_names = feat.get_feature_names_out().tolist()
    else:
        feature_names = feat.get_feature_names()

    print(ngram,"bag-of-words: ")
    print(feature_names, "\n")
    print(ngram,"bag-of-feature: ")
    # Use the vectorizer that was passed in, not a notebook-level global.
    print(feat.vocabulary_, "\n")
    print("BoW matrix:")
    print(pd.DataFrame(emb.transpose(), index = feature_names).head(), "\n")
    print(ngram,"vector example:")
    # Positional lookup: emb rows are positional, while df_train keeps its
    # original index labels after dropna(), so label 0 may no longer exist.
    print(df_train["lemmatize_text"].iloc[compared_sentence])
    print(emb[compared_sentence], "\n")



In [None]:
# Small demo: vectorize the first five lemmatized tweets as uni-grams.
test_corpus = df_train["lemmatize_text"].iloc[:5].tolist()
print("The test corpus: ", test_corpus, "\n")

test_cv_em_1gram, test_cv_1gram = cv(test_corpus, ngram=1)
print_out(test_cv_em_1gram, test_cv_1gram, ngram="Uni-gram")



In [None]:
!pip install --upgrade scikit-learn 

In [None]:
# `feat` only exists as a parameter inside print_out, so it is undefined at
# notebook scope; inspect the fitted vectorizer that was passed in for it.
test_cv_1gram