In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
#import utilities

# text processing
import re
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer 

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# models
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/avicenne/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/avicenne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


# Function to clean the text

In [218]:
def remove_stop_word(text):
    """
        remove stop word in text, 
        such as the, as, is...
        
        Args : 
            text (str) :
                the text we want to remove stopword
                
        returns :
            text without stop word
    """
    
    STOPWORDS = set(stopwords.words('english'))

    return " ".join([word for word in text.split() if word not in STOPWORDS])
    

def del_pattern(pattern, text):
    """
        Del specific pattern in text
        
        Args :
            pattern (str) : 
                the pattern we want to remove
            
            text (str) :
                the text we want to clean
        
        Returns :
            text without pattern
    """
    
    return re.sub(pattern, '', text)

def lemmatize(text):
    """
        Lemmatize text : 
        bats -> bat
        cats -> cat
        
        Args :
            text (str) :
                the text we want to lemmatize
                
        returns :
            the text lemmatize
    """
    
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in text.split()])


def del_patterns(patterns, text):
    """
        Del specific patterns in text
        
        Args :
            pattern (list) : 
                the patterns we want to remove
            
            text (str) :
                the text we want to clean
        
        Returns :
            text without patterns
    """
    
    for pattern in patterns:
        text = del_pattern(pattern, text)
    
    return text

def clean_text(text, correction_dict):
    """ 
        The goal og this function is to clean a text by removing stopwords, 
        special caracters...

        Args:
            text (str): 
                text we want to clean
                
            correction_dict (dict):
                word associated to its correction
                mainly disaster words

        Returns:
            the text cleaned
    """
    
    text = str(text).lower()
    
    special_char = r'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    url = re.compile(r"https?://\S+|www\.\S+")
    html = re.compile(r"<.*?>")    
    num_in_words = r'\b\w*\d+\w*\b'
    custom = r'(\x89|û|ó)'
    emoji = re.compile(r"[\U0001F600-\U0001F64F"  # Emoticons
                   "\U0001F300-\U0001F5FF"  # Symbols & pictographs
                   "\U0001F680-\U0001F6FF"  # Transport & map symbols
                   "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
                   "\U00002702-\U000027B0"  # Dingbats
                   "\U000024C2-\U0001F251"  # Enclosed characters
                   "]+", 
                  flags=re.UNICODE)
    
    extensions = ['com', 'fr', 'org', 'net', 'edu', 'gov', 'uk', 'de', 'jp', 'au', 'ca', 'us', 'info', 'biz', 'co']
    web_site = r'\b\S+\.(?:' + '|'.join(extensions) + r')\b'
    
    patterns = [url,
                num_in_words,
               html,
               emoji,
               web_site,
               rf'[{re.escape(special_char)}]']
    
    text = del_patterns(patterns, text)
    
    for word in correction_dict:
        text = text.replace(word, correction_dict[word])
        
    return lemmatize(''.join(text))

def get_duplicate_text(data : pd.DataFrame, name_col_text: str):
    """
        get the text dulpicated
        
        Args:
            data (pd.DataFrame): 
                data we want to extract unique duplicated text
                
            name_col_text (str):
                the name of the column in the dataframe that contains the text
                
        Returns:
            the list of text 
    """
    list_text = []
    for value, txt in zip(data[name_col_text].value_counts().tolist(), data[name_col_text].value_counts().index):
        if value>1:
            list_text.append(txt)
            
    return list_text

def get_index_different_target(train, texts):
    """
        Among the texts duplicated we look after
        those which do not have the same target

        Args:
            train (pd.DataFrame): 
                the dataframe we want to extract the index
                
            texts (list):
                list of duplicated text
                
        Returns:
            the list of index
    """
    list_index = {}
    for i, text in enumerate(texts):
        list_index["text{}".format(i)] = {"index" : [],
                                          "target" : []}
        
        for ind, data_txt in zip(train.text.index, train.text.tolist()):
                if data_txt in text:
                    list_index["text{}".format(i)]["index"].append(ind)
                    list_index["text{}".format(i)]["target"].append(train.target.loc[ind])
                    
    new_list_index = {}
    for j in list_index.keys():
        if len(Counter(list_index[j]["target"]))>1:
            new_list_index[j] = list_index[j]
            # print(list_index[j]["index"], train.loc[list_index[j]["index"][0]].text, end= '\n')
    return new_list_index

def hard_coded_targetting(train):
    """
        hard coded function to associate correct target 
        for duplicates text with differents target

    Args:
        train (pd.DataFrame): 
            the train dataframe
            
    Returns:
        the train with duplicates text and correct targets
    """
    
    dict_target ={
                3240: 0, 3243: 0, 3248: 0, 3251: 0, 3261: 0, 3266: 0,
                4284: 0, 4286: 0, 4292: 0, 4304: 0, 4309: 0, 4318: 0,
                6091: 0, 6094: 0, 6103: 0, 6123: 0,
                610: 0, 624: 0, 630: 0, 634: 0,
                2830: 0, 2831: 0, 2832: 0, 2833: 0,
                3985: 0, 4013: 0, 4019: 0,
                4285: 0, 4305: 0, 4313: 0,
                4290: 0, 4299: 0, 4312: 0,
                4597: 1, 4605: 1, 4618: 1, 4623: 1, 4631: 1,
                4221: 0, 4239: 0, 4244: 0,
                4232: 0, 4235: 0,
                4379: 1, 4381: 1,
                5620: 1, 5641: 1,
                1197: 0, 1331: 0,
                6614: 0, 6616: 0,
                1221: 0, 1349: 0,
                1214: 0, 1365: 0,
                4306: 0, 4320: 0,
                4285: 0, 4294: 0, 4305: 0, 4308: 0, 4313: 0
                }

    
    for ind in dict_target.keys():
        train.loc[ind, "target"] = dict_target[ind]
        
    return train

def get_unk_words(data, glove_dict, label_text_column):
    """
        Get unk word taht are not in glove after data cleaning

        Args:
            data (list): 
                list of texts
                
            glove_dict (dict): 
                glove word dict
                
        Returns:
            the list of unk word
    """
    
    list_unk = []
    tot_words = 0
    
    for text in data:
        # make sure there is a text
        if isinstance(text, str):
            for word in text.split():
                tot_words += 1
                if word not in glove_dict:
                    list_unk.append(word)
                    
    print(f"there a total of {tot_words} for {len(list(Counter(list_unk).keys()))} unk words that are not in glove")
    
    return list_unk


In [157]:
def remove_words_with_digits(text):
    pattern = r'\b\w*\d+\w*\b'
    cleaned_text = re.sub(pattern, '', text)
    #cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

text = "Ce texte contient des mots comme e10march, \x89ûó, simpletext, et d'autres mots normaux."
cleaned_text = remove_words_with_digits(text)
print(cleaned_text)

Ce texte contient des mots comme , ûó, simpletext, et d'autres mots normaux.


# Glove embedding

In [121]:
embed_dict = {}
word_dict = {}
with open("/Users/avicenne/Documents/python/NLP/Glove/glove.6B.100d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        word_dict[word] = word
        vector = np.asarray(values[1:], "float32")
        embed_dict[word] = vector

# Some corrected words related to disasters

In [214]:
hard_coded_dict = {
    "goooooooaaaaaal": "goal",
    "looooool": "lol",
    "newsnigeria": "news nigeria",
    "southridgelife": "south ridgde life",
    "nowplaying": "now playing",
    "û÷hijacker": "hijacker",
    "phdsquares": "phd squares",
    "personalinjury": "personal injury",
    "caraccidentlawyer": "car accident lawyer",
    "accidentwho": "accident who",
    "truckcrash": "truck crash",
    "crashgt": "crash",
    "damagenhs": "damages",
    "damagewpd": "damaged",
    "nearfatal":  "near fatal",
    "southaccident": "south accident",
    "measuresarrestpastornganga": "measures arrest pastor and gang",
    "aftershockdelo": "after shock",
    "lifehacks": "lifehacker",
    "onfireanders": "on fire anders",
    "aftershockorg": "after shock",
    "butthe": "battle",
    "breakfastone": "break fast one",
    "uptotheminute": "up to the minute",
    "warmbodies": "warm bodies",
    "aggressif": "aggressive",
    "allah": "god"
}

# Duplicates text with differents target

In [215]:
train = pd.read_csv("/Users/avicenne/Documents/python/Project-github/Kaggle/nlp-getting-started/train.csv")
test = pd.read_csv("/Users/avicenne/Documents/python/Project-github/Kaggle/nlp-getting-started/test.csv")

list_text_before = get_duplicate_text(train, 'text')
print("number of text duplicates with different targets :",len(get_index_different_target(train, list_text_before)))

train = hard_coded_targetting(train)
list_text_after = get_duplicate_text(train, 'text')
print("number of text duplicates with different targets :",len(get_index_different_target(train, list_text_after)))

X_test = test["text"].apply(lambda x: clean_text(x, hard_coded_dict)).tolist()
id_test = test["id"]

X_train = train["text"].apply(lambda x: clean_text(x, hard_coded_dict)).tolist()
Y_train = train["target"].tolist()

number of text duplicates with different targets : 19
number of text duplicates with different targets : 0


# Display word that are not in glove

In [177]:
for i in word_dict:
    if "lmao" in i:
        print(i)

In [219]:
unk_words = list(Counter(get_unk_words(X_train, word_dict, "text")).keys())

there a total of 103771 for 4756 unk words that are not in glove


In [202]:
from tqdm import tqdm

from autocorrect import Speller
spell = Speller(lang="en")

dict_correction= {}

for word in tqdm(unk_words):
    if word not in dict_correction:
        dict_correction[word] = [spell(word)]
    else:
        dict_correction[word].append(spell(word))

100%|██████████| 4781/4781 [05:31<00:00, 14.41it/s]


In [204]:
""" save the dict to take a look and build the hard coded correction dict

import json
file_path = 'dict_correction.json'

# Sauvegarder le dictionnaire dans un fichier JSON
with open(file_path, 'w') as file:
    json.dump(dict_correction, file, indent=4)"""

# TFIDF

TF-IDF (Term Frequency-Inverse Document Frequency): this is a widely-used method that calculates a score for each term by combining a value that depends on its frequency of appearance in a text (term frequency) and a second value that depends on its appearance in all texts. The advantages are that the method is simple to implement and therefore effective, filters out common terms by lowering their scores, and highlights important rare terms. However, sensitivity to rare terms can exaggerate the score, semantic relations are not taken into account and the method loses effectiveness on long texts.

In [220]:
tfidf_vectorizer = TfidfVectorizer() 

tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

In [221]:
x_train_tfidf, x_val_tfidf, y_train, y_val = train_test_split(tfidf_train_vectors,
                                                  Y_train,
                                                  test_size = 0.3, random_state = 42)
print(x_train_tfidf.shape)

(5329, 15109)


shape data : texts = [train, val, test]
             labels = [train, val, None]

In [222]:
x_train, x_val, y_train, y_val = train_test_split(X_train,
                                                  Y_train,
                                                  test_size = 0.3, random_state = 42)
print(len(x_train))

5329


In [223]:
texts = [i for i in x_train]
texts += [i for i in x_val]
texts += [i for i in X_test]

print(len(x_train))
print(len(x_val))
print(len(X_test))

5329
2284
3263


# create vocabulary

# Keras

In [224]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

maxlen = 250
max_words = 100000
embedding_size=150
lr = 1e-3
lr_d = 0

# create voc
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)
#labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)

# split data
x_train_keras = data[0:5329]
x_val_keras = data[5329 : 5329+2284]
x_test_keras = data[5329+2284 : 5329+2284+3263]

Found 19312 unique tokens.
Shape of data tensor: (10876, 250)


In [225]:
print('Shape of train tensor:', x_train_keras.shape)
print('Shape of validate tensor:', x_val_keras.shape)
print('Shape of test tensor:', x_test_keras.shape)

Shape of train tensor: (5329, 250)
Shape of validate tensor: (2284, 250)
Shape of test tensor: (3263, 250)


build embedding matrix

In [226]:
def get_embedding_matrix(word_index, embed_dict):

    embedding_dim = 100 #length of vect word
    max_words = 400000  #number of embedded word we take from glove (out of 400 000 words vect)
    embedding_matrix = np.zeros((max_words, embedding_dim)) # matrix vect words
    for word, ind in word_index.items():
        if ind < max_words:
            embedding_vector = embed_dict.get(word)
            if embedding_vector is not None:
                embedding_matrix[ind] = embedding_vector
    print(embedding_matrix.shape)
    return embedding_matrix
    
embedding_matrix_keras = get_embedding_matrix(word_index, embed_dict)

(400000, 100)


# Bert

In [227]:
import tensorflow as tf
from transformers import BertTokenizer

# Initialisation du tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizer les textes avec padding et truncation
def tokenize_(texts, tokenizer, maxlen):
    encoded_inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=250, return_tensors='tf')
    return encoded_inputs['input_ids']

data = tokenize_(texts, tokenizer, maxlen)

# Create dict word_unique_vocab -> index
bert_word_index = {word: idx for idx, word in enumerate(tokenizer.get_vocab().keys())}

print(f'Found {len(bert_word_index)} unique tokens.')
print('Shape of data tensor:', data.shape)

# split data
x_train_bert = data[0:5329]
x_val_bert = data[5329 : 5329+2284]
x_test_bert = data[5329+2284 : 5329+2284+3263]

Found 30522 unique tokens.
Shape of data tensor: (10876, 250)


In [228]:
print('Shape of train tensor:', x_train_bert.shape)
print('Shape of validate tensor:', x_val_bert.shape)
print('Shape of test tensor:', x_test_bert.shape)

Shape of train tensor: (5329, 250)
Shape of validate tensor: (2284, 250)
Shape of test tensor: (3263, 250)


In [229]:
embedding_matrix_bert = get_embedding_matrix(bert_word_index, embed_dict)

(400000, 100)


# CNN model

In [230]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Conv1D, MaxPooling1D
from keras.optimizers import Adam


model = Sequential()
model.add(Embedding(input_dim=400000, output_dim=100, input_length=250))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.build(input_shape=(None, 250))



In [231]:
# Définir les poids de la couche d'Embedding
model.layers[0].set_weights([np.array(embedding_matrix_keras)])
model.layers[0].trainable = False

#model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])

In [232]:
history = model.fit(np.array(x_train_keras), np.array(y_train),
                    epochs=20,
                    batch_size=512,
                    validation_data=(np.array(x_val_keras), np.array(y_val)))

Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 118ms/step - accuracy: 0.5723 - loss: 0.6792 - val_accuracy: 0.7531 - val_loss: 0.6068
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 102ms/step - accuracy: 0.7449 - loss: 0.5788 - val_accuracy: 0.7579 - val_loss: 0.5287
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 102ms/step - accuracy: 0.7683 - loss: 0.5068 - val_accuracy: 0.7890 - val_loss: 0.4723
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step - accuracy: 0.7864 - loss: 0.4646 - val_accuracy: 0.7968 - val_loss: 0.4571
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 103ms/step - accuracy: 0.8036 - loss: 0.4370 - val_accuracy: 0.8047 - val_loss: 0.4493
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step - accuracy: 0.8206 - loss: 0.4075 - val_accuracy: 0.7973 - val_loss: 0.4565
Epoch 7/20
[1m11/11[0m [3

In [240]:
history = model.fit(np.array(x_train_bert), np.array(y_train),
                    epochs=20,
                    batch_size=512,
                    validation_data=(np.array(x_val_bert), np.array(y_val)))

Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step - accuracy: 0.5239 - loss: 1.1202 - val_accuracy: 0.5797 - val_loss: 0.7679
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 104ms/step - accuracy: 0.5726 - loss: 0.7056 - val_accuracy: 0.5845 - val_loss: 0.6714
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step - accuracy: 0.6234 - loss: 0.6512 - val_accuracy: 0.6077 - val_loss: 0.6602
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 104ms/step - accuracy: 0.6690 - loss: 0.6315 - val_accuracy: 0.6243 - val_loss: 0.6535
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 112ms/step - accuracy: 0.6852 - loss: 0.6139 - val_accuracy: 0.6397 - val_loss: 0.6424
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step - accuracy: 0.7103 - loss: 0.5854 - val_accuracy: 0.6484 - val_loss: 0.6331
Epoch 7/20
[1m11/11[0m [3

# Classification

In [233]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

XGB = XGBClassifier()
    

XGB.fit(x_train_tfidf, y_train)

predictions_xgb = XGB.predict(x_val_tfidf)

print("Accuracy : ",  accuracy_score(y_val, predictions_xgb))

Accuracy :  0.7788966725043783


# Bagging

In [234]:
from sklearn.ensemble import BaggingClassifier

def bag_model(model, x_train, y_train, x_val, y_val, cross_val):
    bag = BaggingClassifier(estimator=model,
                            n_estimators=10, 
                            random_state=0).fit(x_train, y_train)

    predictions_bag = bag.predict(x_val)

    print("accuracy bag: ",  accuracy_score(y_val, predictions_bag))

    if cross_val:
        # Utiliser la validation croisée pour évaluer la performance du modèle
        scores = cross_val_score(bag, x_train, y_train, cv=5, scoring='accuracy')
        print("Cross-validation accuracy scores: ", scores)
        print("Mean cross-validation accuracy: ", scores.mean())
    return bag

In [236]:
bag_xgb = bag_model(XGBClassifier(random_state=42), x_train_tfidf, y_train, x_val_tfidf, y_val, False)

accuracy bag:  0.7880910683012259


In [50]:
#bag_cat = bag_model(CatBoostClassifier(random_state=42, verbose=0), x_train, y_train, x_val, y_val, False)

# Voting

In [51]:
xgb = XGBClassifier(random_state=42)
cat = CatBoostClassifier(random_state=42, verbose=0)
lgb = LGBMClassifier(random_state = 42, verbose=-1)

def voting_models(models, x_train, y_train, x_val, y_val, cross_val):
    
    voting = VotingClassifier(estimators=models,
                                    voting='soft')

    voting.fit(x_train, y_train)

    predictions_boost = voting.predict(x_val)

    print("accuracy boost: ",  accuracy_score(y_val, predictions_boost))

    if cross_val:
        # Utiliser la validation croisée pour évaluer la performance du modèle
        scores = cross_val_score(voting, x_train, y_train, cv=5, scoring='accuracy')
        print("Cross-validation accuracy scores: ", scores)
        print("Mean cross-validation accuracy: ", scores.mean())
    
    return voting

models = [('xgb', xgb), ('cat', cat), ('lgb', lgb)]

# vot = voting_models(models, x_train, y_train, x_val, y_val, False)

# Submission

In [241]:
def submission(model, test, Id, cnn):
    pred = model.predict(test)
    
    if cnn:
        pred_c = []
        for i in pred:
            if i>0.6:
                pred_c.append(1)
            else:
                pred_c.append(0)
        pred = pred_c

    submission = pd.DataFrame({'id': Id,
                               'target': pred})

    # Save submission to a CSV file
    submission.to_csv('submission.csv', index=False)
    
submission(model, np.array(x_test_keras), id_test, True)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [238]:
submission(bag_xgb, tfidf_test_vectors, id_test, False)