In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utilities
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# We import glove file in order to explore new method based on its word vect representation

In [5]:
"""embed_dict = {}
word_dict = {}
with open("/kaggle/input/glove-100d/glove.6B.100d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        word_dict[word] = word
        vector = np.asarray(values[1:], "float32")
        embed_dict[word] = vector"""


'embed_dict = {}\nword_dict = {}\nwith open("/kaggle/input/glove-100d/glove.6B.100d.txt", \'r\', encoding="utf-8") as f:\n    for line in f:\n        values = line.split()\n        word = values[0]\n        word_dict[word] = word\n        vector = np.asarray(values[1:], "float32")\n        embed_dict[word] = vector'

In [6]:
def clean_data(text):
    """ The goal og this function is to clean a text by removing stopwords, special caracters...

    Args:
        text (str): 
            text we want to clean
        
        acronymes (dict): 
            dict of accronymes with its long word associated

    Returns:
        text : 
            the text cleaned
    """

    text = str(text)
    #-------------------------------------------------------------------------------------------------

    ## on gère tout ce qui est distances, mesures... (ex : 12inchs, 12cm, 12gm...)
    ## en gros on supprime tous les mots qui ont des nombres collés
    
    text = re.sub(r'\d+', '', text) ## on supp les chiffres collés aux mots
    
    #-------------------------------------------------------------------------------------------------
    ## on met tout en minuscule
    text = text.lower()
    
    #-------------------------------------------------------------------------------------------------

    ## on retire la ponctuation
    a_supp = r'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    text = re.sub(rf'[{re.escape(a_supp)}]', '', text)
    
    #-------------------------------------------------------------------------------------------------

    ## on retire les liens
    url = re.compile(r"https?://\S+|www\.\S+")
    text = url.sub(r'', text)
    
    html = re.compile(r"<.*?>")
    text = html.sub(r'', text)
    
    #-------------------------------------------------------------------------------------------------
    ## on retire les émojis
    
    emoji = re.compile(r"[\U0001F600-\U0001F64F"  # Emoticons
                   "\U0001F300-\U0001F5FF"  # Symbols & pictographs
                   "\U0001F680-\U0001F6FF"  # Transport & map symbols
                   "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
                   "\U00002702-\U000027B0"  # Dingbats
                   "\U000024C2-\U0001F251"  # Enclosed characters
                   "]+", 
                  flags=re.UNICODE)
    
    text = emoji.sub(r'', text)

        
    #-------------------------------------------------------------------------------------------------    # on retire les mots communs
    
    STOPWORDS = set(stopwords.words('english'))

    text = " ".join([word for word in text.split() if word not in STOPWORDS])

    #-------------------------------------------------------------------------------------------------
    
    return text

def glove_mean(text):
    """
    Count mean vect of text using glove word vect representation
    
    Args :
        text (str) :
            string we want to vectorize
            
    Returns :
        the mean vect for the text using glove vect representation
    """
    mean_vect = []
    for i in text.split():
        if i in embed_dict.keys():
            mean_vect.append(embed_dict[i])
    return np.mean(mean_vect, axis= 0)

In [7]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
X_test = test["text"].apply(clean_data).tolist()
id_test = test["id"]

X_train = train["text"].apply(clean_data).tolist()
Y_train = train["target"].tolist()

# TFIDF

TF-IDF (Term Frequency-Inverse Document Frequency): this is a widely-used method that calculates a score for each term by combining a value that depends on its frequency of appearance in a text (term frequency) and a second value that depends on its appearance in all texts. The advantages are that the method is simple to implement and therefore effective, filters out common terms by lowering their scores, and highlights important rare terms. However, sensitivity to rare terms can exaggerate the score, semantic relations are not taken into account and the method loses effectiveness on long texts.

In [9]:
tfidf_vectorizer = TfidfVectorizer() 

tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

In [10]:
x_train, x_val, y_train, y_val = train_test_split(tfidf_train_vectors,
                                                  Y_train,
                                                  test_size = 0.3, random_state = 42)
print(x_train.shape)

(5329, 21445)


# Classification

In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

XGB = XGBClassifier()
    

XGB.fit(x_train, y_train)

predictions_xgb = XGB.predict(x_val)

print("Accuracy : ",  accuracy_score(y_val, predictions_xgb))

Accuracy :  0.7666374781085814


# Bagging

In [12]:
from sklearn.ensemble import BaggingClassifier

def bag_model(model, x_train, y_train, x_val, y_val):
    bag = BaggingClassifier(estimator=model,
                            n_estimators=10, 
                            random_state=0).fit(x_train, y_train)

    predictions_bag = bag.predict(x_val)

    print("accuracy bag: ",  accuracy_score(y_val, predictions_bag))

    # Utiliser la validation croisée pour évaluer la performance du modèle
    scores = cross_val_score(bag, x_train, y_train, cv=5, scoring='accuracy')
    print("Cross-validation accuracy scores: ", scores)
    print("Mean cross-validation accuracy: ", scores.mean())
    return bag

In [13]:
bag_xgb = bag_model(XGBClassifier(random_state=42), x_train, y_train, x_val, y_val)

accuracy bag:  0.7670753064798599
Cross-validation accuracy scores:  [0.76641651 0.7804878  0.76172608 0.75984991 0.76901408]
Mean cross-validation accuracy:  0.7674988769389318


In [14]:
bag_cat = bag_model(CatBoostClassifier(random_state=42, verbose=0), x_train, y_train, x_val, y_val)

accuracy bag:  0.7727670753064798
Cross-validation accuracy scores:  [0.7673546  0.7879925  0.75984991 0.76547842 0.76338028]
Mean cross-validation accuracy:  0.7688111407657956


# Voting

In [15]:
xgb = XGBClassifier(random_state=42)
cat = CatBoostClassifier(random_state=42, verbose=0)
lgb = LGBMClassifier(random_state = 42, verbose=-1)

def voting_models(models, x_train, y_train, x_val, y_val):
    
    voting = VotingClassifier(estimators=models,
                                    voting='soft')

    voting.fit(x_train, y_train)

    predictions_boost = voting.predict(x_val)

    print("accuracy boost: ",  accuracy_score(y_val, predictions_boost))

    # Utiliser la validation croisée pour évaluer la performance du modèle
    scores = cross_val_score(voting, x_train, y_train, cv=5, scoring='accuracy')
    print("Cross-validation accuracy scores: ", scores)
    print("Mean cross-validation accuracy: ", scores.mean())
    
    return voting

models = [('xgb', xgb), ('cat', cat), ('lgb', lgb)]

vot = voting_models(models, x_train, y_train, x_val, y_val)

accuracy boost:  0.7732049036777583
Cross-validation accuracy scores:  [0.77204503 0.78705441 0.76454034 0.76078799 0.76525822]
Mean cross-validation accuracy:  0.7699371966634077


# Submission

In [16]:
def submission(model, test, Id):
    pred = model.predict(test)

    submission = pd.DataFrame({'id': Id,
                               'target': pred})

    # Save submission to a CSV file
    submission.to_csv('submission.csv', index=False)
    
submission(vot, tfidf_test_vectors, id_test)