In [2]:
# Mount Google Drive at /content/drive; every data path used later in this
# notebook points below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the raw train/test CSV files from the mounted Drive folder.
train_data = pd.read_csv('/content/drive/My Drive/Parcours_IML/IML_Projet_8/Data/train.csv',sep=',')
test_data = pd.read_csv('/content/drive/My Drive/Parcours_IML/IML_Projet_8/Data/test.csv',sep=',')

# Cleaning the training data

## Deleting stop words

In [5]:
import nltk
from nltk.corpus import stopwords
# Download the stop-word corpus before stopwords.words('english') is used below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Extra tokens to delete in addition to nltk.corpus.stopwords: a handful of
# capitalised words plus every ASCII punctuation character.
# NOTE(review): the capitalised words are presumably listed because the
# stop-word filter compares tokens case-sensitively - confirm.
import string
# Each word appears once; the original list repeated 'The' and 'A'.
word_to_delete = ['I', 'The', 'A', 'This', 'You', 'My', 'We']
special = string.punctuation
# Only membership (`word in element_to_delete`) is used downstream, so a single
# concatenation replaces the two append loops.
element_to_delete = word_to_delete + list(special)

In [7]:
# Split every training text on whitespace: one list of tokens per row.
to_be_cleaned = train_data.text.str.split()
from functools import lru_cache

@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loaded a single time."""
    return frozenset(stopwords.words('english'))

def remove_stopwords(doc):
    """Join the tokens of ``doc`` that are not English stop words.

    Args:
        doc: Iterable of string tokens.

    Returns:
        The surviving tokens joined by single spaces ('' when none survive).

    The stop-word list used to be re-evaluated for every token; it is now
    fetched once and kept as a set, which leaves the result unchanged.
    NOTE(review): the comparison is case-sensitive, so a capitalised token is
    only dropped when the stop-word list contains that exact spelling.
    """
    english_stopwords = _english_stopwords()
    return " ".join(word for word in doc if word not in english_stopwords)
def remove_element(doc):
    """Drop every token found in ``element_to_delete`` and join the rest with spaces."""
    kept = []
    for token in doc:
        if token in element_to_delete:
            continue
        kept.append(token)
    return " ".join(kept)
# Pass 1: remove NLTK stop words; each row becomes a single string again.
first_step = [remove_stopwords(to_be_cleaned[row]) for row in range(len(to_be_cleaned))]
# Pass 2: split back into tokens so the second filter sees individual tokens.
second_step = [sentence.split() for sentence in first_step]
# Pass 3: remove the extra tokens listed in element_to_delete.
cleaned_text = [remove_element(tokens) for tokens in second_step]

## Word stemming

In [8]:
from nltk.stem.snowball import SnowballStemmer
# Single English Snowball stemmer instance, read as a global by text_stematization.
stemmer = SnowballStemmer("english")
def text_stematization(data):
  """Stem every token of ``data['text']`` with the module-level ``stemmer``.

  Args:
    data: DataFrame whose ``text`` column holds one list of tokens per row.

  Returns:
    The same DataFrame object, with ``text`` replaced by lists of stemmed tokens.
  """
  # Series.apply visits every row whatever the index labels are, whereas
  # ``text[i]`` only works on a default 0..n-1 index.  New lists are built
  # rather than rewriting the existing ones element by element, so any other
  # object that shares those lists is left untouched.
  data['text'] = data['text'].apply(
      lambda tokens: [stemmer.stem(token) for token in tokens])
  return data

In [9]:
# Split the filtered sentences back into token lists (one-column frame),
# stem every token, then preview the first row.
Cleaning_df = pd.DataFrame({'text': pd.Series(cleaned_text).str.split()})
Cleaning_df = text_stematization(Cleaning_df)
Cleaning_df.head(1)


Unnamed: 0,text
0,"[our, deed, reason, #earthquak, may, allah, fo..."


## Word lemmatization

In [10]:
# Download the WordNet corpus before the WordNetLemmatizer is used.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [11]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, read as a global by text_lemmatization.
lemmatizer = WordNetLemmatizer()
def text_lemmatization(data):
  """Lemmatize every token of ``data['text']`` with the module-level ``lemmatizer``.

  Args:
    data: DataFrame whose ``text`` column holds one list of tokens per row.

  Returns:
    The same DataFrame object, with ``text`` replaced by lists of lemmatized tokens.
  """
  # Series.apply visits every row whatever the index labels are, whereas
  # ``text[i]`` only works on a default 0..n-1 index.  New lists are built
  # rather than rewriting the existing ones element by element, so any other
  # object that shares those lists is left untouched.
  data['text'] = data['text'].apply(
      lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
  return data

In [12]:
# Lemmatize the already-stemmed tokens and preview the first row.
# NOTE(review): the recorded preview is identical to the one shown right after
# stemming - confirm that lemmatizing after stemming is the intended order.
Cleaning_df = text_lemmatization(Cleaning_df)
Cleaning_df.head(1)

Unnamed: 0,text
0,"[our, deed, reason, #earthquak, may, allah, fo..."


## Removing URL, Emoji & Punctuation

In [13]:
def remaking_str(doc):
  """Join a list of tokens into one space-separated string.

  Args:
    doc: Iterable of string tokens, or a string that is already joined.

  Returns:
    The tokens joined by single spaces; a string input is returned unchanged.
  """
  # Joining a plain string would put a space between every character, which is
  # what happened when this cell was executed a second time on joined text.
  if isinstance(doc, str):
    return doc
  return " ".join(doc)
# Turn each row's token list back into a single string.
Cleaning_df['text'] = Cleaning_df['text'].apply(remaking_str)
import re
def remove_URL(text):
    """Return ``text`` with every ``http://``, ``https://`` or ``www.`` link deleted.

    A link runs up to the next whitespace character; the whitespace around it
    is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
# Strip links from every row.
Cleaning_df['text'] = Cleaning_df['text'].apply(remove_URL)

def remove_emoji(text):
    """Delete every run of characters that falls inside the code-point ranges below.

    NOTE(review): the last range runs from U+24C2 to U+1F251, which is much
    wider than emoji (CJK ideographs, for example, lie inside it) - confirm
    that removing such characters is acceptable.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

# Strip emoji/pictograph characters from every row.
Cleaning_df['text'] = Cleaning_df['text'].apply(remove_emoji)

def remove_punct(text):
    """Return ``text`` without any character contained in ``string.punctuation``."""
    # Mapping a code point to None tells str.translate to delete that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)
# Strip ASCII punctuation from every row.
Cleaning_df['text'] = Cleaning_df['text'].apply(remove_punct)

To check whether our cleaning helped, we compare how much of the vocabulary and of the text is covered by the GloVe word-embedding model before and after cleaning.

## Comparing GloVe word coverage

In [14]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} dictionary.
embeddings_index = dict()
# `with` closes the file even if a line fails to parse; the explicit encoding
# keeps the read independent of the platform's default text encoding.
with open('/content/drive/My Drive/Parcours_IML/IML_Projet_8/Pretrained_word_embedding/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <coef_1> ... <coef_n>", separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))


def build_vocab(X):
    """Count the occurrences of every whitespace-separated token in ``X``.

    Args:
        X: pandas Series of strings.

    Returns:
        Dict mapping each token to the number of times it appears in ``X``.
    """
    counts = {}
    for sentence in X.apply(lambda s: s.split()).values:
        for token in sentence:
            counts[token] = counts.get(token, 0) + 1
    return counts


import operator
from operator import itemgetter
def check_embeddings_coverage(X, embeddings):
    """Measure how much of the vocabulary of ``X`` is present in ``embeddings``.

    Args:
        X: Series of strings; handed to ``build_vocab`` for token counting.
        embeddings: Mapping from token to vector; a lookup of a missing token
            must raise ``KeyError``.

    Returns:
        Tuple ``(sorted_oov, vocab_coverage, text_coverage)``:
        ``sorted_oov`` lists ``(token, count)`` pairs of tokens missing from
        ``embeddings``, most frequent first; ``vocab_coverage`` is the share of
        distinct tokens that are covered; ``text_coverage`` is the share of
        token occurrences that are covered.  An empty vocabulary yields
        ``([], 0.0, 0.0)`` rather than a ZeroDivisionError.
    """
    vocab = build_vocab(X)
    if not vocab:
        return [], 0.0, 0.0

    covered = {}
    oov = {}
    n_covered = 0
    n_oov = 0

    for word, count in vocab.items():
        try:
            covered[word] = embeddings[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error is
            # a real problem and is allowed to propagate.
            oov[word] = count
            n_oov += count
        else:
            n_covered += count

    vocab_coverage = len(covered) / len(vocab)
    text_coverage = n_covered / (n_covered + n_oov)

    # Ascending sort followed by a reversal, kept as-is so that tokens with
    # equal counts come out in the same order as before.
    sorted_oov = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_oov, vocab_coverage, text_coverage

Loaded 400000 word vectors.


In [15]:
# Baseline: GloVe coverage measured on the raw, uncleaned training text.
train_glove_oov, train_glove_vocab_coverage, train_glove_text_coverage = check_embeddings_coverage(train_data['text'], embeddings_index)
print('Before cleaning, GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set'.format(train_glove_vocab_coverage, train_glove_text_coverage))

Before cleaning, GloVe Embeddings cover 23.64% of vocabulary and 56.70% of text in Training Set


In [16]:
# Same measurement on the cleaned text. The three result variables are reused,
# so the before-cleaning values are overwritten here.
train_glove_oov, train_glove_vocab_coverage, train_glove_text_coverage = check_embeddings_coverage(Cleaning_df['text'], embeddings_index)
print('After cleaning, GloVe Embeddings cover {:.2%} of vocabulary and {:.2%} of text in Training Set'.format(train_glove_vocab_coverage, train_glove_text_coverage))

After cleaning, GloVe Embeddings cover 54.82% of vocabulary and 77.36% of text in Training Set


Now let's save the cleaned data for the next step.

# Saving the cleaned training data

In [17]:
# Assemble the training export: cleaned text plus the original keyword and
# target columns. Missing keywords are replaced by the 'no_keyword' marker.
cleaned_data = (
    Cleaning_df[['text']]
    .assign(keyword=train_data.keyword)
    .assign(
        keyword=lambda frame: frame['keyword'].fillna('no_keyword'),
        target=train_data.target,
    )
)
cleaned_data.to_csv(r'/content/drive/My Drive/Parcours_IML/IML_Projet_8/Data/cleaned_dataset.csv', index = False)

# Cleaning the test data

In [18]:
# Run the test split through the same token filters as the training split.
# remove_stopwords and remove_element are already defined in the training
# section; they are reused here instead of being defined a second time, so the
# two splits can never be cleaned by diverging copies of the same function.
to_be_cleaned = test_data.text.str.split()
# Pass 1: remove NLTK stop words; each row becomes a single string again.
first_step = [remove_stopwords(tokens) for tokens in to_be_cleaned]
# Pass 2: split back into tokens so the second filter sees individual tokens.
second_step = [sentence.split() for sentence in first_step]
# Pass 3: remove the extra tokens listed in element_to_delete.
cleaned_text = [remove_element(tokens) for tokens in second_step]

In [19]:
# Split the filtered test sentences into token lists, then stem and lemmatize
# them with the same helpers used for the training split.
Cleaning_df = pd.DataFrame({'text': pd.Series(cleaned_text).str.split()})
Cleaning_df = text_lemmatization(text_stematization(Cleaning_df))

In [20]:
import re
# String-level clean-up of the test split. remaking_str, remove_URL,
# remove_emoji and remove_punct are already defined in the training section;
# they are reused here instead of being defined a second time, so both splits
# always go through identical code.
Cleaning_df['text'] = Cleaning_df['text'].apply(remaking_str)   # token list -> string
Cleaning_df['text'] = Cleaning_df['text'].apply(remove_URL)     # strip links
Cleaning_df['text'] = Cleaning_df['text'].apply(remove_emoji)   # strip emoji ranges
Cleaning_df['text'] = Cleaning_df['text'].apply(remove_punct)   # strip punctuation

In [21]:
# Export the cleaned test text as a single `text` column, without the index.
cleaned_data = Cleaning_df[['text']].copy()
cleaned_data.to_csv(r'/content/drive/My Drive/Parcours_IML/IML_Projet_8/Data/cleaned_validation_dataset.csv', index = False)