# <ins>**I. Import Libraries**</ins>
1. Install the necessary packages in the environment.
2. Import Python Packages.


In [1]:
!pip install autocorrect
!pip install tqdm
!pip install nltk

# Run to install nltk packages.
import nltk
nltk.download("stopwords")
nltk.download("punkt")

Collecting nltk
  Downloading nltk-3.5.zip (1.4 MB)
Collecting click
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
Collecting joblib
  Downloading joblib-0.16.0-py3-none-any.whl (300 kB)
Collecting regex
  Downloading regex-2020.7.14-cp38-cp38-win_amd64.whl (264 kB)
Using legacy 'setup.py install' for nltk, since package 'wheel' is not installed.
Installing collected packages: click, joblib, regex, nltk
    Running setup.py install for nltk: started
    Running setup.py install for nltk: finished with status 'done'
Successfully installed click-7.1.2 joblib-0.16.0 nltk-3.5 regex-2020.7.14
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
# Import Libraries
import re
import urllib
import urllib.parse
from collections import OrderedDict

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from tqdm import tqdm


# <ins>**II. Load the train dataset**</ins>
1. Load the datasets needed for the project.
2. Display basic information about the dataset.

In [2]:
# Folder that holds the input files.
main_path = "Dataset/"

# Training tweets; missing values are replaced by empty strings.
train = pd.read_csv(main_path + "train.csv").fillna("")

# Abbreviation lookup table, stored as a semicolon-separated text file.
abbreviation_file = main_path + "abbreviation_cleaned.txt"
df_abbreviation = pd.read_csv(abbreviation_file, sep=";")

In [3]:
# Summarise the train dataframe: columns, non-null counts, dtypes and memory.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7613 non-null   object
 2   location  7613 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
# Preview a window of consecutive rows starting at a random position.
# The generator is seeded so the same rows are shown on every run, and the
# start is bounded so the window never runs past the end of the dataframe.
SAMPLE_SIZE = 5
rng = np.random.default_rng(seed=42)
x = int(rng.integers(0, max(len(train) - SAMPLE_SIZE, 0) + 1))
train.iloc[x:x + SAMPLE_SIZE]

Unnamed: 0,id,keyword,location,text,target
2292,3287,demolish,London,I could demolish this right now! https://t.co/...,0
2293,3288,demolish,"Napa, CA",Postal Service agrees to sell not demolish dow...,0
2294,3289,demolish,"Lagos, Nigeria",[News Update] | Enugu Government to demolish i...,0
2295,3292,demolish,,I have completed the quest 'Demolish 5 Murlo.....,0
2296,3295,demolish,KOLKATA,@Jolly_Jinu you said they are terrorist becaus...,0


# III. <ins>**Text Mining**</ins>
1. Display the frequency of targets.
2. "Text Mining" class used to clean text.
3. Frequency analysis of keyword.
4. Frequency analysis of location.
5. Tweets normalization process.
6. Analyse the sentiment of tweets.
7. Save the cleaned dataframe.


## 1. Display the frequency of targets.


In [5]:
# Analyse the frequency of the tweet targets.
# 0: Fake disaster, 1: Real disaster
# value_counts() orders by frequency, so the counts are re-indexed by label:
# position 0 is always target 0 and position 1 is always target 1, whichever
# class is the majority.
target_frequence = (train["target"].value_counts()
                    .reindex([0, 1], fill_value=0)
                    .tolist())

# Display pie chart of the targets frequency.
# Both colours need the leading "#" to be valid hex colour strings.
pie_target = go.Figure(data=go.Pie(labels=["Fake disaster", "Real disaster"],
                                   values=target_frequence,
                                   marker_colors=["#EF553B", "#636EFA"]))
pie_target.update_layout(title="Pie chart of targets frequency.")
pie_target.show()

# Share of tweets labelled as a real disaster.
r_disaster = target_frequence[1]/sum(target_frequence)
print("Real disaster represent (target: 1) {:.0%} of tweets.".format(r_disaster))

Real disaster represent (target: 1) 43% of tweets.


## 2. "Text Mining" class used to clean text.


In [6]:
class TextMining:
    """
    Text-cleaning helpers that operate on the columns of a dataframe.

    The class offers three public methods:
        - remove_sequences: collapse repeated characters and spell-check a word.
        - cleaner: normalise every text of a column into a list of tokens.
        - remove_duplicate_tokens: drop repeated tokens inside each row
          (also reachable under its historical name revome_duplicate_tokens).
    """

    def __init__(self, df=None, df_abbreviation=None):
        """
        Store the data to clean and build the shared text-processing tools.

        Args:
            - df: The dataframe containing the text to be cleaned.
            - df_abbreviation: The data frame used to replace abbreviations with
            their equivalents. The code reads its "abbreviation" and "word"
            columns. May be None, in which case no abbreviation is replaced.
        """
        self.dataframe = df
        self.df_abbreviation = df_abbreviation
        self.stop_words = set(stopwords.words("english"))

        self.spell_checker = Speller()
        self.stemmer = PorterStemmer()

        # Raw strings: "\w" inside a normal string literal is an invalid
        # escape sequence.
        self.pattern_flatten = re.compile(r"\w+")
        # Any character repeated three times or more in a row.
        self.pattern_identical_letters = re.compile(r"(.)\1{2,}")

    def remove_sequences(self, word:str):
        """
        Remove the sequences of the same letter in a word. Then, check the
        spelling of the word.

        Arg:
            - word (str): The word containing a sequence of the same letter.

        Returns:
            string: Lower-cased word without a sequence of same letter, as
            returned by the spell checker.

        Example:
            >>> word = "Peopllle"
            >>> miner = TextMining()
            >>> word = miner.remove_sequences(word)
            >>> print(word)
            people
        """
        word = word.lower()

        # A run of three or more identical characters is shortened to two of
        # them; the spell checker then decides between one and two.
        word = self.pattern_identical_letters.sub(r"\1\1", word)

        # Check the spelling of the word.
        return self.spell_checker(word)

    def _expand_abbreviation(self, abbreviation):
        """Return the tokens of the equivalent stored for an abbreviation."""
        table = self.df_abbreviation
        matches = table[table["abbreviation"] == abbreviation]
        expansion = matches["word"].to_string(index=False).lstrip()
        return word_tokenize(expansion)

    def cleaner(self, column:str):
        """
        Cleaner method is a tool to normalize the text.

        Cleaning process:
            - Remove URL.
            - Tokenize the sentence.
            - Removes all characters that are neither a letter nor a number.
            - Collapse repeated characters and spell-check the word.
            - Remove english stop words.
            - Replace abbreviations with their equivalents.
            - Word stem process (skipped for abbreviation equivalents).

        Args:
            - column (str): The column in the dataframe containing the texts
            to normalize. Every value must be a string.

        Return:
            New list with one entry per row: a list of tokens, or the empty
            string "" when no token survives the cleaning.

        Example:
            >>> df = pd.DataFrame(data={"Tweets":["My car is so faaaassst",
                                  "Check out the latest Python3 news at http://python.org.",
                                  "#Summer it's toooooo hot 🥵"],
                        "Locations":["CA", "New York", "London"]})

            >>> df_abbreviation = pd.read_csv(main_path + "abbreviation_cleaned.txt",
            delimiter=";")

            >>> miner = TextMining(df, df_abbreviation)
            >>> df["Tweets"] = miner.cleaner("Tweets")
            >>> df["Locations"] = miner.cleaner("Locations")

            >>> df

                Tweets                          Locations
            0	[car, fast]	                    [californ]
            1	[check, latest, python3, news]	[new, york]
            2	[summer, hot]	                [london]
        """
        # List containing the standardized text.
        filtered_sentences = []

        # Build the abbreviation lookup once per call instead of once per word.
        # It reads the instance's own table, not a notebook-level variable.
        if self.df_abbreviation is None:
            abbreviations = frozenset()
        else:
            abbreviations = set(self.df_abbreviation["abbreviation"].values)

        for sentence in tqdm(self.dataframe[column], desc="Text Cleaning"):
            # List containing the standardized sentence.
            token_sentence = []
            # Remove http url.
            sentence = re.sub(r"http\S+", "", sentence)

            # Divide each sentence into words.
            for word in word_tokenize(sentence):
                # Remove special characters.
                word = re.sub("[^a-zA-Z0-9]+", "", word)

                # Pure punctuation leaves nothing behind: skip it before the
                # (comparatively slow) spell checker runs.
                if not word:
                    continue

                # Remove sequences of identical letters.
                word = self.remove_sequences(word)

                # Remove all stopwords.
                if not word or word in self.stop_words:
                    continue

                if word in abbreviations:
                    # Replace abbreviation with the right word(s); they are
                    # added as-is, without stemming.
                    token_sentence.extend(self._expand_abbreviation(word))
                else:
                    word = self.stemmer.stem(word)
                    if word:
                        token_sentence.append(word)

            if token_sentence:
                # Keep only runs of word characters, which also splits any
                # token that still contains a non-word character.
                token_sentence = self.pattern_flatten.findall(str(token_sentence))
                filtered_sentences.append(token_sentence)
            else:
                # Rows without any token are stored as an empty string.
                filtered_sentences.append("")

        return filtered_sentences

    def remove_duplicate_tokens(self, column):
        """
        Find duplicate tokens in sentence and keep only the first occurrence.

        Args:
            - column (str): The column in which duplicate tokens should
            be removed.

        Return :
            New list without duplicate tokens; the order of first appearance
            is preserved.

        Example:
            >>> df = pd.DataFrame(data={"Token":[["fast", "car", "fast"],
                                            ["check", "latest", "python3", "python3", "news"],
                                            ["summer", "hot"]]})

            >>> miner = TextMining(df)
            >>> df["Token"] = miner.remove_duplicate_tokens("Token")
            >>> df

                Token
            0	[fast, car]
            1	[check, latest, python3, news]
            2	[summer, hot]
        """
        rows = tqdm(self.dataframe[column], desc="Removal of duplicate tokens")
        # dict.fromkeys keeps the first occurrence of each token, in order.
        return [list(dict.fromkeys(row)) for row in rows]

    # Historical (misspelled) name kept so existing callers keep working.
    revome_duplicate_tokens = remove_duplicate_tokens

In [7]:
# Define the miner tools: a TextMining instance built from the train dataframe
# and the abbreviation table. The cleaning cells below call its methods.
miner = TextMining(train, df_abbreviation)

## 3. Frequency analysis of keyword.

In [8]:
# Decode the percent-encoded characters of every keyword (for instance "%20"
# becomes a space). urllib.parse is imported explicitly at the top of the
# notebook: "import urllib" alone does not guarantee the submodule is loaded.
unescaped_keyword = [urllib.parse.unquote(str(word)) for word in train["keyword"]]

train["keyword_cleaned"] = unescaped_keyword

In [9]:
# Normalise the unescaped keywords into lists of tokens.
train["keyword_cleaned_token"] = miner.cleaner(column="keyword_cleaned")

# Drop the tokens repeated inside a row. ex: unit state unit stat -> unit stat
# The column has to be stored on train first because the miner reads it from there.
train["keyword_cleaned_token"] = miner.revome_duplicate_tokens(column="keyword_cleaned_token")

# Glue the tokens of each row back together into one string.
train["keyword_cleaned"] = [" ".join(tokens) for tokens in train["keyword_cleaned_token"]]

Text Cleaning: 100%|██████████| 7613/7613 [00:14<00:00, 516.84it/s]
Removal of duplicate tokens: 100%|██████████| 7613/7613 [00:00<00:00, 509465.13it/s]


In [10]:
# Plot how often each cleaned keyword occurs.
fig_keyword = px.histogram(train, x="keyword_cleaned")
fig_keyword.show()

# Report the number of distinct values in the raw keyword column.
keyword_count = train["keyword"].nunique()
print("There are %s different keywords." % keyword_count)

There are 222 different keywords.


## 4. Frequency analysis of location.

In [11]:
# Normalise the raw locations into lists of tokens.
train["location_cleaned_token"] = miner.cleaner(column="location")

# Drop the tokens repeated inside a row. ex: unit state unit stat -> unit stat
# The column has to be stored on train first because the miner reads it from there.
train["location_cleaned_token"] = miner.revome_duplicate_tokens(column="location_cleaned_token")

# Glue the tokens of each row back together into one string.
train["location_cleaned"] = [" ".join(tokens) for tokens in train["location_cleaned_token"]]

Text Cleaning: 100%|██████████| 7613/7613 [02:50<00:00, 44.55it/s]
Removal of duplicate tokens: 100%|██████████| 7613/7613 [00:00<00:00, 477340.81it/s]


In [12]:
column = "location_cleaned"

# Locations seen fewer than this many times are considered too rare to be
# representative.
MIN_LOCATION_FREQUENCY = 20

# Get the frequency of all locations, most frequent first.
location_counts = train[column].value_counts()
location_frequence = dict(location_counts)

# Number of tweets per (location, target) pair. The columns are forced to
# [0, 1] so each count is looked up by target label, never by its rank in a
# frequency-sorted list, and a missing class simply counts as 0.
# Computed before the rare locations are blanked out below.
target_counts = (pd.crosstab(train[column], train["target"])
                 .reindex(columns=[0, 1], fill_value=0))

frequent_locations = location_counts.index[location_counts >= MIN_LOCATION_FREQUENCY]

# One record per frequent location, built in a single pass rather than by
# appending rows one at a time.
# NOTE(review): "mean 0" was declared by the original code but never filled;
# it is kept (empty) so the resulting columns are unchanged.
location_corr = pd.DataFrame(
    [{"location": address,
      "frequency": int(location_counts.loc[address]),
      "frequency 0": int(target_counts.at[address, 0]),
      "frequency 1": int(target_counts.at[address, 1])}
     for address in frequent_locations],
    columns=["location", "frequency 0", "frequency 1", "mean 0", "frequency"])

# Replace the addresses whose frequency is too low with an empty value.
# Only the location column is touched: indexing with the mask alone would
# blank every column of those rows, including the tweet text and the target.
train.loc[~train[column].isin(frequent_locations), column] = ""

In [13]:
# Leave out the row that represents null (empty) locations. It is selected by
# value rather than by assuming it sits at position 0, and no variable named
# `id` is created, so the builtin stays usable.
located = location_corr[location_corr["location"] != ""]

# (column to plot, legend name, colour). Target 0 is the fake-disaster class
# and target 1 the real-disaster class; colours match the targets pie chart.
traces = [("frequency", "Total", "#00CC96"),
          ("frequency 0", "Fake disaster", "#EF553B"),
          ("frequency 1", "Real disaster", "#636EFA")]

# Create figure
fig_freq_location = go.Figure()
for freq_column, trace_name, color in traces:
    fig_freq_location.add_trace(go.Histogram(histfunc="sum",
                                             x=located["location"],
                                             y=located[freq_column],
                                             name=trace_name, opacity=0.7,
                                             marker_color=color))

fig_freq_location.update_layout(title="Frequencies of targets by address.",
                                xaxis_title="Address",
                                yaxis_title="Frequence")
fig_freq_location.show()

## 5. Tweets normalization process.



In [14]:
# Normalise the raw tweets into lists of tokens.
text_tokens = miner.cleaner("text")
train["text_cleaned_token"] = text_tokens

# Glue the tokens of each tweet back together into one string.
train["text_cleaned"] = train["text_cleaned_token"].apply(" ".join)

Text Cleaning: 100%|██████████| 7613/7613 [07:42<00:00, 16.44it/s]


## 6. Analyse the sentiment of tweets.

In [15]:
# Wrap every cleaned tweet in a TextBlob.
sentiment_tweets = list(map(TextBlob, train["text_cleaned"]))

# Store the polarity of each tweet, rounded to one decimal.
polarity_values = []
for blob in sentiment_tweets:
    polarity_values.append(round(blob.sentiment.polarity, 1))
train["sentiment_value"] = polarity_values

In [16]:
# Show the first ten cleaned tweets with their polarity and their target.
for row_label in range(10):
    blob = sentiment_tweets[row_label]
    polarity = train.loc[row_label, "sentiment_value"]
    target = train.loc[row_label, "target"]
    print(blob, polarity, target)

deed reason earthquak may allah forgiv us 0.0 1
forest fire near louisian rang ask canada 0.1 1
resid ask shelter place notifi offic evacu shelter place order expect 0.0 1
1300 peopl receiv wildfir evacu order california 0.0 1
got sent photo rubi alaska smoke wildfir pour school 0.0 1
rockyfir updat california hwi 20 close direct due lake counti fire caffr wildfir -0.0 1
flood disast heavi rain caus flash flood street manit colorado spring area 0.0 1
top hill see fire wood 0.5 1
emerg evacu happen build across street 0.0 1
afraid tornado come area -0.6 1


In [17]:
# Distribution of the rounded sentiment polarity of the tweets.
# The figure gets its own name so the keyword histogram stored in
# `fig_keyword` is not overwritten.
fig_sentiment = px.histogram(train, x="sentiment_value",
                             title="Distribution of tweet sentiment polarity.")

fig_sentiment.show()

## 7. Save the cleaned dataframe.

In [None]:
# Write the train dataframe, with its added columns, to a CSV file next to
# the input files; the index is left out.
cleaned_path = main_path + "cleaned_train.csv"
train.to_csv(cleaned_path, index=False)