# MICROSOFT LEARN LOCATION MENTION RECOGNITION CHALLENGE

## Checking Working directory

In [18]:
import os
print(os.getcwd())
#os.chdir('path_to_directory_where_file_is')

C:\Users\lamem\OneDrive\Documents\GHD\Microsoft-Learn-Location-Mention-Recognition-Challenge


### Installing dependencies

In [19]:
import sys

In [20]:
print(f"Python version: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")

conda_env = os.getenv('CONDA_DEFAULT_ENV')
print(f"Active Conda environment: {conda_env}")

Python version: 3.9.19
Active Conda environment: mllmrc


In [21]:
pip install contractions nltk textblob fasttext-wheel --q

Note: you may need to restart the kernel to use updated packages.


## Import library

In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import fasttext
import re
from bs4 import BeautifulSoup
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from textblob import TextBlob
import string
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lamem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lamem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lamem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lamem\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lamem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Load Data

In [23]:
df = pd.read_csv("data/Train_1.csv")
test = pd.read_csv("data/Test.csv")
sup = test.copy()
df.head()

Unnamed: 0,tweet_id,text,location
0,ID_1001136212718088192,,EllicottCity
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",Maryland
2,ID_1001136950345109504,State of emergency declared for Maryland flood...,Maryland
3,ID_1001137334056833024,Other parts of Maryland also saw significant d...,Baltimore Maryland
4,ID_1001138374923579392,"Catastrophic Flooding Slams Ellicott City, Mar...",Ellicott City Maryland


In [24]:
test.head()

Unnamed: 0,tweet_id,text
0,ID_1001154804658286592,What is happening to the infrastructure in New...
1,ID_1001155505459486720,SOLDER MISSING IN FLOOD.. PRAY FOR EDDISON HER...
2,ID_1001155756371136512,RT @TIME: Police searching for missing person ...
3,ID_1001159445194399744,Flash Flood Tears Through Maryland Town For Se...
4,ID_1001164907587538944,Ellicott City #FLOODING Pictures: Maryland Gov...


In [25]:
df["text"][2] , df["location"][2]

('State of emergency declared for Maryland flooding:  via @YouTube',
 'Maryland')

In [26]:
df["text"][4] , df["location"][4]

('Catastrophic Flooding Slams Ellicott City, Maryland; Water Rescues Reported - The Weather Channel  via @GoogleNews',
 'Ellicott City Maryland')

In [27]:
df.isnull().sum()

tweet_id        0
text        56624
location    29612
dtype: int64

In [28]:
df.dropna(inplace=True)

In [29]:
df.head()

Unnamed: 0,tweet_id,text,location
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",Maryland
2,ID_1001136950345109504,State of emergency declared for Maryland flood...,Maryland
3,ID_1001137334056833024,Other parts of Maryland also saw significant d...,Baltimore Maryland
4,ID_1001138374923579392,"Catastrophic Flooding Slams Ellicott City, Mar...",Ellicott City Maryland
5,ID_1001138377717157888,WATCH: 1 missing after flash #FLOODING devasta...,Ellicott City Maryland


In [30]:
df["location"].value_counts()

location
Mexico                               582
California                           544
Ecuador                              539
Nebraska                             495
Haiti                                424
                                    ... 
Florida Mexico Nebraska Texas          1
Chimanimani- Zimbabwe                  1
Mozambique South African Zimbabwe      1
Manicaland Mutare                      1
Israels Mexico City Thailand           1
Name: count, Length: 3448, dtype: int64

In [31]:
class TextPreprocessor:
    def __init__(self, custom_stopwords=None):
        self.stop_words = set(stopwords.words('english'))
        if custom_stopwords:
            self.stop_words.update(custom_stopwords)
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    def clean_text(self, text):
        if not isinstance(text, str):
            text = str(text)
        text = BeautifulSoup(text, "html.parser").get_text()
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        text = contractions.fix(text)
        return text

    def remove_urls(self, text):
        return re.sub(r'http\S+|www.\S+', '', text)

    def normalize_text(self, text):
        return ''.join([c.lower() for c in text if c not in string.punctuation])

    def tokenize_text(self, text):
        return word_tokenize(text)

    def remove_stopwords(self, words):
        return [word for word in words if word not in self.stop_words]

    def lemmatize_words(self, words):
        return [self.lemmatizer.lemmatize(word) for word in words]

    def stem_words(self, words):
        return [self.stemmer.stem(word) for word in words]

    def correct_spelling(self, text):
        return str(TextBlob(text).correct())

    def remove_non_ascii(self, words):
        return [word for word in words if word.isascii()]

    def generate_ngrams(self, words, n=2):
        return [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    def preprocess_text(self, text, use_stemming=False, use_spelling_correction=False, generate_ngrams=False):
        if pd.isna(text):
            return ""
        text = self.clean_text(text)
        text = self.remove_urls(text)
        text = self.normalize_text(text)
        words = self.tokenize_text(text)
        words = self.remove_stopwords(words)
        if use_stemming:
            words = self.stem_words(words)
        else:
            words = self.lemmatize_wods(words)
        words = self.remove_non_ascii(words)
        cleaned_text = " ".join(words)
        if use_spelling_correction:
            cleaned_text = self.correct_spelling(cleaned_text)
        if generate_ngrams:
            ngrams = self.generate_ngrams(words)
            cleaned_text += " " + " ".join(ngrams)
        return cleaned_text

    def preprocess_dataframe(self, df, text_column, **kwargs):
        df['cleaned_text'] = df[text_column].apply(lambda x: self.preprocess_text(x, **kwargs))
        return df

In [32]:
preprocessor = TextPreprocessor(custom_stopwords=['custom', 'words'])
df = preprocessor.preprocess_dataframe(df, 'text', use_stemming=True, use_spelling_correction=False, generate_ngrams=False)

  text = BeautifulSoup(text, "html.parser").get_text()


In [33]:
preprocessor = TextPreprocessor(custom_stopwords=['custom', 'words'])
test = preprocessor.preprocess_dataframe(test, 'text', use_stemming=True, use_spelling_correction=False, generate_ngrams=False)

  text = BeautifulSoup(text, "html.parser").get_text()


In [34]:
df["text"][2] , df["cleaned_text"][2]

('State of emergency declared for Maryland flooding:  via @YouTube',
 'state emerg declar maryland flood via youtub')

In [35]:
df["location"].value_counts()

location
Mexico                               582
California                           544
Ecuador                              539
Nebraska                             495
Haiti                                424
                                    ... 
Florida Mexico Nebraska Texas          1
Chimanimani- Zimbabwe                  1
Mozambique South African Zimbabwe      1
Manicaland Mutare                      1
Israels Mexico City Thailand           1
Name: count, Length: 3448, dtype: int64

## Create label for locations

In [36]:
df["location"] = "__label__" + df["location"].astype(str)
df.head()

Unnamed: 0,tweet_id,text,location,cleaned_text
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",__label__Maryland,flash flood struck maryland citi sunday wash s...
2,ID_1001136950345109504,State of emergency declared for Maryland flood...,__label__Maryland,state emerg declar maryland flood via youtub
3,ID_1001137334056833024,Other parts of Maryland also saw significant d...,__label__Baltimore Maryland,part maryland also saw signific damag sunday s...
4,ID_1001138374923579392,"Catastrophic Flooding Slams Ellicott City, Mar...",__label__Ellicott City Maryland,catastroph flood slam ellicott citi maryland w...
5,ID_1001138377717157888,WATCH: 1 missing after flash #FLOODING devasta...,__label__Ellicott City Maryland,watch miss flash flood devast ellicott citi ma...


## Combine Label Location with Text

In [37]:
df["location_description"] = df["location"] + " " + df["cleaned_text"]
df.head()

Unnamed: 0,tweet_id,text,location,cleaned_text,location_description
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",__label__Maryland,flash flood struck maryland citi sunday wash s...,__label__Maryland flash flood struck maryland ...
2,ID_1001136950345109504,State of emergency declared for Maryland flood...,__label__Maryland,state emerg declar maryland flood via youtub,__label__Maryland state emerg declar maryland ...
3,ID_1001137334056833024,Other parts of Maryland also saw significant d...,__label__Baltimore Maryland,part maryland also saw signific damag sunday s...,__label__Baltimore Maryland part maryland also...
4,ID_1001138374923579392,"Catastrophic Flooding Slams Ellicott City, Mar...",__label__Ellicott City Maryland,catastroph flood slam ellicott citi maryland w...,__label__Ellicott City Maryland catastroph flo...
5,ID_1001138377717157888,WATCH: 1 missing after flash #FLOODING devasta...,__label__Ellicott City Maryland,watch miss flash flood devast ellicott citi ma...,__label__Ellicott City Maryland watch miss fla...


## Clean Combined

In [38]:
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

df["location_description"] = df["location_description"].apply(preprocess)
df.head()

Unnamed: 0,tweet_id,text,location,cleaned_text,location_description
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",__label__Maryland,flash flood struck maryland citi sunday wash s...,__label__maryland flash flood struck maryland ...
2,ID_1001136950345109504,State of emergency declared for Maryland flood...,__label__Maryland,state emerg declar maryland flood via youtub,__label__maryland state emerg declar maryland ...
3,ID_1001137334056833024,Other parts of Maryland also saw significant d...,__label__Baltimore Maryland,part maryland also saw signific damag sunday s...,__label__baltimore maryland part maryland also...
4,ID_1001138374923579392,"Catastrophic Flooding Slams Ellicott City, Mar...",__label__Ellicott City Maryland,catastroph flood slam ellicott citi maryland w...,__label__ellicott city maryland catastroph flo...
5,ID_1001138377717157888,WATCH: 1 missing after flash #FLOODING devasta...,__label__Ellicott City Maryland,watch miss flash flood devast ellicott citi ma...,__label__ellicott city maryland watch miss fla...


In [39]:
df["location_description"][2] , df["location"][2]

('__label__maryland state emerg declar maryland flood via youtub',
 '__label__Maryland')

## Split and Save

In [40]:
train, val = train_test_split(df, test_size=0.2)

In [41]:
train.to_csv("data/ecommerce.train", columns=["location_description"], index=False, header=False)
val.to_csv("data/ecommerce.test", columns=["location_description"], index=False, header=False)

## Train fasttext model

In [42]:
model = fasttext.train_supervised(input="data/ecommerce.train", lr=1.0, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='hs')

In [43]:
model.test("data/ecommerce.test")

(2211, 0.7105382180009046, 0.7105382180009046)

In [44]:
test.head()

Unnamed: 0,tweet_id,text,cleaned_text
0,ID_1001154804658286592,What is happening to the infrastructure in New...,happen infrastructur new england global warm m...
1,ID_1001155505459486720,SOLDER MISSING IN FLOOD.. PRAY FOR EDDISON HER...,solder miss flood pray eddison hermond pray el...
2,ID_1001155756371136512,RT @TIME: Police searching for missing person ...,rt time polic search miss person devast year f...
3,ID_1001159445194399744,Flash Flood Tears Through Maryland Town For Se...,flash flood tear maryland town second time two...
4,ID_1001164907587538944,Ellicott City #FLOODING Pictures: Maryland Gov...,ellicott citi flood pictur maryland governor d...


## Make Prediction

In [45]:
prediction = []
for text in test["cleaned_text"]:
    prediction.append(model.predict(text))

In [46]:
locations = [item[0][0].split('__')[-1] for item in prediction]

## Submission

In [61]:
test['prediction'] = locations
#test["prediction"][1000] = "Georgia_Florida_Alabama_North_Carolina_South_Carolina_Alaska_Hawaii_California_Iowa_Nebraska_Missouri_Kansas"
test[["tweet_id", "prediction"]].to_csv("data/submission.csv", index=False)
test["prediction"] = test["prediction"].apply(lambda x: 'A' if x == '' else x)

In [62]:
test[test["tweet_id"] == "ID_1167512265576374272"]

Unnamed: 0,tweet_id,text,cleaned_text,prediction
1170,ID_1167512265576374272,Fuel up your vehicles. Bear in mind that peopl...,fuel vehicl bear mind peopl alreadi shortag re...,A


In [52]:
test.shape , len(locations)

((2942, 4), 2942)