## Importing main libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import os

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Loading training dataset and viewing dataset

In [22]:
train_path = os.path.join('..','data','raw','train.csv')
test_path = os.path.join('..','data','raw','test.csv')

In [3]:
train = pd.read_csv(train_path)
test = pd.read_csv(train_path)

In [4]:
train = train.drop(['keyword', 'location', 'id'], axis=1)
test = test.drop(['keyword', 'location', 'id'], axis=1)

In [23]:
def load_datasets():
    train_path = os.path.join('..','data','raw','train.csv')
    test_path = os.path.join('..','data','raw','test.csv')
    train = pd.read_csv(train_path)
    test = pd.read_csv(train_path)
    train = train.drop(['keyword', 'location', 'id'], axis=1)
    test = test.drop(['keyword', 'location', 'id'], axis=1)
    return train, test

In [8]:
train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
test.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
def RemoveStopWords(texto):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palavras = [i for i in texto.split() if not i in stopwords]
    return (" ".join(palavras))

In [12]:
train['text'] = [RemoveStopWords(i) for i in train['text']]

# Aplicando no teste
test['text'] = [RemoveStopWords(i) for i in test['text']]

train[['text']][:10]

Unnamed: 0,text
0,Our Deeds Reason #earthquake May ALLAH Forgive us
1,Forest fire near La Ronge Sask. Canada
2,All residents asked 'shelter place' notified o...
3,"13,000 people receive #wildfires evacuation or..."
4,Just got sent photo Ruby #Alaska smoke #wildfi...
5,#RockyFire Update => California Hwy. 20 closed...
6,#flood #disaster Heavy rain causes flash flood...
7,I'm top hill I see fire woods...
8,There's emergency evacuation happening buildin...
9,I'm afraid tornado coming area...


## Cleaning Text

In [13]:
def Remove_carac(text):
    text = text.str.lower() 
    text = text.str.replace(r"\#","") 
    text = text.str.replace(r"http\S+","")  
    text = text.str.replace(r"@","")
    text = text.str.replace(r"[^a-zA-Z#]", " ")
    text = text.str.replace("\s{2,}", "")
    return text

In [16]:
train['clean_text'] = Remove_carac(train['text'])

# Aplicando no teste
test['clean_text'] = Remove_carac(test['text'])

train.head()

Unnamed: 0,text,target,clean_text
0,Our Deeds Reason #earthquake May ALLAH Forgive us,1,our deeds reason earthquake may allah forgive us
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge saskcanada
2,All residents asked 'shelter place' notified o...,1,all residents askedshelter placenotified offic...
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...
4,Just got sent photo Ruby #Alaska smoke #wildfi...,1,just got sent photo ruby alaska smoke wildfire...


## Lemmatisation

In [17]:
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

def Lemmatization(texto):
    palavras = []
    for w in texto.split():
        palavras.append(lem.lemmatize(w))
    return (" ".join(palavras))

In [19]:
train['clean_text'] = [Lemmatization(f) for f in train['clean_text']]

# Aplicando no teste
test['clean_text'] = [Lemmatization(f) for f in test['clean_text']]

train.head()

Unnamed: 0,text,target,clean_text
0,Our Deeds Reason #earthquake May ALLAH Forgive us,1,our deed reason earthquake may allah forgive u
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge saskcanada
2,All residents asked 'shelter place' notified o...,1,all resident askedshelter placenotified office...
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,Just got sent photo Ruby #Alaska smoke #wildfi...,1,just got sent photo ruby alaska smoke wildfire...


## Stemming

In [20]:
from nltk.stem.snowball import SnowballStemmer

def Stemming(texto):
    stemmer = SnowballStemmer(language='english')
    palavras = []
    for w in texto.split():
        palavras.append(stemmer.stem(w))
    return (" ".join(palavras))

In [21]:
train['clean_text'] = [Stemming(t) for t in train['clean_text']]

# Aplicando no teste
test['clean_text'] = [Stemming(t) for t in test['clean_text']]

train.head()

Unnamed: 0,text,target,clean_text
0,Our Deeds Reason #earthquake May ALLAH Forgive us,1,our deed reason earthquak may allah forgiv u
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong saskcanada
2,All residents asked 'shelter place' notified o...,1,all resid askedshelt placenotifi officersno ev...
3,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california
4,Just got sent photo Ruby #Alaska smoke #wildfi...,1,just got sent photo rubi alaska smoke wildfir ...


In [24]:
train, test = load_datasets()

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
import tqdm

ImportError: cannot import name 'SAVE_STATE_WARNING' from 'torch.optim.lr_scheduler' (/home/slooth/anaconda3/lib/python3.8/site-packages/torch/optim/lr_scheduler.py)

In [None]:
# using roBERTa
roberta_weights = 'roberta-base'
roberta_model = transformers.RobertaModel.from_pretrained(roberta_weights).to(device)
roberta_token = transformers.RobertaTokenizer.from_pretrained(roberta_weights)

## Word Embeddings

## Word2Verc