In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

# nltk.download('all')
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def RemoveStopWords(texto):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palavras = [i for i in texto.split() if not i in stopwords]
    return (" ".join(palavras))

def clean_text(text):
    text = text.str.lower() 
    text = text.str.replace(r"\#","") 
    text = text.str.replace(r"http\S+","")  
    text = text.str.replace(r"@","")
    text = text.str.replace(r"[^a-zA-Z#]", " ")
    text = text.str.replace("\s{2,}", "")
    return text

def preprocess(text, stopwords=stopwords.words('english')):
    """Conditional preprocessing on our text unique to our task."""
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    text = pattern.sub('', text)

    # Remove words in paranthesis
    text = re.sub(r'\([^)]*\)', '', text)

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric chars
    text = re.sub(' +', ' ', text)  # remove multiple spaces
    text = text.strip()

    return text

def tokenize_tweet(x):
    tokeniser = TreebankWordTokenizer()
    tokens = tokeniser.tokenize(x)
    return tokens

def load_datasets():
    train_path = os.path.join('..','data','raw','train.csv')
    test_path = os.path.join('..','data','raw','test.csv')
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    train = train.drop(['keyword', 'location', 'id'], axis=1)
    test = test.drop(['keyword', 'location', 'id'], axis=1)
        
    train['text_clean'] = [RemoveStopWords(i) for i in train['text']]
    train['text_clean'] = clean_text(train['text_clean'])
    train['text_token'] = [tokenize_tweet(i) for i in train['text_clean']]
    train = train[['text','text_clean','text_token','target']]
    
    test['text_clean'] = [RemoveStopWords(i) for i in test['text']]
    test['text_clean'] = clean_text(test['text_clean'])
    test['text_token'] = [tokenize_tweet(i) for i in test['text_clean']]
    test = test[['text','text_clean','text_token']]
    
    return train, test

In [3]:
train, test = load_datasets()

In [4]:
print(train.shape)
train.head()

(7613, 4)


Unnamed: 0,text,text_clean,text_token,target
0,Our Deeds are the Reason of this #earthquake M...,our deeds reason earthquake may allah forgive us,"[our, deeds, reason, earthquake, may, allah, f...",1
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge saskcanada,"[forest, fire, near, la, ronge, saskcanada]",1
2,All residents asked to 'shelter in place' are ...,all residents askedshelter placenotified offic...,"[all, residents, askedshelter, placenotified, ...",1
3,"13,000 people receive #wildfires evacuation or...",people receive wildfires evacuation orders cal...,"[people, receive, wildfires, evacuation, order...",1
4,Just got sent this photo from Ruby #Alaska as ...,just got sent photo ruby alaska smoke wildfire...,"[just, got, sent, photo, ruby, alaska, smoke, ...",1


In [5]:
test.head()

Unnamed: 0,text,text_clean,text_token
0,Just happened a terrible car crash,just happened terrible car crash,"[just, happened, terrible, car, crash]"
1,"Heard about #earthquake is different cities, s...",heard earthquake different citiesstay safe eve...,"[heard, earthquake, different, citiesstay, saf..."
2,"there is a forest fire at spot pond, geese are...",forest fire spot pondgeese fleeing across stre...,"[forest, fire, spot, pondgeese, fleeing, acros..."
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lightingspokane wildfires,"[apocalypse, lightingspokane, wildfires]"
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor killschina taiwan,"[typhoon, soudelor, killschina, taiwan]"


TFIDF vector transformer

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC 

from sklearn.metrics import classification_report

X = train.text_clean
y = train.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = text_clf = Pipeline(
    [
        ('tfidf', TfidfTransformer()),
        ('standardscaler', StandardScaler()),
        ('svc', SVC(gamma='auto'))
    ]
)

text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
classification_report(y_test, predicted)

ValueError: could not convert string to float: 'photopostapocalypticflimflamprodding around rubble'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
corpus = train.text_clean.to_list()

vectorizer = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_transformer = vectorizer.fit_transform(corpus)
print(tfidf_transformer.shape)

target = train.target.values.reshape(-1,1)
target

In [None]:
%%time
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

In [None]:
X_train