In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

# nltk.download('all')
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def RemoveStopWords(texto):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palavras = [i for i in texto.split() if not i in stopwords]
    return (" ".join(palavras))

def clean_text(text):
    text = text.str.lower() 
    text = text.str.replace(r"\#","") 
    text = text.str.replace(r"http\S+","")  
    text = text.str.replace(r"@","")
    text = text.str.replace(r"[^a-zA-Z#]", " ")
    text = text.str.replace("\s{2,}", "")
    return text

def preprocess(text, stopwords=stopwords.words('english')):
    """Conditional preprocessing on our text unique to our task."""
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    text = pattern.sub('', text)

    # Remove words in paranthesis
    text = re.sub(r'\([^)]*\)', '', text)

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric chars
    text = re.sub(' +', ' ', text)  # remove multiple spaces
    text = text.strip()

    return text

def tokenize_tweet(x):
    tokeniser = TreebankWordTokenizer()
    tokens = tokeniser.tokenize(x)
    return tokens

def load_datasets():
    train_path = os.path.join('..','data','raw','train.csv')
    test_path = os.path.join('..','data','raw','test.csv')
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    train = train.drop(['keyword', 'location', 'id'], axis=1)
    test = test.drop(['keyword', 'location', 'id'], axis=1)
        
    train['text_clean'] = [RemoveStopWords(i) for i in train['text']]
    train['text_clean'] = clean_text(train['text_clean'])
    train['text_token'] = [tokenize_tweet(i) for i in train['text_clean']]
    train = train[['text','text_clean','text_token','target']]
    
    test['text_clean'] = [RemoveStopWords(i) for i in test['text']]
    test['text_clean'] = clean_text(test['text_clean'])
    test['text_token'] = [tokenize_tweet(i) for i in test['text_clean']]
    test = test[['text','text_clean','text_token']]
    
    return train, test

In [3]:
train, test = load_datasets()

In [4]:
print(train.shape)
train.head()

(7613, 4)


Unnamed: 0,text,text_clean,text_token,target
0,Our Deeds are the Reason of this #earthquake M...,our deeds reason earthquake may allah forgive us,"[our, deeds, reason, earthquake, may, allah, f...",1
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge saskcanada,"[forest, fire, near, la, ronge, saskcanada]",1
2,All residents asked to 'shelter in place' are ...,all residents askedshelter placenotified offic...,"[all, residents, askedshelter, placenotified, ...",1
3,"13,000 people receive #wildfires evacuation or...",people receive wildfires evacuation orders cal...,"[people, receive, wildfires, evacuation, order...",1
4,Just got sent this photo from Ruby #Alaska as ...,just got sent photo ruby alaska smoke wildfire...,"[just, got, sent, photo, ruby, alaska, smoke, ...",1


In [5]:
test.head()

Unnamed: 0,text,text_clean,text_token
0,Just happened a terrible car crash,just happened terrible car crash,"[just, happened, terrible, car, crash]"
1,"Heard about #earthquake is different cities, s...",heard earthquake different citiesstay safe eve...,"[heard, earthquake, different, citiesstay, saf..."
2,"there is a forest fire at spot pond, geese are...",forest fire spot pondgeese fleeing across stre...,"[forest, fire, spot, pondgeese, fleeing, acros..."
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lightingspokane wildfires,"[apocalypse, lightingspokane, wildfires]"
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor killschina taiwan,"[typhoon, soudelor, killschina, taiwan]"


TFIDF vector transformer with svm binary classifier

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
X = train.text_clean
y = train.target

td = TfidfVectorizer(max_features = train.shape[0])
X = td.fit_transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [42:03<00:00, 87.03s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BernoulliNB,0.79,0.77,0.77,0.78,3.27
RandomForestClassifier,0.77,0.76,0.76,0.77,112.62
NearestCentroid,0.77,0.75,0.75,0.77,3.01
ExtraTreesClassifier,0.75,0.74,0.74,0.75,237.44
BaggingClassifier,0.75,0.74,0.74,0.75,244.96
NuSVC,0.74,0.73,0.73,0.74,428.78
XGBClassifier,0.75,0.72,0.72,0.74,69.44
SVC,0.75,0.72,0.72,0.74,416.98
LGBMClassifier,0.73,0.71,0.71,0.72,5.26
DecisionTreeClassifier,0.71,0.7,0.7,0.71,158.51


In [8]:
# from sklearn.naive_bayes import BernoulliNB
# BernoulliNB??

In [9]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, make_scorer

def min_recall_precision(est, X, y_true, sample_weight=None):
    y_pred = est.predict(X)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    return min(recall, precision)

X = train.text_clean
y = train.target

td = TfidfVectorizer(max_features = train.shape[0])
X = td.fit_transform(X).toarray()


scaler = StandardScaler()

clf_gaus_standard = Pipeline([
        ("scale", scaler),
        ("model", BernoulliNB())
    ])

clf_gaus_pca = Pipeline([
        ("scale", scaler),
        ('reduce_dim', PCA()),
        ("model", BernoulliNB())
    ])

clf_gaus_lda = Pipeline([
        ("scale", scaler),
        ('reduce_dim', LDA()),
        ("model", BernoulliNB())
    ])

In [10]:
mod_standard = GridSearchCV(estimator=clf_gaus_standard,
                   param_grid={
                       'model__alpha':np.linspace(0.1,1,10),
                   },
                   scoring={'precision': make_scorer(precision_score), 
                             'recall': make_scorer(recall_score),
                             'min_both': min_recall_precision},
                    refit='min_both',
                    return_train_score=True,
                    cv=3,
                    n_jobs=-1)
mod_standard.fit(X, y);

pd.DataFrame(mod_standard.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,params,split0_test_precision,split1_test_precision,split2_test_precision,mean_test_precision,...,split1_test_min_both,split2_test_min_both,mean_test_min_both,std_test_min_both,rank_test_min_both,split0_train_min_both,split1_train_min_both,split2_train_min_both,mean_train_min_both,std_train_min_both
0,2.74,0.07,1.1,0.0,0.1,{'model__alpha': 0.1},0.67,0.62,0.64,0.64,...,0.61,0.64,0.63,0.01,3,0.84,0.83,0.85,0.84,0.01
1,2.68,0.04,1.14,0.06,0.2,{'model__alpha': 0.2},0.69,0.62,0.65,0.65,...,0.61,0.65,0.63,0.02,2,0.83,0.81,0.83,0.83,0.01
2,2.76,0.15,1.15,0.05,0.3,{'model__alpha': 0.30000000000000004},0.7,0.63,0.66,0.67,...,0.61,0.66,0.63,0.02,1,0.82,0.81,0.82,0.82,0.01
3,2.66,0.03,1.03,0.02,0.4,{'model__alpha': 0.4},0.72,0.64,0.67,0.68,...,0.6,0.67,0.63,0.03,4,0.81,0.8,0.82,0.81,0.01
4,2.69,0.05,1.04,0.04,0.5,{'model__alpha': 0.5},0.73,0.65,0.68,0.68,...,0.59,0.68,0.62,0.04,5,0.8,0.79,0.8,0.8,0.01
5,2.77,0.01,1.15,0.05,0.6,{'model__alpha': 0.6},0.74,0.65,0.69,0.69,...,0.57,0.69,0.62,0.05,6,0.79,0.78,0.8,0.79,0.01
6,2.89,0.08,1.21,0.07,0.7,{'model__alpha': 0.7000000000000001},0.74,0.66,0.7,0.7,...,0.56,0.7,0.61,0.06,7,0.78,0.78,0.79,0.78,0.0
7,2.99,0.03,1.12,0.03,0.8,{'model__alpha': 0.8},0.76,0.67,0.71,0.71,...,0.56,0.68,0.6,0.06,8,0.77,0.77,0.78,0.77,0.0
8,2.72,0.05,1.03,0.02,0.9,{'model__alpha': 0.9},0.76,0.68,0.71,0.72,...,0.55,0.68,0.59,0.06,9,0.76,0.75,0.77,0.76,0.01
9,2.22,0.38,0.85,0.17,1.0,{'model__alpha': 1.0},0.78,0.68,0.72,0.73,...,0.54,0.67,0.58,0.06,10,0.75,0.75,0.76,0.75,0.0


In [12]:
mod_pca = GridSearchCV(estimator=clf_gaus_pca,
                   param_grid={
                       'model__alpha':np.linspace(0.1,1,10),
                       'reduce_dim__n_components':np.arange(1,100)
                   },
                   scoring={'precision': make_scorer(precision_score), 
                             'recall': make_scorer(recall_score),
                             'min_both': min_recall_precision},
                    refit='min_both',
                    return_train_score=True,
                    cv=3,
                    n_jobs=-1)
mod_pca.fit(X, y);

pd.DataFrame(mod_pca.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_reduce_dim__n_components,params,split0_test_precision,split1_test_precision,split2_test_precision,...,split1_test_min_both,split2_test_min_both,mean_test_min_both,std_test_min_both,rank_test_min_both,split0_train_min_both,split1_train_min_both,split2_train_min_both,mean_train_min_both,std_train_min_both
0,4.82,0.01,0.81,0.02,0.10,1,"{'model__alpha': 0.1, 'reduce_dim__n_component...",0.00,0.42,0.00,...,0.08,0.00,0.03,0.04,982,0.00,0.11,0.00,0.04,0.05
1,4.74,0.09,0.88,0.01,0.10,2,"{'model__alpha': 0.1, 'reduce_dim__n_component...",0.00,0.50,0.51,...,0.50,0.16,0.22,0.21,940,0.00,0.54,0.18,0.24,0.22
2,5.15,0.16,0.94,0.05,0.10,3,"{'model__alpha': 0.1, 'reduce_dim__n_component...",0.51,0.50,0.00,...,0.26,0.00,0.13,0.11,965,0.18,0.33,0.00,0.17,0.14
3,5.50,0.08,1.00,0.02,0.10,4,"{'model__alpha': 0.1, 'reduce_dim__n_component...",0.50,0.56,0.46,...,0.20,0.08,0.18,0.07,951,0.26,0.27,0.10,0.21,0.08
4,4.83,0.05,0.82,0.03,0.10,5,"{'model__alpha': 0.1, 'reduce_dim__n_component...",0.52,0.48,0.54,...,0.26,0.28,0.24,0.05,936,0.21,0.32,0.22,0.25,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,10.85,0.05,1.40,0.09,1.00,95,"{'model__alpha': 1.0, 'reduce_dim__n_component...",0.59,0.53,0.56,...,0.53,0.56,0.56,0.02,51,0.59,0.63,0.58,0.60,0.02
986,10.61,0.24,1.16,0.21,1.00,96,"{'model__alpha': 1.0, 'reduce_dim__n_component...",0.56,0.53,0.58,...,0.53,0.58,0.56,0.02,41,0.60,0.63,0.58,0.60,0.02
987,10.56,1.10,1.15,0.07,1.00,97,"{'model__alpha': 1.0, 'reduce_dim__n_component...",0.59,0.55,0.56,...,0.55,0.56,0.57,0.02,6,0.60,0.64,0.59,0.61,0.02
988,12.45,0.30,1.29,0.03,1.00,98,"{'model__alpha': 1.0, 'reduce_dim__n_component...",0.58,0.53,0.56,...,0.53,0.56,0.55,0.01,167,0.59,0.63,0.59,0.60,0.02


In [14]:
mod_lda = GridSearchCV(estimator=clf_gaus_lda,
                   param_grid={
                       'model__alpha':np.linspace(0.1,1,10),
                       'reduce_dim__n_components':[1]
                   },
                   scoring={'precision': make_scorer(precision_score), 
                             'recall': make_scorer(recall_score),
                             'min_both': min_recall_precision},
                    refit='min_both',
                    return_train_score=True,
                    cv=3,
                    n_jobs=-1)
mod_lda.fit(X, y);

pd.DataFrame(mod_lda.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_reduce_dim__n_components,params,split0_test_precision,split1_test_precision,split2_test_precision,...,split1_test_min_both,split2_test_min_both,mean_test_min_both,std_test_min_both,rank_test_min_both,split0_train_min_both,split1_train_min_both,split2_train_min_both,mean_train_min_both,std_train_min_both
0,360.31,5.09,1.1,0.17,0.1,1,"{'model__alpha': 0.1, 'reduce_dim__n_component...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0
1,334.96,22.21,0.94,0.1,0.2,1,"{'model__alpha': 0.2, 'reduce_dim__n_component...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0
2,314.03,18.39,1.01,0.18,0.3,1,"{'model__alpha': 0.30000000000000004, 'reduce_...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0
3,290.38,0.17,0.91,0.03,0.4,1,"{'model__alpha': 0.4, 'reduce_dim__n_component...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0
4,286.49,1.02,0.88,0.04,0.5,1,"{'model__alpha': 0.5, 'reduce_dim__n_component...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0
5,304.23,12.84,1.13,0.19,0.6,1,"{'model__alpha': 0.6, 'reduce_dim__n_component...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0
6,311.53,9.5,1.11,0.11,0.7,1,"{'model__alpha': 0.7000000000000001, 'reduce_d...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0
7,293.05,0.56,1.27,0.2,0.8,1,"{'model__alpha': 0.8, 'reduce_dim__n_component...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0
8,307.39,0.85,1.07,0.08,0.9,1,"{'model__alpha': 0.9, 'reduce_dim__n_component...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0
9,220.47,59.04,0.69,0.25,1.0,1,"{'model__alpha': 1.0, 'reduce_dim__n_component...",0.44,0.45,0.44,...,0.45,0.44,0.44,0.01,1,0.97,0.98,0.98,0.98,0.0


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

def classifier_test(df, feature, target, classifier):
    print("_"*54)
    print(f'Classifier: {classifier}')
    X = df[f'{feature}']
    y = df[f'{target}']

    td = TfidfVectorizer(max_features = df.shape[0])
    X = td.fit_transform(X).toarray()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    clf = classifier
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print(classification_report(y_test, pred))
    print("_"*54)

In [None]:
classifier_test(train, 'text_clean', 'target', GaussianNB())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer  = TfidfVectorizer(min_df=1,use_idf= True, stop_words = 'english')
dtm = vectorizer.fit_transform(train['text_clean'])

pd.DataFrame(dtm.toarray(),index=train,columns=vectorizer.get_feature_names()).head(5)

In [None]:
# Fit LSA. Use algorithm = “randomized” for large datasets
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(2, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

pd.DataFrame(lsa.components_,index = ["component_1","component_2"],columns =
vectorizer.get_feature_names())

In [None]:
pd.DataFrame(dtm_lsa, index = train, columns = ["component_1","component_2"]).head(5)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC 

from sklearn.metrics import classification_report

X = train.text_clean
y = train.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = text_clf = Pipeline(
    [
        ('tfidf', TfidfTransformer()),
        ('standardscaler', StandardScaler()),
        ('svc', SVC(gamma='auto'))
    ]
)

text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
classification_report(y_test, predicted)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
corpus = train.text_clean.to_list()

vectorizer = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_transformer = vectorizer.fit_transform(corpus)
print(tfidf_transformer.shape)

target = train.target.values.reshape(-1,1)
target

In [None]:
%%time
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

In [None]:
X_train