In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics 
import re
import nltk
from collections import Counter
import gensim
import heapq
from operator import itemgetter
from multiprocessing import Pool
from collections import Counter
from nltk.tokenize import RegexpTokenizer,word_tokenize
from sklearn.ensemble import GradientBoostingClassifier
from nltk.stem import  SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC
from collections import defaultdict  # For word frequency


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [22]:
# Load the train and test CSV files from the local data/ folder.
train_df, test_df = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

In [3]:
# Count the missing values in each training column.
train_df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
# Keep an independent copy of the test frame; a later cell overwrites
# test_df['text'] with its cleaned version.
# NOTE(review): df_raw is not referenced again in the visible cells.
df_raw = test_df.copy()

In [5]:
def remove_puncts(x):
    """Remove every character that is not an ASCII letter, a digit or whitespace.

    Args:
        x: The string to clean.

    Returns:
        ``x`` with all other characters deleted (nothing is inserted in
        their place).
    """
    # The upper-case range must be A-Z. The previous A-z range also spans the
    # ASCII characters between 'Z' and 'a' ( [ \ ] ^ _ and the backtick ), so
    # those punctuation characters were never removed.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', x)

In [6]:
def stem_text(text):
    """Tokenize ``text`` and return the stemmed tokens joined by single spaces.

    Tokens come from ``ToktokTokenizer`` and are whitespace-stripped before
    being passed to the English ``SnowballStemmer``.
    """
    snowball = SnowballStemmer('english')
    words = ToktokTokenizer().tokenize(text)
    return ' '.join(snowball.stem(word.strip()) for word in words)

In [7]:
def lemma_text(text):
    """Tokenize ``text`` and return the lemmatized tokens joined by single spaces.

    Tokens come from ``ToktokTokenizer`` and are whitespace-stripped before
    being passed to ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    wordnet = WordNetLemmatizer()
    words = ToktokTokenizer().tokenize(text)
    return ' '.join(wordnet.lemmatize(word.strip()) for word in words)

In [8]:
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is tokenized with ``ToktokTokenizer``; a token is discarded when
    its lower-cased form appears in the module-level ``stopword_list``.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Snapshot the list as a set once per call so each token costs a hash
    # lookup instead of a linear scan; later edits to stopword_list are
    # still honoured because the set is rebuilt on every call.
    stopwords = set(stopword_list)
    tokenizer = ToktokTokenizer()
    tokens = (token.strip() for token in tokenizer.tokenize(text))
    return ' '.join(token for token in tokens if token.lower() not in stopwords)

In [9]:
# Compiled once at definition time instead of on every call.
# The original class contained the single range U+24C2-U+1F251, which spans
# most of the Basic Multilingual Plane (CJK, Hangul, ...) and therefore
# deleted ordinary non-emoji text. It is replaced by the specific blocks below.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Delete emoji and pictographic symbols from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of characters from the code point ranges in
        ``_EMOJI_PATTERN`` removed. Characters outside those ranges, including
        non-Latin scripts, are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [10]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match) deleted."""
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', text)

In [11]:
def remove_URL(text):
    """Return ``text`` with ``http(s)://...`` and ``www....`` links deleted.

    A link runs up to the next whitespace character; nothing is inserted in
    its place.
    """
    link_pattern = r'https?://\S+|www\.\S+'
    return re.sub(link_pattern, r'', text)

In [23]:
def clean_sentence(x):
    """Normalise one tweet by running it through the cleaning helpers in order.

    The text is lower-cased, then passed through stop-word removal,
    lemmatization, stemming, emoji removal, URL removal and punctuation
    removal. ``remove_html`` is defined in this notebook but is not applied here.
    """
    pipeline = (
        remove_stopwords,
        lemma_text,
        stem_text,
        remove_emoji,
        remove_URL,
        remove_puncts,
    )
    cleaned = x.lower()
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [21]:
def cross_val_score(model,train,y):
    """Return the mean F1 score of ``model`` over 5 cross-validation folds.

    Delegates to ``sklearn.model_selection.cross_val_score`` with
    ``cv=5`` and ``scoring="f1"``; the labels are read from ``y['target']``.
    """
    labels = y['target']
    fold_scores = model_selection.cross_val_score(model, train, labels, scoring="f1", cv=5)
    return fold_scores.mean()
#scores = model_selection.cross_val_score(gnb, train_tfidf, train_df["target"], cv=5, scoring="f1")

In [24]:
# Replace the raw tweet text in both frames with its cleaned form.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(clean_sentence)

In [25]:
# Store the character length of each (cleaned) tweet as a new column.
for frame in (train_df, test_df):
    frame['tweet_length'] = frame['text'].apply(len)

In [26]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)), with
# min_df=2 and max_df=0.5 as document-frequency cut-offs.
# The vectoriser is fitted on the training text only and then reused to
# transform the test text; .toarray() turns both results into dense arrays,
# which is what GaussianNB is fitted on in a later cell.
# NOTE(review): the name `tfidf` is rebound to a gensim TfidfModel further
# down, so this cell must be re-run before `tfidf.transform` is used again.
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(train_df['text']).toarray()
test_tfidf = tfidf.transform(test_df["text"]).toarray()

In [27]:
# Count features over word 1- to 3-grams, stored as float32. Accents are
# stripped ('unicode'), any run of one or more word characters is a token,
# and min_df=3 is the document-frequency cut-off.
cnt_vectorizer = CountVectorizer(dtype=np.float32,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),min_df=3)


# The vocabulary is fitted on the train and test text together so that
# n-grams from both sets are included.
# NOTE(review): fitting on test text lets test data shape the features;
# confirm this is intended.
cnt_vectorizer.fit(list(train_df.text.values) + list(test_df.text.values))

# Dense count matrices for each split.
# NOTE(review): neither array is used in the visible cells.
xtrain_cntv =  cnt_vectorizer.transform(train_df.text.values).toarray() 
xtest_cntv = cnt_vectorizer.transform(test_df.text.values).toarray()

In [28]:
# TF-IDF features + Gaussian Naive Bayes: fit on the training matrix, then
# predict labels for the test matrix.
y_train = train_df.target.values
gnb = GaussianNB()
gnb.fit(train_tfidf, y_train)
y_pred_gnb = gnb.predict(test_tfidf)

In [30]:
# Display the cleaned training text column.
train_df['text']

'forest fire near la rong sask canada'

In [31]:
def to_list(x):
    """Split a cleaned sentence into its tokens.

    Args:
        x: A whitespace-separated string.

    Returns:
        The list of non-empty tokens; an empty or all-whitespace string
        gives ``[]``.
    """
    # split() with no argument collapses runs of whitespace. The previous
    # split(' ') produced '' tokens wherever the cleaning steps left two
    # adjacent spaces (e.g. after a URL or emoji was deleted), and those
    # empty strings then entered the vocabulary as a word.
    return x.split()

In [33]:
# Tokenise every cleaned tweet and store the token lists as a new column.
train_df['text_tokenized'] = [to_list(sentence) for sentence in train_df['text']]
test_df['text_tokenized'] = [to_list(sentence) for sentence in test_df['text']]

In [35]:
# Build the gensim id <-> token dictionary from the tokenised training tweets.
dictionary = gensim.corpora.Dictionary(train_df['text_tokenized'])

# Preview the first eleven (id, token) pairs.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

0 allah
1 deed
2 earthquak
3 forgiv
4 may
5 reason
6 u
7 canada
8 fire
9 forest
10 la


In [36]:
# Prune the dictionary in place (the return value is not used): no_below=15
# and no_above=0.5 are the document-frequency bounds and keep_n=100000 caps
# the vocabulary size.
# NOTE(review): because this mutates `dictionary`, re-running the cell alone
# filters an already-filtered vocabulary.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [38]:
# Convert every tokenised training tweet into its bag-of-words
# (token_id, count) representation, then show one example document.
bow_corpus = list(map(dictionary.doc2bow, train_df['text_tokenized']))
bow_corpus[4310]

[(112, 1),
 (221, 1),
 (280, 1),
 (345, 1),
 (433, 1),
 (485, 1),
 (540, 1),
 (683, 1),
 (930, 1)]

In [41]:
# Spell out the example document: token id, the token itself and its count.
bow_doc_4310 = bow_corpus[4310]
for token_id, occurrences in bow_doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     occurrences))

Word 112 ("want") appears 1 time.
Word 221 ("even") appears 1 time.
Word 280 ("think") appears 1 time.
Word 345 ("lead") appears 1 time.
Word 433 ("mention") appears 1 time.
Word 485 ("let") appears 1 time.
Word 540 ("anyth") appears 1 time.
Word 683 ("dont") appears 1 time.
Word 930 ("hellfir") appears 1 time.


In [42]:
from gensim import corpora, models

# Fit a gensim TF-IDF model on the bag-of-words corpus.
# NOTE(review): this rebinds `tfidf`, which an earlier cell bound to the
# sklearn TfidfVectorizer; cells that expect the vectoriser must be re-run
# after this one.
tfidf = models.TfidfModel(bow_corpus)

# Indexing the model with the corpus gives the TF-IDF weighted documents.
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint

# Show only the first weighted document as a sanity check.
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5403152530305637),
 (1, 0.47005197173401536),
 (2, 0.5770325409061505),
 (3, 0.39261179038189137)]


In [118]:
# Train a 10-topic LDA model on the TF-IDF weighted corpus (2 passes, 4 worker
# processes). random_state seeds the model's random initialisation so that
# re-runs start from the same state, matching the fixed seed used for t-SNE
# later in the notebook.
# NOTE(review): with several workers the result may still differ slightly
# between runs — confirm whether workers=1 is needed for exact repeatability.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=0,
)

# Print the top words of every topic (-1 = all topics).
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.010*"fire" + 0.009*"death" + 0.009*"first" + 0.008*"smoke" + 0.006*"help" + 0.006*"road" + 0.006*"wound" + 0.006*"crash" + 0.005*"hiroshima" + 0.005*"get"
Topic: 1 Word: 0.009*"train" + 0.009*"derail" + 0.007*"emerg" + 0.007*"new" + 0.006*"danger" + 0.006*"wildfir" + 0.006*"annihil" + 0.006*"flood" + 0.006*"servic" + 0.006*"collid"
Topic: 2 Word: 0.009*"emerg" + 0.008*"get" + 0.008*"storm" + 0.007*"man" + 0.007*"via" + 0.006*"fatal" + 0.006*"woman" + 0.006*"flood" + 0.006*"famili" + 0.006*"cross"
Topic: 3 Word: 0.019*"like" + 0.014*"124" + 0.009*"wreck" + 0.009*"look" + 0.008*"bomb" + 0.007*"new" + 0.007*"kill" + 0.007*"deton" + 0.007*"amp" + 0.007*"feel"
Topic: 4 Word: 0.009*"crush" + 0.008*"murder" + 0.008*"come" + 0.007*"see" + 0.007*"mass" + 0.007*"hostag" + 0.007*"love" + 0.007*"think" + 0.007*"plan" + 0.006*"good"
Topic: 5 Word: 0.010*"amp" + 0.008*"accid" + 0.007*"thunder" + 0.007*"collis" + 0.007*"know" + 0.007*"injur" + 0.006*"2" + 0.006*"two" + 0.006*"make" +

In [60]:
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

In [119]:
# Per-document topic distribution: one list of (topic_id, probability) pairs
# for every training document.
# NOTE(review): the model was trained on corpus_tfidf but is queried here with
# the raw bag-of-words corpus — confirm this is intended.
topic_dist = [lda_model_tfidf.get_document_topics(doc) for doc in bow_corpus]

In [120]:
# Build one dense weight row per document, indexed by topic id.
# get_document_topics() yields (topic_id, probability) pairs and leaves out
# topics whose probability is below the model's minimum-probability cut-off.
# Appending the probabilities positionally (as before) therefore shifted the
# remaining weights into the wrong columns whenever a topic was omitted;
# writing each probability at its own topic id keeps the columns aligned and
# gives every row the same length.
n_lda_topics = lda_model_tfidf.num_topics
topic_weights = []
for doc_topics in topic_dist:
    weights = [0.0] * n_lda_topics
    for topic_id, prob in doc_topics:
        weights[topic_id] = prob
    topic_weights.append(weights)

In [121]:
# Stack the per-document weights into a 2-D array, filling any missing
# entries with 0.
arr = pd.DataFrame(topic_weights).fillna(0).to_numpy()

In [122]:
# Display the document-by-topic weight matrix.
arr

array([[0.02000371, 0.02000345, 0.02000219, ..., 0.02000839, 0.02000265,
        0.02000266],
       [0.81995118, 0.02000515, 0.02000247, ..., 0.02000701, 0.0200025 ,
        0.0200119 ],
       [0.01428891, 0.01429275, 0.63854665, ..., 0.01429121, 0.01429023,
        0.01428781],
       ...,
       [0.02000294, 0.02000275, 0.02000216, ..., 0.02000147, 0.0200026 ,
        0.02000354],
       [0.01250585, 0.88745302, 0.01250445, ..., 0.01250341, 0.01250393,
        0.01250706],
       [0.01111256, 0.89998645, 0.01111288, ..., 0.01111196, 0.01111266,
        0.01111197]])

In [123]:
# Target labels of the training tweets as a plain list.
# NOTE(review): actual_topics is not used in the visible cells.
actual_topics= list(train_df['target'])

In [124]:
import matplotlib.colors as mcolors
# Optional filter (disabled): keep only documents whose strongest topic
# weight exceeds 0.35.
# arr = arr[np.amax(arr, axis=1) > 0.35]
# Dominant topic of each document = column index of its largest weight.
topic_num = np.argmax(arr, axis=1)
# Project the topic-weight vectors to 2-D with t-SNE; random_state=0 fixes
# the seed and init='pca' selects PCA initialisation.
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 7613 samples in 0.064s...
[t-SNE] Computed neighbors for 7613 samples in 0.693s...
[t-SNE] Computed conditional probabilities for sample 1000 / 7613
[t-SNE] Computed conditional probabilities for sample 2000 / 7613
[t-SNE] Computed conditional probabilities for sample 3000 / 7613
[t-SNE] Computed conditional probabilities for sample 4000 / 7613
[t-SNE] Computed conditional probabilities for sample 5000 / 7613
[t-SNE] Computed conditional probabilities for sample 6000 / 7613
[t-SNE] Computed conditional probabilities for sample 7000 / 7613
[t-SNE] Computed conditional probabilities for sample 7613 / 7613
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 58.724106
[t-SNE] KL divergence after 1000 iterations: 0.424108


In [127]:
# Render bokeh output inline in the notebook.
output_notebook()
# Take the topic count from the trained model. The previous hard-coded 5
# contradicted the 10 topics the LDA model is trained with, so the plot title
# was wrong.
n_topics = lda_model_tfidf.num_topics
# One Tableau colour per topic id; topic_num indexes into this array.
mycolors = np.array(list(mcolors.TABLEAU_COLORS.values()))
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
              plot_width=900, plot_height=700)
# Each point is one document, coloured by its dominant topic.
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)