In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics 
import re
import nltk
from collections import Counter
import gensim
import heapq
from operator import itemgetter
from multiprocessing import Pool
from collections import Counter
from nltk.tokenize import RegexpTokenizer,word_tokenize
from sklearn.ensemble import GradientBoostingClassifier
from nltk.stem import  SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC
from collections import defaultdict  # For word frequency


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [3]:
train_df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
df_raw = test_df.copy()

In [5]:
def remove_puncts(x):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', x)
    return text

In [6]:
def stem_text(text):
    tokenizer = ToktokTokenizer()
    stemmer = SnowballStemmer('english')
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

In [7]:
def lemma_text(text):
    tokenizer = ToktokTokenizer()
    lemmatizer = WordNetLemmatizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [8]:
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [9]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [10]:
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

In [11]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

In [12]:
def clean_sentence(x):
    x = x.lower()
    x = remove_stopwords(x)
    x = lemma_text(x)
    x= stem_text(x)
    x= remove_emoji(x)
    x= remove_URL(x)
    return x

In [13]:
def cross_val_score(model,train,y):
    scores=model_selection.cross_val_score(model,train,y['target'], cv=5, scoring="f1")
    return scores.mean()
#scores = model_selection.cross_val_score(gnb, train_tfidf, train_df["target"], cv=5, scoring="f1")

In [14]:
train_df['text']=train_df['text'].apply(clean_sentence)
test_df['text']=test_df['text'].apply(clean_sentence)

In [15]:
train_df['tweet_length'] = train_df['text'].apply(lambda x: len(x))
test_df['tweet_length'] = test_df['text'].apply(lambda x: len(x))

In [16]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(train_df['text']).toarray()
test_tfidf = tfidf.transform(test_df["text"]).toarray()

In [17]:
cnt_vectorizer = CountVectorizer(dtype=np.float32,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),min_df=3)


# we fit count vectorizer to get ngrams from both train and test data.
cnt_vectorizer.fit(list(train_df.text.values) + list(test_df.text.values))

xtrain_cntv =  cnt_vectorizer.transform(train_df.text.values).toarray() 
xtest_cntv = cnt_vectorizer.transform(test_df.text.values).toarray()

In [18]:
#using Tf-idf+Naive Bayes
y_train = train_df.target.values
gnb=GaussianNB()
y_pred_gnb = gnb.fit(train_tfidf,y_train).predict(test_tfidf)

In [19]:
#using naives_bayes+countvectorizer
y_train = train_df.target.values
gnb_cntv=GaussianNB()
y_pred_gnb_cntv = gnb.fit(xtrain_cntv,y_train).predict(xtest_cntv)

In [20]:
#cross validation score for naives_bayes+tf-idf
cross_val_score(gnb,train_tfidf,train_df)

0.5923275291458686

In [21]:
#cross validation score for naives_bayes+countvectorizer
cross_val_score(gnb_cntv,xtrain_cntv,train_df)

0.5793943698511692

In [22]:
lg= linear_model.LogisticRegression(solver='lbfgs',max_iter=300)
lg.fit(xtrain_cntv,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
predictions=lg.predict(xtest_cntv)

In [24]:
cross_val_score(lg,xtrain_cntv,train_df)

0.5972225641431612

In [25]:
lg.fit(train_tfidf,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
cross_val_score(lg,train_tfidf,train_df)

0.5671710047296203

In [27]:
final_predict=lg.predict(test_tfidf)

In [28]:
from nltk.tokenize import word_tokenize
train_df['tokens']=train_df['text'].apply(lambda x: word_tokenize(x))
test_df['tokens'] = test_df['text'].apply(lambda x: word_tokenize(x))

In [29]:
# # defining parameter range 
# param_grid = {'C': [0.1, 1, 10, 100, 1000],  
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
#               'kernel': ['rbf']}  
  
# grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
  
# # fitting the model for grid search 
# grid.fit(xtrain_cntv, y_train)

In [30]:
def fn_pre_process_data(doc):
    for rec in doc:
        yield gensim.utils.simple_preprocess(rec)

corpus = list(fn_pre_process_data(train_df['text']))
corpus += list(fn_pre_process_data(test_df['text']))

In [31]:
corpus[1]

['forest', 'fire', 'near', 'la', 'rong', 'sask', 'canada']

In [32]:
from gensim.models import Word2Vec

print('initiated ...')
wv_model = Word2Vec(corpus,size=300,window=3,min_count=2)
wv_model.train(corpus,total_examples=len(corpus),epochs=10)

initiated ...


(876661, 997010)

In [33]:
# summarize vocabulary
words = list(wv_model.wv.vocab)
print(words)

# save model
wv_model.save('model.bin')
# load model
wv_model = Word2Vec.load('model.bin')
print(wv_model)

['deed', 'reason', 'earthquak', 'may', 'allah', 'forgiv', 'forest', 'fire', 'near', 'la', 'canada', 'resid', 'ask', 'shelter', 'place', 'evacu', 'order', 'expect', 'peopl', 'receiv', 'wildfir', 'california', 'got', 'sent', 'photo', 'rubi', 'alaska', 'smoke', 'pour', 'school', 'rockyfir', 'updat', 'hwy', 'close', 'direct', 'due', 'lake', 'counti', 'cafir', 'flood', 'disast', 'heavi', 'rain', 'caus', 'flash', 'street', 'colorado', 'spring', 'area', 'top', 'hill', 'see', 'wood', 'emerg', 'happen', 'build', 'across', 'afraid', 'tornado', 'come', 'three', 'die', 'heat', 'wave', 'far', 'haha', 'south', 'tampa', 'get', 'hah', 'wait', 'second', 'live', 'gonna', 'florida', 'days', 'lost', 'count', 'bago', 'myanmar', 'we', 'arriv', 'damag', 'bus', 'multi', 'car', 'crash', 'break', 'man', 'love', 'fruit', 'summer', 'fast', 'ridicul', 'london', 'cool', 'ski', 'wonder', 'day', 'way', 'eat', 'shit', 'nyc', 'last', 'week', 'girlfriend', 'like', 'pasta', 'end', 'wholesal', 'market', 'ablaz', 'alway', 

In [34]:
wv_model.most_similar('ablaze')

  """Entry point for launching an IPython kernel.


[('pickup', 0.9945023655891418),
 ('tr', 0.9927799701690674),
 ('resid', 0.9926772117614746),
 ('cal', 0.9924595355987549),
 ('drunk', 0.9923288226127625),
 ('sat', 0.9923228621482849),
 ('who', 0.9921956062316895),
 ('tanker', 0.9920181035995483),
 ('langley', 0.9918699860572815),
 ('cheyenn', 0.9918508529663086)]

In [35]:
def get_word_embeddings(token_list,vector,k=300):
    if len(token_list) < 1:
        return np.zeros(k)
    else:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in token_list] 
    
    sum = np.sum(vectorized,axis=0)
    ## return the average
    return sum/len(vectorized)        
def get_embeddings(tokens,vector):
        embeddings = tokens.apply(lambda x: get_word_embeddings(x, wv_model))
        return list(embeddings)

train_embeddings = get_embeddings(train_df['tokens'],wv_model)
test_embeddings = get_embeddings(test_df['tokens'],wv_model)

  """
  """


In [36]:
lg= linear_model.LogisticRegression(solver='lbfgs',max_iter=300)
lg.fit(train_embeddings,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
predictions=lg.predict(test_embeddings)

In [38]:
cross_val_score(lg,train_embeddings,train_df)

0.6124780907711951

In [39]:
submission = pd.DataFrame({
        "id": df_raw["id"],
        "target":final_predict
    })

submission.to_csv('tweet_submission_file.csv', index=False)