In [1]:
import gensim
from gensim import corpora
import multiprocessing
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import pkg_resources
import pandas as pd
import numpy as np

In [2]:
import pandas as pd
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.parsing.preprocessing import preprocess_string
import random
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [1045]:
df = pd.read_csv('tweets_processed.csv', usecols=['tweets', 'stems', 'clean_keyword', 'clean_location'])

In [1046]:
target_df = pd.read_csv('train.csv', usecols=['target'])
target = list(target_df.target)

In [1048]:
train_df, test_df = df.iloc[:7613, :], df.iloc[7613:, :] 
train_df['target'] = target
train_df.drop_duplicates(subset=['tweets'], keep='last', inplace=True)

In [1049]:
def kw_loc_column_fix(df):
    df['clean_keyword'] = df.clean_keyword.astype(str)
    df['clean_keyword'] = df.clean_keyword.str.lower()
    df['clean_keyword'] = df.clean_keyword.str.replace(r'\bnan\b', r'')
    df['clean_keyword'] = df.clean_keyword.str.replace(r' ', r'_')
    df['clean_keyword'] = df.clean_keyword.str.strip()
    df['clean_location'] = df.clean_location.astype(str)
    df['clean_location'] = df.clean_location.str.lower()
    df['clean_location'] = df.clean_location.str.strip()
    df['clean_location'] = df.clean_location.str.replace(r'\bnan\b', r'')
    df['clean_location'] = df.clean_location.str.replace(r' ', r'_')
    return df

In [1050]:
train_df = kw_loc_column_fix(train_df)
test_df = kw_loc_column_fix(test_df)

In [1051]:
keyword = pd.get_dummies(train_df.clean_keyword, prefix='keyword')
location = pd.get_dummies(train_df.clean_location, prefix='location')
train_df_secondary = pd.concat([keyword, location], axis=1)

keyword_test = pd.get_dummies(test_df.clean_keyword, prefix='keyword')
location_test = pd.get_dummies(test_df.clean_location, prefix='location')
test_df_secondary = pd.concat([keyword_test, location_test], axis=1)

## extract bigrams

In [362]:
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from nltk.corpus import stopwords

In [1042]:
texts = df.tweets.astype(str)

def get_documents(text):
    docs = [row.split() for row in text]
    return docs

doc = get_documents(texts)

def sentence_to_bi_grams(sentence):
    return ' '.join(phrases[sentence])

phrases = Phrases(doc, min_count=10, threshold=.5, connector_words=ENGLISH_CONNECTOR_WORDS)

frozen_phrases = phrases.freeze()

bigrams = []
for row in doc: 
    parsed_sentence = sentence_to_bi_grams(row)
    bigrams.append(parsed_sentence)
    
df['bigrams'] = bigrams

In [1043]:
bigrams

['our deeds are the reason of this earthquake may allah forgive us all',
 'forest fire near la ronge sask canada',
 'all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected',
 'people receive wildfires evacuation orders in california',
 'just_got sent this photo from ruby alaska as smoke from wildfires pours into a school',
 'rocky_fire update california highway closed in both directions due to lake county fire c afire wildfires',
 'flood disaster heavy_rain causes flash flooding of streets in manitou colorado springs areas',
 'i_am on top of the hill and i_can see a fire in the woods',
 'there_is an emergency evacuation happening now in the building across the street',
 'i_am afraid that the tornado is_coming to our area',
 'three people died from the heat_wave so_far',
 'haha south tampa is getting flooded hah wait a second i live in south tampa what am i gonna do what am i gonna do fuck flooding',
 'raining f

## prep text for vectorization

In [445]:
import re

import nltk
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if word not in stopword_list and len(word) > 1:
                tokens.append(word.lower())
    return tokens

train_tweets = train_df.stems.astype(str).apply(tokenize_text)
test_tweets = test_df.stems.apply(tokenize_text)

In [1052]:
print(len(train_df))
print(len(test_df))

6830
3263


In [1053]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word)>1:
                tokens.append(word.lower())
    return tokens

texts = train_df.tweets.astype(str).apply(tokenize_text)
texts

0       [our, deeds, are, the, reason, of, this, earth...
1           [forest, fire, near, la, ronge, sask, canada]
2       [all, residents, asked, to, shelter, in, place...
3       [people, receive, wildfires, evacuation, order...
4       [just, got, sent, this, photo, from, ruby, ala...
                              ...                        
7608    [two, giant, cranes, holding, bridge, collapse...
7609    [the, out, of, control, wild, fires, in, calif...
7610                       [utc, km, of, volcano, hawaii]
7611    [police, investigating, after, an, bike, colli...
7612    [the, latest, more, homes, razed, by, northern...
Name: tweets, Length: 6830, dtype: object

In [1055]:
X_train, X_test, y_train, y_test = train_test_split(texts, train_df.target, test_size=0.1, random_state=42)

In [1056]:
from collections import Counter
from statistics import mean

word_dict = Counter(X_train.sum())
# count number of words in corpus
num_words = sum(word_dict[w] for w in word_dict)
print(f'There are {num_words} words in the training dataset')

# count number of unique words in corpus
word_count_sorted = [(value, key) for key, value in word_dict.items()]
word_count_sorted.sort()
vocab_size = len(word_count_sorted)
print(f'There are {vocab_size} unique words in the training dataset')
print("max len of training tweets",max([len(x) for x in X_train]))
print("max len of test tweets",max([len(x) for x in X_test]))
print("min len of training tweets",min([len(x) for x in X_train]))
print("min len of test tweets",min([len(x) for x in X_test]))
print("mean len of training tweets", mean([len(x) for x in X_train]))
print("mean len of test tweets",mean([len(x) for x in X_test]))

There are 82565 words in the training dataset
There are 12860 unique words in the training dataset
max len of training tweets 30
max len of test tweets 31
min len of training tweets 1
min len of test tweets 1
mean len of training tweets 13.431755327802179
mean len of test tweets 13.1303074670571


In [1059]:
def get_embedding(word, model, vector_size):
    if word in model.wv.vocab:
        return model[x]
    else: 
        return np.zeros(vector_size)

def get_average_vector(model, list_of_tweets):    
    vec_dicts = {}
    for tweet in list_of_tweets:
        average_vector = (np.mean(np.array([get_embedding(tweet, model)]), axis=0))
        v_dict = {tweet : (average_vector)}
        vec_dicts.update(v_dict)
        return vec_dicts

In [778]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np     
import plotly

In [779]:
def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=42)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly



## uninitialized skipgram

In [780]:
cores = multiprocessing.cpu_count()

In [1081]:
# uninitializeed model
model_ui_sg = Word2Vec(
        vector_size=50,
        sg=1,
        min_count=4,
        window=2,
        workers=cores)
vocab = model_ui_sg.build_vocab(X_train)

model_ui_sg.train(X_train, total_examples=len(X_train), total_words=num_words, compute_loss=True, epochs=5)

(278755, 412825)

In [1082]:
model_ui_sg.wv.most_similar(positive="earthquake")

[('officers', 0.9969140291213989),
 ('river', 0.9964267611503601),
 ('inside', 0.9963862895965576),
 ('israeli', 0.9963551163673401),
 ('office', 0.9962903261184692),
 ('level', 0.996052622795105),
 ('aug', 0.9960083365440369),
 ('carrying', 0.9959856867790222),
 ('south', 0.9959832429885864),
 ('syrian', 0.9959648847579956)]

In [1083]:
x_vals, y_vals, labels = reduce_dimensions(model_ui_sg)
plot_function(x_vals, y_vals, labels)

## self initialized skipgram

In [1084]:
# uninitializeed model
model_si_sg = Word2Vec(
        sentences=X_train,
        vector_size=50,
        sg=0,
        min_count=4,
        window=2,
        workers=cores)
vocab = model_si_sg.build_vocab(X_train)

model_si_sg.train(X_train, total_examples=len(X_train), total_words=num_words, compute_loss=True, epochs=10)

(557278, 825650)

In [1085]:
model_si_sg.wv.most_similar(positive="earthquake")

[('red', 0.9972033500671387),
 ('washington', 0.9970664978027344),
 ('vehicle', 0.9969689249992371),
 ('seismic', 0.9969536066055298),
 ('lake', 0.9967263340950012),
 ('flag', 0.9966813325881958),
 ('east', 0.9966281652450562),
 ('security', 0.9966222643852234),
 ('experts', 0.9966040253639221),
 ('abc', 0.9965810775756836)]

In [1086]:
x_vals, y_vals, labels = reduce_dimensions(model_si_sg)
plot_function(x_vals, y_vals, labels)

# cbow

In [1087]:
# uninitializeed model
model_ui_bow = Word2Vec(
        vector_size=50,
        sg=0,
        negative=5,
        ns_exponent=0.3,
        cbow_mean=1,
        #sample=1e-3, 
        min_count=4,
        window=2, 
        workers=cores)
vocab = model_ui_bow.build_vocab(X_train)

model_ui_bow.train(X_train, total_examples=model_ui_bow.corpus_count, epochs=10)

(556971, 825650)

In [1088]:
model_ui_bow.wv.most_similar(positive="wildfire")

[('northern', 0.9974344372749329),
 ('into', 0.99722820520401),
 ('times', 0.9972079992294312),
 ('via', 0.9971858859062195),
 ('world', 0.9970776438713074),
 ('accident', 0.9969075918197632),
 ('homes', 0.9968956112861633),
 ('eyewitness', 0.9968301653862),
 ('police', 0.9967935085296631),
 ('top', 0.9967880845069885)]

In [1089]:
model_ui_bow.get_latest_training_loss()

0.0

In [1090]:
x_vals, y_vals, labels = reduce_dimensions(model_ui_bow)
plot_function(x_vals, y_vals, labels)

In [546]:
##### model.save("w2v_ui.model")
model = Word2Vec.load("w2v_ui.model")
word_vectors = model_ui.wv
word_vectors.save("w2v_ui.wordvectors")

NameError: name 'model_ui' is not defined

In [834]:
# self initialized model
model_si_bow = Word2Vec(
        X_train,
        vector_size=50, 
        #sample=1e-5,
        min_count=5,
        window=2, 
        workers=cores)
vocab = model_si_bow.build_vocab(X_train)

model_si_bow.train(X_train,  total_examples=len(X_train), total_words=num_words, epochs=10)

(305552, 455080)

In [835]:
model_si_bow.wv.most_similar(positive="wildfire")

[('fires', 0.9944382309913635),
 ('northern', 0.9940565228462219),
 ('mariana', 0.9935768842697144),
 ('abc', 0.9934718012809753),
 ('into', 0.9931449294090271),
 ('declaration', 0.9930047988891602),
 ('read', 0.992928683757782),
 ('automatic', 0.9927445650100708),
 ('repatriated', 0.99263596534729),
 ('county', 0.9925776720046997)]

In [836]:
x_vals, y_vals, labels = reduce_dimensions(model_si_bow)
plot_function(x_vals, y_vals, labels)

## pretrained word_vectors model

In [1126]:
from nltk.tokenize import word_tokenize
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [1132]:

# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.100d.txt', encoding="utf8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [1133]:
token = Tokenizer()
token.fit_on_texts(texts)

X = token.texts_to_sequences(texts)
print(X[:5])

[[99, 4021, 14, 1, 707, 4, 17, 238, 138, 1671, 4022, 61, 36], [171, 39, 182, 1259, 6007, 6008, 894], [36, 1499, 1260, 2, 1672, 3, 579, 14, 123, 6009, 19, 1500, 38, 298, 228, 58, 1672, 3, 579, 1173, 14, 937], [52, 4023, 1084, 228, 1173, 3, 86], [28, 73, 1005, 17, 268, 24, 4024, 1261, 34, 208, 24, 1084, 6010, 67, 161]]


In [1159]:
c = token.texts_to_matrix(texts, mode='tfidf')

In [1164]:
c[0].shape

(13618,)

In [1161]:
token.word_index

{'the': 1,
 'to': 2,
 'in': 3,
 'of': 4,
 'is': 5,
 'and': 6,
 'you': 7,
 'it': 8,
 'not': 9,
 'on': 10,
 'for': 11,
 'my': 12,
 'that': 13,
 'are': 14,
 'with': 15,
 'at': 16,
 'this': 17,
 'have': 18,
 'by': 19,
 'be': 20,
 'was': 21,
 'am': 22,
 'do': 23,
 'from': 24,
 'out': 25,
 'like': 26,
 'will': 27,
 'just': 28,
 'but': 29,
 'me': 30,
 'so': 31,
 'your': 32,
 'up': 33,
 'as': 34,
 'we': 35,
 'all': 36,
 'can': 37,
 'no': 38,
 'fire': 39,
 'has': 40,
 'when': 41,
 'an': 42,
 'what': 43,
 'they': 44,
 'he': 45,
 'if': 46,
 'new': 47,
 'now': 48,
 'news': 49,
 'after': 50,
 'get': 51,
 'people': 52,
 'one': 53,
 'about': 54,
 'there': 55,
 'how': 56,
 'via': 57,
 'or': 58,
 'more': 59,
 'would': 60,
 'us': 61,
 'been': 62,
 'over': 63,
 'emergency': 64,
 'video': 65,
 'who': 66,
 'into': 67,
 'police': 68,
 'because': 69,
 'her': 70,
 'time': 71,
 'his': 72,
 'got': 73,
 'had': 74,
 'disaster': 75,
 'back': 76,
 'day': 77,
 'some': 78,
 'body': 79,
 'know': 80,
 'storm': 81,
 'of

In [1143]:
vocab_size = len(token.word_index)
embeddings_dict = token.word_index

In [1171]:
len(c)

6830

In [1136]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size+1, 100))

for word, i in token.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [1144]:
def find_closest_embeddings(embedding):
    return sorted(embeddings_index.keys(), key=lambda word: spatial.distance.euclidean(embeddings_index[word], embedding))

In [1145]:
find_closest_embeddings(embeddings_index["earthquake"])

['earthquake',
 'quake',
 'tsunami',
 'temblor',
 'magnitude',
 'disaster',
 'aftershock',
 'earthquakes',
 'tremor',
 'quakes',
 'tsunamis',
 'jolted',
 'devastated',
 'devastating',
 'epicenter',
 'aftershocks',
 'disasters',
 'sumatra',
 'devastation',
 'richter',
 'floods',
 'struck',
 'tremors',
 'katrina',
 'flood',
 'flooding',
 'storm',
 'damage',
 'undersea',
 'measuring',
 'mudslide',
 'cyclone',
 'rocked',
 'aftermath',
 'jolts',
 'rattled',
 'magnitude-6',
 'catastrophe',
 'affected',
 'sichuan',
 'scale',
 'stricken',
 'explosion',
 'typhoon',
 'jolt',
 'calamity',
 'triggered',
 'epicentre',
 'blast',
 'ravaged',
 'wenchuan',
 'tragedy',
 'catastrophic',
 'shook',
 'storms',
 'occurred',
 'morakot',
 '7.6',
 'waves',
 'seismologists',
 '6.8',
 'seismic',
 '7.2',
 '7.2-magnitude',
 'hurricane',
 'magnitude-5',
 'tidal',
 'magnitude-7',
 'damaged',
 'tornado',
 '6.2',
 'spill',
 'temblors',
 'massive',
 'tangshan',
 '7.6-magnitude',
 'rainstorm',
 'landslides',
 'rattles',


In [1147]:

tsne = TSNE(n_components=2, random_state=0)
words =  list(embeddings_dict.values())
vectors = [embeddings_dict[word] for word in words]
Y = tsne.fit_transform(vectors[:1000])
plt.scatter(Y[:, 0], Y[:, 1])

for label, x, y in zip(words, Y[:, 0], Y[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords="offset points")
plt.show()

KeyError: 1

In [1140]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [1141]:
# pretrained model
model_pt = Word2Vec(glove_vectors,
                     window=3,
                     vector_size=50,
                     alpha=0.1, 
                     min_alpha=0.001, 
                     workers=cores-1)
vocab = model_pt.build_vocab(texts)

model_pt.train(X_train, total_examples=len(X_train), epochs=10)

NameError: name 'glove_vectors' is not defined

In [None]:
model_pt.wv.most_similar(positive="earthquake")

pretrained_w2v_model.save("w2v_pt.model")
model_pt = Word2Vec.load("w2v_pt.model")
word_vectors_pt = model_pt.wv
word_vectors_pt.save("w2v_pt_wv.wordvectors")

In [None]:
model_pt.train(train_X, total_examples=pretrained_w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
wv = KeyedVectors.load("w2v_pt_wv.wordvectors", mmap='r')

In [None]:
wv

In [1166]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

In [1167]:

# the models that you want to compare
models = {

    'clf_knn': KNeighborsClassifier(),
    'clf_svm': SVC(),
    #'clf_xgb': XGBClassifier(),
    #'clf_ada': AdaBoostClassifier(),
 
}

# the optimisation parameters for each of the above models
params = {
#     'clf_randforest':{ 
#         'n_jobs': [-1],
#         'max_features': ['sqrt'],
#         'criterion': ['entropy'],
#         'penalty': [10, 25, 36, 64],
       
#          },
    
    'clf_knn': {
        'weights': [ 'distance'],
        'algorithm': ['ball_tree', 'kd_tree']
         },
    

    

    'clf_svm' : {
        'kernel': ['rbf', 'sigmoid'], 
     
        
         'C': [.01, .1, 1, 10, 100, 1000],
          },
    
#     'clf_xgb' : {
#         'objective': ['binary:logistic'],
#         'learning_rate': [0.001, 0.05, 0.1],
#         'alpha': [0.001, 0.3, 0.05]
#         },
        

                   
                 }

In [None]:
def fit_search(X_data, y_data):
        """
        fits the list of models to the training data, thereby obtaining in each 
        case an evaluation score after GridSearchCV cross-validation
        """
        for name in models.keys():
            est = models[name]
            est_params = params[name]
            gscv = GridSearchCV(estimator=est, param_grid=est_params, cv=10)
            gscv.fit(X_data, y_data)
            print("best parameters are: {}".format(gscv.best_estimator_))
            y_pred = gscv.predict(X_data)
            print(classification_report(y_data, y_pred))

In [None]:

import warnings
warnings.filterwarnings('ignore') 
fit_search(X_train, y_train)

In [None]:
logreg = LogisticRegression(n_jobs=1, C=1e3)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [None]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [None]:
tweets_df

## train, test = train_test_split(train_df, random_state=42, test_size=0.2)

In [1099]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from collections import Counter

In [1092]:
# verify shape of data
print('Test label shape:', np.shape(y_test))
print('Train label shape:', np.shape(y_train))

print('Test shape:', np.shape(X_test))
print('Train shape:', np.shape(X_train))

Test label shape: (683,)
Train label shape: (6147,)
Test shape: (683,)
Train shape: (6147,)


In [1120]:
from nltk.corpus import stopwords
def tokenize_text_sw(text):
    tokens = []
    stopword_list =  list(stopwords.words('english'))
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word)>1 :
                tokens.append(word.lower())
    return tokens

texts = train_df.tweets.astype(str).apply(tokenize_text)
texts

0       [our, deeds, are, the, reason, of, this, earth...
1           [forest, fire, near, la, ronge, sask, canada]
2       [all, residents, asked, to, shelter, in, place...
3       [people, receive, wildfires, evacuation, order...
4       [just, got, sent, this, photo, from, ruby, ala...
                              ...                        
7608    [two, giant, cranes, holding, bridge, collapse...
7609    [the, out, of, control, wild, fires, in, calif...
7610                       [utc, km, of, volcano, hawaii]
7611    [police, investigating, after, an, bike, colli...
7612    [the, latest, more, homes, razed, by, northern...
Name: tweets, Length: 6830, dtype: object

In [1121]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

In [1125]:
train_X = str(texts)
train_y = X_test
# convert target data with labelencoder
train_y= train_y.astype('category')
le = LabelEncoder()   
y = np.array(train_y)

le.fit_transform(y)
y = y.ravel()
# define independent variable
X = train_X
#X = X.reshape(-1,1)
# split data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                     test_size=.1, stratify=y)


TypeError: unhashable type: 'list'

In [1123]:
#(ngram_range=(1,2)
pipeline = Pipeline([ ('vect', CountVectorizer(ngram_range=(1,2))), 
                     ('tfidf', TfidfTransformer(use_idf=True))])
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)



counter = Counter(y_train)
print(counter)

#assign variable to np.shape tuple, width dimension
input_dimension = np.shape(X_train)
count_terms = input_dimension[1]
print(count_terms)

AttributeError: 'list' object has no attribute 'lower'

In [1151]:
vocab_size = 10000  # Only consider the top 20k words
num_tokens_per_example = 15  # Only consider the first 200 words of each movie review
X_train = keras.preprocessing.sequence.pad_sequences(
    X_train, maxlen=num_tokens_per_example
)
x_val = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=num_tokens_per_example) 

ValueError: invalid literal for int() with base 10: 'coming'

In [1150]:
from tensorflow import keras
