In [1]:
import gensim
from gensim import corpora
import multiprocessing
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import pkg_resources
import pandas as pd
import numpy as np

In [2]:
import pandas as pd
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.parsing.preprocessing import preprocess_string
import random
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [6]:
df = pd.read_csv('tweets_processed.csv', usecols=['tokens', 'lem_', 'clean_keyword', 'clean_location'])

In [7]:
target_df = pd.read_csv('train.csv', usecols=['target'])
target = list(target_df.target)

In [8]:
train_df, test_df = df.iloc[:7613, :], df.iloc[7613:, :] 

In [9]:
def kw_loc_column_fix(df):
    df['clean_keyword'] = df.clean_keyword.astype(str)
    df['clean_keyword'] = df.clean_keyword.str.lower()
    df['clean_keyword'] = df.clean_keyword.str.replace(r'\bnan\b', r'')
    df['clean_keyword'] = df.clean_keyword.str.replace(r' ', r'_')
    df['clean_keyword'] = df.clean_keyword.str.strip()
    df['clean_location'] = df.clean_location.astype(str)
    df['clean_location'] = df.clean_location.str.lower()
    df['clean_location'] = df.clean_location.str.strip()
    df['clean_location'] = df.clean_location.str.replace(r'\bnan\b', r'')
    df['clean_location'] = df.clean_location.str.replace(r' ', r'_')
    return df

In [10]:
train_df = kw_loc_column_fix(train_df)
test_df = kw_loc_column_fix(test_df)

In [11]:
keyword = pd.get_dummies(train_df.clean_keyword, prefix='keyword')
location = pd.get_dummies(train_df.clean_location, prefix='location')
train_df_secondary = pd.concat([keyword, location], axis=1)

keyword_test = pd.get_dummies(test_df.clean_keyword, prefix='keyword')
location_test = pd.get_dummies(test_df.clean_location, prefix='location')
test_df_secondary = pd.concat([keyword_test, location_test], axis=1)

## prep text for vectorization

In [12]:
import re

import nltk
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if word not in stopword_list and len(word) > 1:
                tokens.append(word.lower())
    return tokens

train_tweets = train_df.stems.astype(str).apply(tokenize_text)
test_tweets = test_df.stems.apply(tokenize_text)

In [13]:
print(len(train_df))
print(len(test_df))

7613
3263


In [58]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            tokens.append(word.lower())
    return tokens

texts = train_df.lem_.astype(str).apply(tokenize_text)
texts

0       [deed, reason, earthquake, may, allah, forgive...
1           [forest, fire, near, la, ronge, sask, canada]
2       [resident, ask, shelter, place, notify, office...
3       [people, receive, wildfire, evacuation, order,...
4       [get, sent, photo, ruby, alaska, smoke, wildfi...
                              ...                        
7608    [two, giant, crane, hold, bridge, collapse, ne...
7609    [control, wild, fire, california, even, northe...
7610                           [utc, km, volcano, hawaii]
7611    [police, investigate, bike, collide, car, litt...
7612    [late, home, raze, northern, california, wildf...
Name: lem_, Length: 7613, dtype: object

In [59]:
def tokenize_tweet(text):
    sent_tokens = []
    for sent in nltk.sent_tokenize(text):
        sent_tokens.append(sent.lower())
    return sent_tokens

In [60]:
X_train, X_test, y_train, y_test = train_test_split(texts, target, test_size=0.1, random_state=42)

In [80]:
from collections import Counter

word_dict = Counter(X_train.sum())
# count number of words in corpus
num_words = sum(word_dict[w] for w in word_dict)
print(f'There are {num_words} words in the training dataset')

# count number of unique words in corpus
word_count_sorted = [(value, key) for key, value in word_dict.items()]
word_count_sorted.sort()
vocab_size = len(word_count_sorted)
print(f'There are {vocab_size} unique words in the training dataset')
print("max len of training tweets",max([len(x) for x in X_train]))
print("max len of test tweets",max([len(x) for x in X_test]))

There are 59560 words in the training dataset
There are 10835 unique words in the training dataset
max len of training tweets 24
max len of test tweets 18


In [81]:
def get_embedding(word, model, vector_size):
    if word in model.wv.vocab:
        return model[x]
    else: 
        return np.zeros(vector_size)

def get_average_vector(model, list_of_tweets):    
    vec_dicts = {}
    for tweet in list_of_tweets:
        average_vector = (np.mean(np.array([get_embedding(tweet, model)]), axis=0))
        v_dict = {tweet : (average_vector)}
        vec_dicts.update(v_dict)
        return vec_dicts

In [82]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np     
import plotly

In [83]:
def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=42)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly



## uninitialized skipgram

In [84]:
cores = multiprocessing.cpu_count()

In [154]:
# uninitializeed model
model_ui_sg = Word2Vec(
        vector_size=50,
        sg=1,
        #alpha=0.00075,
        #min_count=5,
        sample=1e-3, 
        window=4, 
        workers=cores)
vocab = model_ui_sg.build_vocab(X_train)

model_ui_sg.train(X_train, total_examples=len(X_train), total_words=num_words, compute_loss=True, epochs=60)

(2704258, 3573600)

In [155]:
model_ui_sg.wv.most_similar(positive="earthquake")

[('volcano', 0.6679810285568237),
 ('sismo', 0.6367193460464478),
 ('utc', 0.6236450672149658),
 ('usgs', 0.6133530735969543),
 ('km', 0.603156328201294),
 ('quake', 0.590907633304596),
 ('upgrade', 0.585986316204071),
 ('hawaii', 0.5852249264717102),
 ('utah', 0.5845253467559814),
 ('anchorage', 0.5789162516593933)]

In [156]:
x_vals, y_vals, labels = reduce_dimensions(model_ui_sg)
plot_function(x_vals, y_vals, labels)

## self initialized skipgram

In [157]:
# uninitializeed model
model_si_sg = Word2Vec(
        sentences=X_train,
        vector_size=50,
        sg=1,
        min_count=4,
        alpha=1e-8,
        window=3,
        workers=cores)
vocab = model_si_sg.build_vocab(X_train)

model_si_sg.train(X_train, total_examples=len(X_train), total_words=num_words, compute_loss=True, epochs=250)

(11790685, 14890000)

In [158]:
model_si_sg.wv.most_similar(positive="earthquake")

[('new', 0.9743199348449707),
 ('train', 0.9716335535049438),
 ('world', 0.9704384803771973),
 ('building', 0.9688072204589844),
 ('like', 0.9685285091400146),
 ('damage', 0.9678304195404053),
 ('watch', 0.9671491980552673),
 ('would', 0.9665688872337341),
 ('time', 0.9664174914360046),
 ('life', 0.9663577079772949)]

In [159]:
x_vals, y_vals, labels = reduce_dimensions(model_si_sg)
plot_function(x_vals, y_vals, labels)

### cbow

In [191]:
# uninitializeed model
model_ui_bow = Word2Vec(
        vector_size=50,
        sg=0,
        #cbow_mean=1,
        #alpha=1e-12,
        sample=1e-5, 
        window=3, 
        workers=cores)
vocab = model_ui_bow.build_vocab(X_train)

model_ui_bow.train(X_train, total_examples=len(X_train), total_words=num_words, epochs=300)

(2144958, 17868000)

In [192]:
model_ui_bow.wv.most_similar(positive="earthquake")

[('utc', 0.9905612468719482),
 ('volcano', 0.9877710938453674),
 ('quake', 0.9850942492485046),
 ('hawaii', 0.9794818162918091),
 ('anchorage', 0.9779492020606995),
 ('usgs', 0.9775909185409546),
 ('km', 0.9752105474472046),
 ('utah', 0.974640965461731),
 ('sismo', 0.951339602470398),
 ('sm', 0.9471018314361572)]

In [193]:
x_vals, y_vals, labels = reduce_dimensions(model_ui_bow)
plot_function(x_vals, y_vals, labels)

In [473]:
##### model.save("w2v_ui.model")
model = Word2Vec.load("w2v_ui.model")
word_vectors = model_ui.wv
word_vectors.save("w2v_ui.wordvectors")

NameError: name 'model_ui' is not defined

In [173]:
# self initialized model
model_si_bow = Word2Vec(
        X_train,
        vector_size=50,
        #sample=0, 
        alpha=1e-6,
        #min_count=3,
        window=3, 
        workers=cores)
vocab = model_si_bow.build_vocab(X_train)

model_si_bow.train(X_train,  total_examples=len(X_train), total_words=num_words, epochs=100)

(4507386, 5956000)

In [174]:
model_si_bow.wv.most_similar(positive="earthquake")

[('band', 0.4965288043022156),
 ('deck', 0.4835728704929352),
 ('minute', 0.4607042670249939),
 ('news', 0.45324498414993286),
 ('treatment', 0.45100733637809753),
 ('gate', 0.4404875636100769),
 ('girlfriend', 0.41776928305625916),
 ('damage', 0.4058554768562317),
 ('quickly', 0.4023994505405426),
 ('add', 0.3952181935310364)]

In [175]:
x_vals, y_vals, labels = reduce_dimensions(model_si_bow)
plot_function(x_vals, y_vals, labels)

## pretrained word_vectors model

In [495]:
from nltk.tokenize import word_tokenize
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [496]:
glove_file = datapath('test_glove.txt')
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)
 
model = KeyedVectors.load_word2vec_format(tmp_file)

In [497]:

# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.100d.txt',encoding="utf8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [547]:
token = Tokenizer()
token.fit_on_texts(texts)

X = token.texts_to_sequences(texts)
print(X[:5])

[[3637, 457, 231, 84, 1356, 2406, 18], [141, 3, 183, 786, 5275, 5276, 925], [1454, 424, 1611, 397, 5277, 336, 208, 1611, 397, 293, 425], [8, 2907, 100, 208, 293, 35], [1, 2407, 139, 3638, 1257, 212, 3639, 2408, 135]]


In [548]:
token.word_index

{'get': 1,
 'like': 2,
 'fire': 3,
 'go': 4,
 'new': 5,
 'news': 6,
 'via': 7,
 'people': 8,
 'one': 9,
 'say': 10,
 'time': 11,
 'bomb': 12,
 'make': 13,
 'year': 14,
 'video': 15,
 'would': 16,
 'kill': 17,
 'us': 18,
 'disaster': 19,
 'come': 20,
 'crash': 21,
 'burn': 22,
 'flood': 23,
 'emergency': 24,
 'body': 25,
 'attack': 26,
 'day': 27,
 'know': 28,
 'see': 29,
 'love': 30,
 'look': 31,
 'police': 32,
 'home': 33,
 'take': 34,
 'california': 35,
 'storm': 36,
 'family': 37,
 'building': 38,
 'back': 39,
 'still': 40,
 'world': 41,
 'think': 42,
 'watch': 43,
 'life': 44,
 'first': 45,
 'suicide': 46,
 'want': 47,
 'train': 48,
 'man': 49,
 'car': 50,
 'bag': 51,
 'death': 52,
 'loud': 53,
 'collapse': 54,
 'let': 55,
 'scream': 56,
 'nuclear': 57,
 'retweet': 58,
 'two': 59,
 'work': 60,
 'good': 61,
 'pm': 62,
 'today': 63,
 'war': 64,
 'need': 65,
 'accident': 66,
 'dead': 67,
 'live': 68,
 'fuck': 69,
 'old': 70,
 'hiroshima': 71,
 'plan': 72,
 'full': 73,
 'wreck': 74,
 '

In [551]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size+1, 100))

for word, i in token.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [552]:
def find_closest_embeddings(embedding):
    return sorted(embeddings_index.keys(), key=lambda word: spatial.distance.euclidean(embeddings_index[word], embedding))

In [553]:
find_closest_embeddings(embeddings_index["earthquake"])

['earthquake',
 'quake',
 'tsunami',
 'temblor',
 'magnitude',
 'disaster',
 'aftershock',
 'earthquakes',
 'tremor',
 'quakes',
 'tsunamis',
 'jolted',
 'devastated',
 'devastating',
 'epicenter',
 'aftershocks',
 'disasters',
 'sumatra',
 'devastation',
 'richter',
 'floods',
 'struck',
 'tremors',
 'katrina',
 'flood',
 'flooding',
 'storm',
 'damage',
 'undersea',
 'measuring',
 'mudslide',
 'cyclone',
 'rocked',
 'aftermath',
 'jolts',
 'rattled',
 'magnitude-6',
 'catastrophe',
 'affected',
 'sichuan',
 'scale',
 'stricken',
 'explosion',
 'typhoon',
 'jolt',
 'calamity',
 'triggered',
 'epicentre',
 'blast',
 'ravaged',
 'wenchuan',
 'tragedy',
 'catastrophic',
 'shook',
 'storms',
 'occurred',
 'morakot',
 '7.6',
 'waves',
 'seismologists',
 '6.8',
 'seismic',
 '7.2',
 '7.2-magnitude',
 'hurricane',
 'magnitude-5',
 'tidal',
 'magnitude-7',
 'damaged',
 'tornado',
 '6.2',
 'spill',
 'temblors',
 'massive',
 'tangshan',
 '7.6-magnitude',
 'rainstorm',
 'landslides',
 'rattles',


In [554]:

tsne = TSNE(n_components=2, random_state=0)
words =  list(embeddings_dict.keys())
vectors = [embeddings_dict[word] for word in words]
Y = tsne.fit_transform(vectors[:1000])
plt.scatter(Y[:, 0], Y[:, 1])

for label, x, y in zip(words, Y[:, 0], Y[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords="offset points")
plt.show()

NameError: name 'embeddings_dict' is not defined

In [None]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

In [555]:
# pretrained model
model_pt = Word2Vec(glove_vectors,
                     window=3,
                     vector_size=50,
                     alpha=0.1, 
                     min_alpha=0.001, 
                     workers=cores-1)
vocab = model_pt.build_vocab(texts)

model_pt.train(texts, total_examples=len(tweets_df), epochs=10)

NameError: name 'glove_vectors' is not defined

In [None]:
model_pt.wv.most_similar(positive="earthquake")

In [None]:
pretrained_w2v_model.save("w2v_pt.model")
model_pt = Word2Vec.load("w2v_pt.model")
word_vectors_pt = model_pt.wv
word_vectors_pt.save("w2v_pt_wv.wordvectors")

In [None]:
model_pt.train(train_X, total_examples=pretrained_w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
wv = KeyedVectors.load("w2v_pt_wv.wordvectors", mmap='r')

In [None]:
wv

In [None]:
cores = multiprocessing.cpu_count()

model_dbow = Doc2Vec(dm=0, vector_size=100, window=4, sample=1e-4, alpha=0.007, dbow_words=1, workers=cores)

model_dbow.build_vocab([x for x in train_tagged.values])

In [None]:
def plot_tweet(vector, line):
    fig, ax = plt.subplots(1,1, figsize=(12, 6))
    ax.tick_params(axis='both', 
                   which='both',
                   left=False, 
                   bottom=False,
                   top=False,
)
    ax.grid(False)
    print(line)
    ax.bar(range(len(vector)), vector, 0.5)

In [None]:
def show_tweet(index_number):
    line = tweets_df[tweets_df.index==index_number].tweets
    twt_vector = model_si[index_number]
    plot_tweet(twt_vector, line)

In [None]:
show_tweet(17)

In [None]:
show_tweet(58)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

In [None]:

# the models that you want to compare
models = {
    'clf_randforest': RandomForestClassifier(),
    'clf_knn': KNeighborsClassifier(),
    #'clf_svm': SVC(),
    #'clf_xgb': XGBClassifier(),
    #'clf_ada': AdaBoostClassifier(),
 
}

# the optimisation parameters for each of the above models
params = {
    'clf_randforest':{ 
        'n_jobs': [-1],
        'max_features': ['sqrt'],
        'criterion': ['entropy'],
        'penalty': [10, 25, 36, 64],
       
         },
    
    'clf_knn': {
        'weights': [ 'distance'],
        'algorithm': ['ball_tree', 'kd_tree']
         },
    

    

#     'clf_svm' : {
#         'kernel': ['rbf', 'sigmoid'], 
#         'degree' : [1, 2, 3, 4],
        
#         'C': [.01, .1, 1, 10, 100, 1000],
#          },
    
    'clf_xgb' : {
        'objective': ['binary:logistic'],
        'learning_rate': [0.001, 0.05, 0.1],
        'alpha': [0.001, 0.3, 0.05]
        },
        

                   
                 }

In [None]:
def fit_search(X_data, y_data):
        """
        fits the list of models to the training data, thereby obtaining in each 
        case an evaluation score after GridSearchCV cross-validation
        """
        for name in models.keys():
            est = models[name]
            est_params = params[name]
            gscv = GridSearchCV(estimator=est, param_grid=est_params, cv=10)
            gscv.fit(X_data, y_data)
            print("best parameters are: {}".format(gscv.best_estimator_))
            y_pred = gscv.predict(X_data)
            print(classification_report(y_data, y_pred))

In [None]:

import warnings
warnings.filterwarnings('ignore') 
fit_search(X_train, y_train)

In [None]:
logreg = LogisticRegression(n_jobs=1, C=1e3)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [None]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [None]:
tweets_df

## train, test = train_test_split(train_df, random_state=42, test_size=0.2)

In [None]:
cores = multiprocessing.cpu_count()
cores

In [None]:
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def tsnescatterplot(model, word, list_names):

    arrays = np.empty((0, 300), dtype='f')
    word_labels = [word]
    color_list  = ['red']

    # adds the vector of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)
    
    # gets list of most similar words
    close_words = model.wv.most_similar([word])
    
    # adds the vector for each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv.__getitem__([wrd_score[0]])
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)
    
    # adds the vector for each of the words from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv.__getitem__([wrd])
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)
        
    # Reduces the dimensionality from 300 to 50 dimensions with PCA
    reduc = PCA(n_components=50).fit_transform(arrays)
    
    # Finds t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)
    
    # Sets everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})
    
    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)
    
    # Basic plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']
                                 }
                    )
    
    # Adds annotations one by one with a loop
    for line in range(0, df.shape[0]):
         p1.text(df["x"][line],
                 df['y'][line],
                 '  ' + df["words"][line].title(),
                 horizontalalignment='left',
                 verticalalignment='bottom', size='medium',
                 color=df['color'][line],
                 weight='normal'
                ).set_size(15)

    
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)
            
    plt.title('t-SNE visualization for {}'.format(word.title()))


In [None]:
tsnescatterplot(w2v_model, 'fire', ['earthquake', 'california', 'africabaze'])

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [None]:

# verify shape of data
print('Test label shape:', np.shape(y_test))
print('Train label shape:', np.shape(y_train))

print('Test shape:', np.shape(X_test))
print('Train shape:', np.shape(X_train))

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

In [None]:
train_X = train_df.filter(['combined_tokens'], axis=1)
train_y = train_df.filter(['target'], axis=1)
# convert target data with labelencoder
train_y= train_y.astype('category')
le = LabelEncoder()   
y = np.array(train_y)

le.fit_transform(y)
y = y.ravel()
# define independent variable
X = train_X
#X = X.reshape(-1,1)
# split data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                     test_size=.2, stratify=y)

X_train, X_test, X_val, X_val = train_test_split(X, y, random_state=42,
                                                     test_size=.1, stratify=y)

In [None]:
#(ngram_range=(1,2)
pipeline = Pipeline([ ('vect', CountVectorizer(ngram_range=(1,2))), 
                     ('tfidf', TfidfTransformer(use_idf=True))])
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)



counter = Counter(y_train)
print(counter)

#assign variable to np.shape tuple, width dimension
input_dimension = np.shape(X_train)
count_terms = input_dimension[1]
print(count_terms)

In [None]:
vocab_size = 10000  # Only consider the top 20k words
num_tokens_per_example = 15  # Only consider the first 200 words of each movie review
X_train = keras.preprocessing.sequence.pad_sequences(
    X_train, maxlen=num_tokens_per_example
)
x_val = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=num_tokens_per_example) 

In [None]:
from tensorflow import keras
