In [None]:
# The star import stays first so that the explicit imports below take precedence
# over any same-named symbols it may bring in.
from helpers_submission import *

import numpy as np
from keras.layers import Reshape, Dense, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dropout, LSTM, SpatialDropout1D, GlobalMaxPooling1D
from keras.models import Sequential, Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## **SVM**

In [None]:
# NB: the train and test datasets must be pre-processed before running this cell.

# Load the per-tweet feature vectors for training, then vectorize the test set
# with the PCA model returned by the training step.
feat_vectors, labels, _, pca_model, _ = vectorization_train_submission(embedding_dim=350)
test_vectors, _ = vectorization_test_submission(pca_model, 0, embedding_dim=350)

# Standardize features: statistics are fitted on the training set only and
# re-applied unchanged to the test set.
scaler = StandardScaler()
feat_vectors = scaler.fit_transform(feat_vectors)
test_vectors = scaler.transform(test_vectors)

# Linear SVM hyper-parameters.
loss_ = 'hinge'
intercept = False
regulariser = 0.5
nb_features = feat_vectors.shape[1]

# Build the classifier first, then fit it (fit() returns the estimator itself).
classifier = svm.LinearSVC(
    fit_intercept=intercept,
    loss=loss_,
    C=regulariser,
)
classifier = classifier.fit(feat_vectors, labels)
predictions = classifier.predict(test_vectors)

print(predictions.shape)

# Remap predicted label 0 to -1; every other value is left untouched.
predictions = np.where(predictions == 0, -1, predictions)

# Submission ids are 1-based and run over all predictions.
ids_test = range(1, predictions.shape[0] + 1)


# OUTPUT SUBMISSION FILE
OUTPUT_PATH = '../results/BuzzLastyear_TweetPredictions_SVM.csv'
create_csv_submission(ids_test, predictions, OUTPUT_PATH)


## **LSTM**

In [None]:
# NB: the train and test datasets must be pre-processed before running this cell.

# Load the per-tweet matrices (third return value) used as sequence input, and
# vectorize the test set with the same PCA model and maximal length.
_, labels, feat_matrices, pca_model, maximal_length = vectorization_train_submission(embedding_dim=350)
_, test_matrices = vectorization_test_submission(pca_model, maximal_length, embedding_dim=350)


# Convert labels to 2 categorical variables, to be able to use categorical_crossentropy loss function
labels = to_categorical(labels)

# Hold out 10% of the training data for validation. The split must be taken from
# `feat_matrices`, the matrices loaded above (the previous `padded_matrices` name
# was never defined in this notebook).
X_train, X_test, y_train, y_test = train_test_split(feat_matrices, labels, test_size=0.1, random_state=1)

model_lstm = Sequential()
model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(420, dropout = 0.2, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(2, activation = 'softmax'))
model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

history = model_lstm.fit(
    X_train,
    y_train,
    epochs = 260,
    batch_size = 512,
    validation_data = (X_test, y_test)
)


# The softmax layer yields one score per class: take the most likely class
# index, then remap class 0 to -1 for the submission file.
predictions = model_lstm.predict(test_matrices)
predictions = np.argmax(predictions, axis=1)
predictions = np.where(predictions==0, -1, predictions)
ids_test = range(1, len(predictions)+1)

plot_history(history)


# OUTPUT SUBMISSION FILE
OUTPUT_PATH = '../results/BuzzLastyear_TweetPredictions_LSTM.csv'
create_csv_submission(ids_test, predictions, OUTPUT_PATH)


## **Classical-CNN**

In [None]:
# Classical (sequential) CNN: train on the 30-dimensional tweet matrices and
# write a submission file.
import numpy as np
import pickle
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from keras.models import Sequential
from keras.layers import Reshape, Dense, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dropout, LSTM, SpatialDropout1D, GlobalMaxPooling1D, Input, concatenate, Activation, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# NB: the train and test datasets must be pre-processed before running this cell.

_, labels, feat_matrices, pca_model, maximal_length = vectorization_train_submission(embedding_dim=30)
_, test_matrices = vectorization_test_submission(pca_model, maximal_length, embedding_dim=30)

# Convert labels to 2 categorical variables, to be able to use categorical_crossentropy loss function
labels = to_categorical(labels)

# Build the train/validation split from the matrices loaded in THIS cell.
# Without it, the model would silently train on whatever X_train/y_train a
# previously executed cell left in the kernel (e.g. 350-dimensional embeddings),
# which does not match the 30-dimensional test matrices predicted below.
X_train, X_test, y_train, y_test = train_test_split(feat_matrices, labels, test_size=0.1, random_state=1)


BATCH_SIZE = 200
EPOCHS = 260


model = Sequential()
model.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu')) #2-grams
model.add(Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu')) #3-grams
model.add(MaxPooling1D(2))
model.add(Dropout(0.5))
model.add(Conv1D(50, 10, activation='relu', padding='same'))
model.add(Dropout(0.5))
model.add(Conv1D(50, 10, activation='relu', padding='same'))
model.add(GlobalAveragePooling1D())

model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
                optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train,
                y_train,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                validation_data=(X_test,y_test),
                verbose=1)

# Predict with the model trained above (`model`; `model_m` was never defined).
# The softmax layer yields one score per class, so reduce to the most likely
# class index before remapping class 0 to -1 for the submission file.
predictions = model.predict(test_matrices)
predictions = np.argmax(predictions, axis=1)
predictions = np.where(predictions==0, -1, predictions)
ids_test = range(1, len(predictions)+1)

plot_history(history)


# OUTPUT SUBMISSION FILE
OUTPUT_PATH = '../results/BuzzLastyear_TweetPredictions_ClassicalCNN.csv'
create_csv_submission(ids_test, predictions, OUTPUT_PATH)


## **Multi-channel CNN**

In [None]:
# Multi-channel CNN: parallel 2-gram and 3-gram convolution branches over the
# 30-dimensional tweet matrices, merged into a dense classifier.
import numpy as np
import pickle
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from keras.models import Sequential, Model
from keras.layers import Reshape, Dense, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dropout, LSTM, SpatialDropout1D, GlobalMaxPooling1D, Input, concatenate, Activation, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# NB: the train and test datasets must be pre-processed before running this cell.

_, labels, feat_matrices, pca_model, maximal_length = vectorization_train_submission(embedding_dim=30)
_, test_matrices = vectorization_test_submission(pca_model, maximal_length, embedding_dim=30)

# Convert labels to 2 categorical variables, to be able to use categorical_crossentropy loss function
labels = to_categorical(labels)

# Build the train/validation split from the matrices loaded in THIS cell rather
# than relying on X_train/y_train left in the kernel by a previous cell.
X_train, X_test, y_train, y_test = train_test_split(feat_matrices, labels, test_size=0.1, random_state=1)


BATCH_SIZE = 200
EPOCHS = 260


# Derive the per-tweet input shape from the training data instead of from
# constants that are not defined in this notebook.
# Assumes every tweet matrix shares one (length, embedding) shape — TODO confirm
# against vectorization_train_submission.
tweet_input = Input(shape=np.asarray(X_train).shape[1:])

bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_input)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_input)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
merged = concatenate([bigram_branch, trigram_branch], axis=1)

merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.5)(merged)
merged = Dense(2)(merged)
output = Activation('softmax')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

history = model.fit(X_train,
                y_train,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                validation_data=(X_test,y_test),
                verbose=1)

# Predict with the model trained above (`model`; `model_m` was never defined).
# The softmax output holds one score per class, so reduce to the most likely
# class index before remapping class 0 to -1 for the submission file.
predictions = model.predict(test_matrices)
predictions = np.argmax(predictions, axis=1)
predictions = np.where(predictions==0, -1, predictions)
ids_test = range(1, len(predictions)+1)

plot_history(history)


# OUTPUT SUBMISSION FILE
# Dedicated file name so this cell does not overwrite the Classical-CNN submission.
OUTPUT_PATH = '../results/BuzzLastyear_TweetPredictions_MultiChannelCNN.csv'
create_csv_submission(ids_test, predictions, OUTPUT_PATH)
