# TEXT CLASSIFICATION

### MERGING NEGATIVE AND POSITIVE DATASETS TOGETHER

In [1]:
# Concatenate the negative and the positive training tweets into one file.
filenames = ['twitter-datasets/train_neg.txt', 'twitter-datasets/train_pos.txt']  # inputs, in merge order
with open('twitter-datasets/train_total.txt', 'w') as outfile:  # merged output file
    for fname in filenames:
        with open(fname) as infile:
            # Copy every line of the current input file, unchanged, to the output
            outfile.writelines(infile)

### IMPORT PACKAGES

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from collections import defaultdict

from keras.regularizers import l2, activity_l2
from keras.models import Sequential, model_from_json
from keras.models import load_model
from keras.optimizers import SGD
from keras.optimizers import adam
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Model
from keras.constraints import maxnorm
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression

from misc_tools import *
from sklearn.svm import SVC
from sklearn import svm
import pickle

import pandas
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from twitter_processing import *

Using TensorFlow backend.


ImportError: No module named 'nltk'

We first load the tweets, the vocabulary, the co-occurrence matrix and the word embeddings created in the DataCreation.ipynb notebook.

In [2]:
# Word-embedding representation of each word (NumPy array on disk)
w_emb = np.load('embeddings_nltk.npy')

# Pickled artefacts: the vocabulary first, then the co-occurrence matrix.
# NOTE: pickle.load can execute arbitrary code - only open files you produced yourself.
with open('vocab_nltk.pkl', 'rb') as vocab_file:
    tot_vocab = pickle.load(vocab_file)
with open('cooc_nltk.pkl', 'rb') as cooc_file:
    cooc = pickle.load(cooc_file)

## Data Processing

## Load Glove data (D = 20)

In [6]:
# GloVe representation of the tweets: features and labels are loaded pairwise.
# Train file
X, y = np.load('tweets_emb_nltk.npy'), np.load('tweets_sol_nltk.npy')
# Full file
X_full, y_full = np.load('tweets_emb_nltk_full.npy'), np.load('tweets_sol_nltk_full.npy')
# Test file (features only)
X_test = np.load('tweets_emb_nltk_test.npy')

## Load Word2Vec Data

In [None]:
# word2vec representation of the train file tweets
X, y = np.load('X_w2v.npy'), np.load('y_w2v.npy')

# Keep 50 principal components (the original note says the raw features have 300 dimensions).
# fit() returns the estimator itself, so it can be chained with the constructor.
pca = PCA(n_components = 50, svd_solver = 'full').fit(X)
X = pca.transform(X)

#  Fitting Models

## Logistic Regression

In [None]:
# Polynomial degrees for which a logistic regression is cross-validated
degree = [1,2,3,4]

for d in degree:

    # Polynomial expansion of the features followed by normalization (project helpers)
    phi = normalizator(build_poly(X, d))

    # L2 penalty, stochastic average gradient solver, scored with 5-fold cross-validation
    model_logistic = LogisticRegression(penalty = 'l2', C = 10, solver = 'sag')
    scores = cross_val_score(model_logistic, phi, y, cv = 5)
    print('Avg score: ' , np.mean(scores) , ' +/- ' , np.std(scores))

## SVM

In [None]:
# Normalize into a NEW name: overwriting X would normalize it a second time
# whenever this cell is re-run, and would silently change X for later cells.
X_svm = normalizator(X)

# SVMs are slow to train, so only 10% of the data is kept for the search.
# The split does not depend on gamma (fixed random_state), so it is done once,
# outside the loop, and under its own names so that X_test / y_test are untouched.
X_svm_train, X_svm_rest, y_svm_train, y_svm_rest = train_test_split(
    X_svm, y, test_size=0.9, random_state=1)

# different values of our gamma hyper parameter
gam_vec = [0.00005, 0.001, 0.1]

for g in gam_vec:

    # by default radial kernel. Set kernel='poly' for a polynomial one
    kernel_svm = SVC(gamma = g)

    # 5-fold cross-validation on the reduced train set.
    # cross_val_score clones and fits the estimator for every fold, so a separate
    # kernel_svm.fit(...) beforehand would only add training time.
    scores = cross_val_score(kernel_svm, X_svm_train, y_svm_train, cv=5)
    print('Mean score: ', np.mean(scores), ' +/- ', np.std(scores))


#### BUILDING THE TEST SET

In [None]:
# Preprocess the raw test tweets (project helper; the second return value is not used below)
tweets_data_test, count_test = preprocess("twitter-datasets/test_data.txt")

# Vector representation of the test tweets from the vocabulary and the embeddings,
# normalized right away with the same helper as the training features
X_test = normalizator(build_test_set(tot_vocab , w_emb , tweets_data_test))

#### SAVING THE PREDICTIONS

In [None]:
# Predict labels for the test vectors and write them to a csv file.
# NOTE(review): `model` is not assigned by any earlier cell of this notebook (the estimators
# above are named model_logistic and kernel_svm) - confirm which fitted estimator is meant.
y_test = model.predict(X_test)

pred_file = 'predict.csv'
# NOTE(review): the create_csv_submission defined later in this notebook takes
# (ids, y_pred, name); this call passes two arguments - presumably it relies on a
# helper with another signature from a star import. Verify before running.
create_csv_submission(y_test,pred_file)

# Neural Nets

### For GloVe Data

In [10]:
# Split the data: test_size = 0.999 keeps only 0.1% of the rows for the grid searches below.
# The held-out part gets its own names so that the test matrix loaded earlier (X_test)
# is not overwritten by this cell.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size = 0.999, random_state=1)


In [11]:
# Adapt Glove with Neural networks

def create_model_glove(neurons=15,neurons2=5,
                     init_mode='he_uniform',activation='sigmoid',
                     learn_rate=0.1,momentum=0.5,dropout_rate=0.3,
                     weight_constraint=0, weight_regularizer=0.0001):
    """
    Build and compile the feed-forward network used on the 20-dimensional GloVe features.

    Architecture: Dense(neurons) -> Dropout -> Dense(neurons2) -> Dense(1, sigmoid),
    trained with SGD on a binary cross-entropy loss.

    Arguments: neurons, neurons2 (units of the first / second hidden layer)
               init_mode (weight initialisation used by every Dense layer)
               activation (activation of the two hidden layers)
               learn_rate, momentum (SGD parameters)
               dropout_rate (dropout applied after the first hidden layer)
               weight_constraint (max-norm bound on the first layer weights; 0 = no constraint)
               weight_regularizer (L2 penalty on the first layer weights)
    Returns:   the compiled Sequential model
    """
    # A bound of 0 would clip every weight to zero, so 0 is treated as "unconstrained".
    constraint = maxnorm(weight_constraint) if weight_constraint else None

    # create model
    model = Sequential()

    # Hidden layers: init_mode is forwarded so that a grid search over it has an effect
    # (it used to be hard-coded to 'he_uniform').
    model.add(Dense(neurons, input_dim = 20, init = init_mode, activation = activation,
                    W_regularizer = l2(weight_regularizer), W_constraint = constraint))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons2, init = init_mode, activation = activation))

    # Output layer: always a sigmoid. binary_crossentropy expects one probability in (0, 1);
    # re-using the tunable hidden activation here (e.g. softmax over a single unit, relu,
    # linear) produces a degenerate or invalid output.
    model.add(Dense(1, init = init_mode, activation = 'sigmoid'))

    # Compile model
    optimizer = SGD(lr = learn_rate, momentum = momentum)
    model.compile(loss = 'binary_crossentropy', optimizer = optimizer, metrics = ['accuracy'])

    return model

In [12]:
# Grid search over the SGD learning rate and momentum (GloVe network)
learn_rate = [0.3, 0.5, 1]
momentum = [0,0.1, 0.5]
param_grid = {'learn_rate': learn_rate, 'momentum': momentum}

# Wrap the Keras builder so that scikit-learn can drive it, then run the search
model = KerasClassifier(build_fn = create_model_glove, nb_epoch = 10, batch_size = 20, verbose = 0)
grid = GridSearchCV(estimator = model, param_grid = param_grid, n_jobs = -1)
grid_result = grid.fit(X_train, y_train)

# Report the best setting, then one line per evaluated combination
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.557789 using {'momentum': 0.1, 'learn_rate': 1}
0.442211 (0.048098) with: {'momentum': 0, 'learn_rate': 0.3}
0.442211 (0.048098) with: {'momentum': 0.1, 'learn_rate': 0.3}
0.452261 (0.050341) with: {'momentum': 0.5, 'learn_rate': 0.3}
0.437186 (0.041320) with: {'momentum': 0, 'learn_rate': 0.5}
0.437186 (0.041320) with: {'momentum': 0.1, 'learn_rate': 0.5}
0.507538 (0.074808) with: {'momentum': 0.5, 'learn_rate': 0.5}
0.507538 (0.074808) with: {'momentum': 0, 'learn_rate': 1}
0.557789 (0.048098) with: {'momentum': 0.1, 'learn_rate': 1}
0.492462 (0.074808) with: {'momentum': 0.5, 'learn_rate': 1}


In [None]:
# Grid search over the neuron activation function (GloVe network)
activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
param_grid = {'activation': activation}

model = KerasClassifier(build_fn = create_model_glove, nb_epoch = 10, batch_size=20, verbose = 0)
grid = GridSearchCV(estimator = model, param_grid = param_grid, n_jobs = -1)
grid_result = grid.fit(X_train, y_train)

# Report the best setting, then one line per evaluated activation
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



In [None]:
# Grid search over the number of neurons of the hidden layers (GloVe network).
# Only the first layer is varied; the second one is kept at a single value.
neurons = [5, 10, 15, 20]
neurons2= [5]
param_grid = {'neurons': neurons, 'neurons2': neurons2}

model = KerasClassifier(build_fn = create_model_glove, nb_epoch = 10, batch_size = 20, verbose = 0)
grid = GridSearchCV(estimator = model, param_grid = param_grid, n_jobs = -1)
grid_result = grid.fit(X_train, y_train)

# Report the best setting, then one line per evaluated combination
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Mean cross-validation score (error bars: standard deviation over the folds)
# against the number of neurons of the first layer.
# Everything is read from cv_results_ - the same source as the summary printed by the
# grid-search cell - instead of the deprecated grid_scores_ attribute, and the x values
# are taken from the evaluated parameter sets, so no array length is hard-coded and the
# `neurons` list of the search cell is not overwritten.
cv_results = grid_result.cv_results_
scores = np.asarray(cv_results['mean_test_score'])
stds = np.asarray(cv_results['std_test_score'])
neurons_tested = np.asarray([p['neurons'] for p in cv_results['params']])

plt.scatter(neurons_tested, scores)
plt.errorbar(neurons_tested, scores, yerr=stds, linestyle="None")
# no plt.legend(): none of the artists carries a label, so the legend would be empty
plt.xlabel('Neurons on layer 1')
plt.ylabel('Mean score')
plt.grid()
plt.show()

In [None]:
# Mean cross-validation score (error bars: standard deviation over the folds)
# against the number of neurons of the first layer.
# The array sizes follow the search results: a hard-coded reshape(5, 1) fails as soon
# as the grid does not contain exactly five points. Scores come from cv_results_
# rather than the deprecated grid_scores_ attribute.
cv_results = grid_result.cv_results_
scores = np.asarray(cv_results['mean_test_score'])
stds = np.asarray(cv_results['std_test_score'])
neurons_tested = np.asarray([p['neurons'] for p in cv_results['params']])

plt.scatter(neurons_tested, scores)
plt.errorbar(neurons_tested, scores, yerr=stds, linestyle="None")
# no plt.legend(): none of the artists carries a label, so the legend would be empty
plt.xlabel('Neurons on layer 1')
plt.ylabel('Mean score')
plt.grid()
plt.show()

In [None]:
# Grid search over the L2 weight regularizer, at a single dropout rate (GloVe network).
# weight_constraint is defined for reference but is not part of the searched grid.
weight_constraint = [0]
weight_regularizer= [0.00001,0.0001,0.001,0.01,0.1,1]
dropout_rate = [0.3]
param_grid = {'dropout_rate': dropout_rate, 'weight_regularizer': weight_regularizer}

model = KerasClassifier(build_fn=create_model_glove, nb_epoch=10, batch_size=20, verbose=0)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# Report the best setting, then one line per evaluated combination
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Mean cross-validation score (error bars: standard deviation over the folds)
# against the L2 regularizer value, on a logarithmic x axis.
# Scores and x values are read from cv_results_ (not the deprecated grid_scores_),
# so no array length is hard-coded and `neurons` is no longer overwritten here.
cv_results = grid_result.cv_results_
scores = np.asarray(cv_results['mean_test_score'])
stds = np.asarray(cv_results['std_test_score'])
reg_values = np.asarray([p['weight_regularizer'] for p in cv_results['params']])

plt.scatter(reg_values, scores)
plt.errorbar(reg_values, scores, yerr=stds, linestyle="None")
# no plt.legend(): none of the artists carries a label, so the legend would be empty
plt.xlabel('L2 Regularizer parameter')
plt.ylabel('Mean score')
plt.grid()
# `nonposy` is a y-axis option and does not belong in an xscale() call; every searched
# regularizer value is strictly positive, so no clipping option is needed at all.
plt.xscale('log')
plt.show()


In [None]:
## INITIALISATION OF W
# Grid search over the weight initialisation scheme (GloVe network).
# NOTE: the search is only meaningful if the builder forwards `init_mode` to its layers.

# create model - the builder defined in this notebook is create_model_glove;
# `create_model` does not exist and raised a NameError.
model = KerasClassifier(build_fn=create_model_glove, nb_epoch=10, batch_size=100, verbose=0)

# define the grid search parameters
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
param_grid = dict(init_mode=init_mode)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

### For Word2Vec Data

In [None]:
# word2vec data: train subset, full train set and test set
X, y = np.load('X_w2v.npy'), np.load('y_w2v.npy')
X_full, y_full = np.load('X_w2v_full.npy'), np.load('y_w2v_full.npy')
X_test = np.load('X_test_w2v.npy')

# Sanity check: shapes of the train features and labels
for loaded in (X, y):
    print(loaded.shape)

Step 1: Principal component analysis via singular value decomposition

In [None]:
# Keep the first 50 principal components (the original note reports ~98% of the variance).
# The PCA is fitted on the train features and then applied to train and test alike.
pca = PCA(n_components=50,svd_solver='full').fit(X)
X1, X1_test = pca.transform(X), pca.transform(X_test)

# Fraction of the variance captured by the kept components, and the reduced shape
print(sum(pca.explained_variance_ratio_))
print(X1.shape)

In [None]:
# Project the full train set with the already-fitted `pca` object (no re-fitting here)
X1_full = pca.transform(X_full)

Step2 : scaling data

In [None]:
# Step2 : scaling data
# One scaler is fitted on the train features (the data the PCA was fitted on) and its
# mean / standard deviation are re-used for every other split. Scaling each split with its
# own statistics puts train and test data on slightly different scales.
# StandardScaler is imported explicitly at the top of the notebook, whereas the
# `preprocessing` module was never imported by name.
scaler = StandardScaler().fit(X1)
X2 = scaler.transform(X1)            # same values as preprocessing.scale(X1)
X2_test = scaler.transform(X1_test)
X2_full = scaler.transform(X1_full)

In [None]:
# Split the scaled train features: 40% is kept for the grid searches below.
# The held-out 60% gets its own names so that the word2vec test matrix loaded earlier
# (X_test) is not overwritten - re-running the PCA cell would otherwise transform the wrong data.
X_train, X_holdout, y_train, y_holdout = train_test_split(X2, y, test_size=0.6, random_state=1)

print(X_train.shape)
print(y_train.shape)

In [None]:
# FOR SGD THINGS for word2vec embedding

def create_model_w2v(neurons=1000,neurons2=100,  ##20 et 10
                     init_mode='he_uniform',activation='sigmoid',
                     learn_rate=0.1,momentum=0.5,dropout_rate=0.2,
                     weight_constraint=2,weight_regularizer=2):
    """
    Build and compile the feed-forward network used on the 50-dimensional word2vec features.

    Architecture: Dense(neurons) -> Dropout -> Dense(neurons2) -> Dense(1, sigmoid),
    trained with SGD on a binary cross-entropy loss.

    Arguments: neurons, neurons2 (units of the first / second hidden layer)
               init_mode (weight initialisation used by every Dense layer)
               activation (activation of the two hidden layers)
               learn_rate, momentum (SGD parameters)
               dropout_rate (dropout applied after the first hidden layer)
               weight_constraint (max-norm bound on the first layer weights; 0 = no constraint)
               weight_regularizer (L2 penalty on the first layer weights)
    Returns:   the compiled Sequential model
    """
    # maxnorm(0) would clip every weight of the layer to zero; the grid searches use 0
    # as a candidate value, so 0 is treated as "no constraint".
    constraint = maxnorm(weight_constraint) if weight_constraint else None

    # create model - the first layer must declare the number of input features (input_dim)
    model = Sequential()

    # Hidden layers: init_mode is forwarded instead of a hard-coded 'he_uniform'
    model.add(Dense(neurons, input_dim=50, init=init_mode, activation=activation,
                    W_regularizer=l2(weight_regularizer), W_constraint=constraint))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons2, init=init_mode, activation=activation))

    # Output layer: always a sigmoid, because binary_crossentropy expects one probability
    # in (0, 1) regardless of the activation chosen for the hidden layers.
    model.add(Dense(1, init=init_mode, activation='sigmoid'))

    # Compile model
    optimizer = SGD(lr=learn_rate, momentum=momentum)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

In [None]:
# Grid search over dropout rate, max-norm weight constraint and L2 weight regularizer
# (word2vec network)
model = KerasClassifier(build_fn=create_model_w2v, nb_epoch=5, batch_size=20, verbose=0)

# Define the grid search parameters
weight_constraint = [0,1,2]
dropout_rate = [0,0.3]
weight_regularizer=[0,1,2,3]
param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint,weight_regularizer=weight_regularizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    # a stray trailing `b` after this call made the whole cell a SyntaxError
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Grid search over the SGD learning rate and momentum (word2vec network)
learn_rate = [0.3,0.5,1]
momentum = [ 0,0.1,0.5]
param_grid = {'learn_rate': learn_rate, 'momentum': momentum}

model = KerasClassifier(build_fn=create_model_w2v, nb_epoch=10, batch_size=20, verbose=0)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# Report the best setting, then one line per evaluated combination
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Grid search over the number of neurons of both hidden layers (word2vec network),
# with 2-fold cross-validation
neurons = [20,1000]
neurons2= [5,100]
param_grid = {'neurons': neurons, 'neurons2': neurons2}

model = KerasClassifier(build_fn=create_model_w2v, nb_epoch=5, batch_size=20, verbose=0)
grid = GridSearchCV(estimator=model, param_grid=param_grid,cv=2, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# Report the best setting, then one line per evaluated combination
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Mean cross-validation score (error bars: standard deviation over the folds)
# against the number of neurons of the first layer; one point per evaluated parameter set.
# The hard-coded reshape(7, 1) could not match the searched grid and raised a ValueError,
# so the x values are now read from the evaluated parameter sets, and the scores from
# cv_results_ instead of the deprecated grid_scores_ attribute.
cv_results = grid_result.cv_results_
scores = np.asarray(cv_results['mean_test_score'])
stds = np.asarray(cv_results['std_test_score'])
neurons_tested = np.asarray([p['neurons'] for p in cv_results['params']])

plt.scatter(neurons_tested, scores)
plt.errorbar(neurons_tested, scores, yerr=stds, linestyle="None")
# no plt.legend(): none of the artists carries a label, so the legend would be empty
plt.xlabel('Neurons on layer 1')
plt.ylabel('Mean score')
plt.grid()
plt.show()


In [None]:
# Grid search over dropout rate and max-norm weight constraint (word2vec network).
# Unlike the other searches, this one is fitted on the whole scaled set (X2, y).
model = KerasClassifier(build_fn=create_model_w2v, nb_epoch=5, batch_size=20, verbose=0)

# define the grid search parameters
weight_constraint = [2,3]
dropout_rate = [0.2,0.3]
param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X2, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    # a stray trailing `b` after this call made the whole cell a SyntaxError
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
def save_model(model_tt):
    """
    Saves a fitted wrapper's network to disk, in the current working directory
    Arguments: model_tt (object whose `.model` attribute provides to_json() and save_weights())
    Writes:    model_architecture.json (architecture, as JSON)
               model_weights.h5 (weights; an existing file is overwritten)
    """
    # saving model - the context manager closes (and flushes) the file deterministically
    json_model = model_tt.model.to_json()
    with open('model_architecture.json', 'w') as json_file:
        json_file.write(json_model)

    # saving weights
    model_tt.model.save_weights('model_weights.h5', overwrite=True)

def load_model():
    """
    Rebuilds the network stored by save_model() from the current working directory
    Reads:   model_architecture.json (architecture) and model_weights.h5 (weights)
    Returns: the model, compiled with a binary cross-entropy loss and the adam optimizer

    NOTE: this definition shadows the `load_model` imported from keras.models at the
    top of the notebook; after this cell runs, the name refers to this function.
    """
    # loading model - the context manager makes sure the file handle is closed
    with open('model_architecture.json') as json_file:
        model = model_from_json(json_file.read())
    model.load_weights('model_weights.h5')

    # a model rebuilt from JSON carries no training configuration, so compile it again
    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    Ids and predictions are paired positionally and both are written as integers.
    """
    # Imported here because the notebook's import cell never imports csv by name
    import csv

    # newline='' is what the csv module asks for when writing: without it, platforms that
    # translate '\n' end up with an extra blank line after every row.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [None]:
# Fit the final model on the full word2vec train set.
# The following cells score and predict on the PCA-reduced *and scaled* word2vec features
# (X2_full, X2_test), so the model has to be the 50-input word2vec network trained on
# X2_full - not the 20-input GloVe builder fed with the unscaled X1_full.
mybestmodel = KerasClassifier(build_fn=create_model_w2v)
mybestmodel.fit(X2_full, y_full,validation_split=0.2, nb_epoch=50, batch_size=20)  #can add verbose=0 for no wait bar
save_model(mybestmodel)

In [None]:
# Score of the fitted model on the full scaled set (X2_full, y_full).
# This cell does no cross-validation splitting: it is a single score() call.
mybestmodel.score(X2_full,y_full)

In [None]:
# Predict on the scaled test features and map the outputs to the submission labels:
# values >= 0.5 become 1, everything else becomes -1
y_pred = mybestmodel.predict(X2_test)
y_rendu = [1 if prediction >= 0.5 else -1 for prediction in y_pred]

# Submission ids start at 1 and follow the order of the test tweets
OUTPUT_PATH = 'prediction.csv' 
ids_test = list(range(1, len(y_rendu) + 1))
create_csv_submission(ids_test, y_rendu, OUTPUT_PATH)