In [None]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle
import numpy as np

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
import re
from nltk.tokenize import TweetTokenizer
import gc

from keras.models import Sequential
from keras.layers import Activation
from keras.callbacks import EarlyStopping 
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam
from keras.layers.core import Dense, Dropout, Flatten
import keras.backend as K




In [None]:
# Raw tweet files (inputs to loadData) and the files the cleaned text is written to.
# NOTE(review): the raw paths are prefixed with '../' when loadData is called, while the
# cleaned paths and gloveFile are opened exactly as written here -- confirm the working
# directory these are expected to resolve against.
neg_twit_path='data/train_neg_full.txt'
pos_twit_path='data/train_pos_full.txt'
neg_twit_cleaned='cleaned twits/train_neg.txt'
pos_twit_cleaned='cleaned twits/train_pos.txt'
# Text file of pre-trained GloVe vectors, read line by line into glove_model.
gloveFile='Glove embeddings/glove.twitter.27B.100d.txt'

# Single tokenizer instance shared by every call to tokenize().
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)


In [None]:
def cleanTweets(tweets):
    """Strip numbers, hashtags, markup tokens and punctuation from tweet text.

    Cleaning runs in two passes over the whole string:

    1. remove tokens that start or end with digits, hashtags, anything
       starting at an underscore, double quotes and ``<...>`` delimited tokens;
    2. remove the remaining punctuation and math-operator characters.

    Removed spans are replaced by the empty string, so runs of spaces may be
    left behind; letters (including upper case) are preserved.

    Parameters
    ----------
    tweets : str
        Raw text; may contain many tweets separated by newlines.

    Returns
    -------
    str
        The cleaned text.
    """
    remove_numbers_start = r'\d+(\w*|\d*)'
    remove_numbers_end = r'(\w*|\d*)\d+'
    remove_hashtags = r'#(\w*|#*|\d*)'
    remove_underscores = r'_(\w*)'
    # The hyphen must be escaped: an unescaped ":-_" inside a character class is
    # the range ':'..'_' which also matches every upper-case letter and the
    # characters < = > @ [ \ ] ^, silently deleting them from the tweets.
    remove_punctuation = r"[.,;'?():\-_!$&%{}~/|]"
    remove_quotation = r'"'
    remove_math_ops = r'[-+.^:,*]'
    remove_delimeted_words = r'<[^>]+>'

    # Pass 1: whole tokens (numbers, hashtags, <user>/<url>-style markers, quotes).
    combined_pattern = r'|'.join((remove_numbers_start,remove_numbers_end,remove_hashtags,remove_underscores,remove_quotation,remove_delimeted_words))
    # Pass 2: single punctuation / operator characters.
    combined_pattern2 = r'|'.join((remove_punctuation,remove_math_ops))
    cleaned_tweets1 = re.sub(combined_pattern,'',tweets)
    cleaned_tweets2 = re.sub(combined_pattern2,'',cleaned_tweets1)
    return cleaned_tweets2

In [None]:
# Substitution callback for re.sub: a matched run of one repeated character is
# replaced by exactly two copies of that character.
def repl(matchObj):
    """Return the character captured in group 1 of ``matchObj``, written twice."""
    captured = matchObj.group(1)
    pair = (captured, captured)
    return "%s%s" % pair

In [None]:
def replaceRepeatingCharacters(tweets):
    """Shorten every run of a repeated word character to two occurrences.

    Each match of a word character followed by one or more copies of itself is
    passed to ``repl``, whose return value replaces the run.
    """
    return re.sub(r"(\w)\1+", repl, tweets)

In [None]:
def loadData(path,inputF, outputTweets):
    """Read a raw tweet file, clean it and write the result to ``outputTweets``.

    Parameters
    ----------
    path : str
        Prefix that is concatenated directly in front of ``inputF`` (no
        separator is inserted, so it should end with one).
    inputF : str
        Raw tweet file, relative to ``path``.
    outputTweets : str
        Destination file for the cleaned text; used as given, without ``path``.

    The whole input file is read into memory, passed through ``cleanTweets``
    and ``replaceRepeatingCharacters``, and written with ``writeToFile``.
    """
    inputSet = path + inputF

    # Context manager guarantees the input handle is closed even if reading fails.
    with open(inputSet) as source:
        raw_data = source.read()

    cleaned_data = cleanTweets(raw_data)
    strip_repeated_chars = replaceRepeatingCharacters(cleaned_data)

    writeToFile(strip_repeated_chars, outputTweets)


In [None]:
def writeToFile(data,outputSet):
    """Write ``data`` to the file ``outputSet``, replacing any existing content.

    Parameters
    ----------
    data : list or str
        A list is written one item per line (each item formatted with ``%s``).
        A string has its leading whitespace stripped and every run of spaces
        collapsed to a single space before being written.
        Any other type leaves the output file empty.
    outputSet : str
        Path of the file to (over)write.
    """
    # The with-block closes (and therefore flushes) the file deterministically.
    with open(outputSet, 'w') as result:
        if isinstance(data, list):
            result.writelines("%s\n" % item for item in data)
        elif isinstance(data, str):
            # remove whitespaces generated by data cleaning
            result.write(re.sub(' +',' ',data.lstrip()))

In [None]:
# Clean the raw negative and positive tweet files (read from '../' + path) and
# write the results to the cleaned-tweet paths, then trigger a garbage collection.
loadData('../',neg_twit_path,neg_twit_cleaned)
loadData('../',pos_twit_path,pos_twit_cleaned)
gc.collect()

In [None]:
# Build glove_model: a dict mapping each word to its embedding (numpy array).
# Every non-empty line of the file is split on whitespace: the first field is
# the word, the remaining fields are parsed as floats.
print("Loading Glove Model")
glove_model = {}
# The with-block closes the (large) embeddings file once it has been consumed.
with open(gloveFile,'r') as f:
    for line in tqdm(f):
        splitLine = line.split()
        if not splitLine:
            # Blank line: nothing to parse (would otherwise raise IndexError).
            continue
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        glove_model[word] = embedding
# Same text as the original comma-separated print statement, in a form that is
# valid on both Python 2 and Python 3.
print("Done. %d  words loaded!" % len(glove_model))

In [None]:
def tokenize(tweet):
    """Lower-case one tweet and split it into tokens with the shared ``tknzr``.

    Parameters
    ----------
    tweet : bytes or text
        One tweet. Byte strings are decoded as UTF-8; text that is already
        decoded is used as-is.

    Returns
    -------
    list or str
        The result of ``tknzr.tokenize``. If the tweet cannot be decoded or
        tokenized, the sentinel string ``'NC'`` is returned instead (kept so
        existing callers see the same failure marker).
    """
    try:
        # ``bytes`` is ``str`` on Python 2, so this decodes exactly the inputs the
        # original ``tweet.decode('utf-8')`` handled, without needing the
        # Python-2-only ``unicode`` builtin.
        if isinstance(tweet, bytes):
            tweet = tweet.decode('utf-8')
        return tknzr.tokenize(tweet.lower())
    except Exception:
        # Best-effort: a bad tweet must not abort a long run. ``Exception``
        # (rather than a bare except) lets KeyboardInterrupt/SystemExit through.
        return 'NC'

In [None]:
def file_to_tokenized_twits(filename):
    """Tokenize every line of ``filename`` and return the results as an array.

    Parameters
    ----------
    filename : str
        File opened in binary mode; each line is passed to ``tokenize``.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array with one entry per line, each entry being
        whatever ``tokenize`` returned for that line.
    """
    # Context manager closes the file once all lines have been consumed.
    with open(filename, 'rb') as data:
        tokenized_tweet_all = [tokenize(tweet) for tweet in tqdm(data)]

    # Fill an object array element by element: np.array() on token lists of
    # differing lengths raises on recent NumPy, and on equal-length lists it
    # would silently produce a 2-D array instead of one entry per tweet.
    tokenized = np.empty(len(tokenized_tweet_all), dtype=object)
    for position, tokens in enumerate(tokenized_tweet_all):
        tokenized[position] = tokens
    return tokenized

In [None]:
# Tokenize the cleaned positive and negative tweet files.
pos_tokenized_twits=file_to_tokenized_twits(pos_twit_cleaned)
neg_tokenized_twits=file_to_tokenized_twits(neg_twit_cleaned)
# The path constants are deliberately NOT deleted here: they are short strings
# (nothing to reclaim), and deleting them makes this cell raise NameError when
# it is run a second time.
gc.collect()

In [None]:
# Show the first 100 tokenized positive tweets as a sanity check.
print pos_tokenized_twits[0:100]

In [None]:
def merge_dataset(pos_data, neg_data):
    """Stack positive and negative samples and build the matching label vector.

    Parameters
    ----------
    pos_data, neg_data : numpy.ndarray
        Samples of the positive and the negative class.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``pos_data`` followed by ``neg_data``; ``y`` holds
        1 for every positive row followed by 0 for every negative row.
    """
    n_pos = pos_data.shape[0]
    n_neg = neg_data.shape[0]

    features = np.concatenate((pos_data, neg_data))
    labels = np.array(n_pos * [1] + n_neg * [0])

    # Sanity checks: one label per row, and no rows lost while stacking.
    assert len(labels) == features.shape[0]
    assert features.shape[0] == n_pos + n_neg
    return features, labels


In [None]:
# Combine both classes into one feature array X and label array y
# (positives first, labelled 1; negatives after, labelled 0).
X, y = merge_dataset(pos_tokenized_twits,neg_tokenized_twits)
gc.collect()

In [None]:
# Print a slice of the samples and of the labels.
# NOTE(review): the slice starts at index 1, so element 0 is not shown; the
# hard-coded bound 1191748 presumably relates to the dataset size -- confirm.
print X[1:1191748]
print y[1:1191748]

In [None]:
# Hold out 20% of the samples as the test split; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report the sizes of the training and test splits.
print X_train.shape, X_test.shape 

In [None]:
def define_tf_idf(corpus):
    """Compute an inverse-document-frequency weight for every token in ``corpus``.

    Parameters
    ----------
    corpus : iterable
        Documents that are already tokenized; each document is handed to the
        vectorizer unchanged (the analyzer is the identity function).

    Returns
    -------
    dict
        Maps each token of the fitted vocabulary to its ``idf_`` value.
    """
    print('building tf-idf matrix ...')
    vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=1)
    # Only the fitted vocabulary and idf_ are used, so fit() is enough; the
    # document-term matrix that fit_transform() would build was never read.
    vectorizer.fit(corpus)
    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; use whichever this installation provides.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    tfidf = dict(zip(vocabulary, vectorizer.idf_))
    print('vocab size : %d' % len(tfidf))
    return tfidf

In [None]:
# Token -> IDF weight lookup, computed over the full corpus X.
# NOTE(review): X contains both the training and the test tweets; fit on X_train
# instead if test-set statistics should not influence the features -- confirm intent.
tfidf=define_tf_idf(X)

In [None]:
def find_vector_representation_of_tweets(tokenized_tweets,dimension,tfidf,embeddings=None):
    """Embed each tweet as the TF-IDF-weighted mean of its word vectors.

    For every tweet, the vectors of its tokens are multiplied by the token's
    weight from ``tfidf``, summed, and divided by the number of tokens that
    were found in both lookups. Tokens missing from either lookup are skipped
    and counted; a tweet with no usable token gets an all-zero vector.

    Parameters
    ----------
    tokenized_tweets : sequence
        One iterable of tokens per tweet.
    dimension : int
        Length of each word vector (and of each returned tweet vector).
    tfidf : dict
        Token -> weight.
    embeddings : dict, optional
        Token -> numpy vector of length ``dimension``. Defaults to the
        module-level ``glove_model``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tokenized_tweets), dimension)``.
    """
    if embeddings is None:
        embeddings = glove_model

    tweets_embeddings = np.zeros((len(tokenized_tweets), dimension))
    deficit = 0
    for i, tokenized_tweet in enumerate(tokenized_tweets):
        # Accumulator and counter must start fresh for every tweet; sharing
        # them across tweets mixes every earlier tweet into the current one.
        vec = np.zeros(dimension)
        count = 0
        for word in tokenized_tweet:
            try:
                vec += embeddings[word].reshape(dimension) * tfidf[word]
            except KeyError:
                # Token unknown to the embeddings or to the tf-idf vocabulary.
                deficit += 1
                continue
            count += 1
        if count:
            vec /= count

        tweets_embeddings[i, :] = vec

    # Same text as the original comma-separated print statement, in a form that
    # is valid on both Python 2 and Python 3.
    print("deficit:  %d" % deficit)
    return tweets_embeddings


In [None]:
# Turn every tokenized tweet of both splits into a 100-dimensional vector
# (100 presumably matches the "100d" GloVe file configured in gloveFile -- confirm).
embedded_X_train=find_vector_representation_of_tweets(X_train,100,tfidf)
embedded_X_test=find_vector_representation_of_tweets(X_test,100,tfidf)


In [None]:
# Display the first 100 training embeddings before scaling.
embedded_X_train[0:100]

In [None]:
# Standardise the features using the mean and standard deviation of the
# TRAINING split only, and apply that same transform to the test split.
# Scaling each split with its own statistics would put train and test features
# on slightly different scales and lets test-set statistics shape the inputs.
scaler = StandardScaler()
embedded_X_train = scaler.fit_transform(embedded_X_train)
embedded_X_test = scaler.transform(embedded_X_test)


In [None]:
# Display the first 100 training embeddings after scaling.
embedded_X_train[0:100]

In [None]:
# Row indices 0..n-1, in order, for the training and the test embeddings.
idx = list(range(embedded_X_train.shape[0]))
idx2 = list(range(embedded_X_test.shape[0]))

In [None]:
# Small feed-forward binary classifier on top of the tweet embeddings:
# one hidden ReLU layer of 32 units and a sigmoid output unit.
model = Sequential()
# Input width is taken from the data rather than hard-coded.
model.add(Dense(32, activation='relu', input_dim=embedded_X_train.shape[1]))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# The arrays are passed directly: indexing them with a full, in-order index list
# selects the same rows in the same order but first copies the whole matrix.
model.fit(embedded_X_train, y_train, epochs=10, batch_size=32,
          validation_data=(embedded_X_test, y_test), verbose=2)
