In [1]:
import numpy as np
import pandas as pd
import pickle
import pylab as pl
import scipy.sparse as sp
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import svm

# Loading Vocabullary

A function that reads vocabulary created by cleaning the datasets

In [2]:
def loadVocab(fileName):

# load file into list    
    with open(fileName) as f:
        content = f.readlines()
        content = [x.strip() for x in content] 
        return content

# Count Vector

This is a matrix that contains for each tweet the count of words in the vocabulary set that occured

In [3]:
def createCountVector(fileName1,fileName2):

    vocab_pos = loadVocab(fileName1)
    vocab_neg = loadVocab(fileName2)
    # find the combination of both vocabularies and remove common words
    combined_vocab = np.unique(vocab_pos + vocab_neg)
    # load the tweets 
    raw_text1 = pd.read_table('../data/train_pos.txt',header = None,names=['tweets'])
    raw_text2 = pd.read_table('../data/train_neg.txt',header = None,names=['tweets'])
    positiveTweets = raw_text1['tweets'].apply(str)
    negativeTweets = raw_text2['tweets'].apply(str)
    dataset = pd.concat([positiveTweets,negativeTweets])
    
    
    
    # labels/output map : positive-> 0 and negative -> 1
    labels = np.zeros((positiveTweets.shape[0] + negativeTweets.shape[0],),dtype=int)
    labels[negativeTweets.shape[0]:] = 1
    
    #occurence matrix representation
    cv = CountVectorizer(vocabulary=combined_vocab)
    occurence = cv.fit_transform(dataset)
    
    

    #frequency of word matrix representation
    #tf_transformer = TfidfTransformer(use_idf=False).fit(occurence)
    #frequency = tf_transformer.transform(occurence)
    
    
    return occurence,labels

# VISUALIZATION

Visualizing counts of words in tweets against labels using PCA and K-means

In [4]:
def word_vis(X,labels):

    pca = TruncatedSVD(n_components=2)
    projected = pca.fit_transform(X)
    kmeans = KMeans(n_clusters=2, random_state=0).fit(projected)
    first_cluster_index = np.where(kmeans.labels_ == 0)[0]
    second_cluster_index = np.where(kmeans.labels_ == 1)[0]
    clusterA = projected[first_cluster]
    clusterB = projected[second_cluster]
    c1 = pl.scatter(clusterA[:,0],clusterA[:,1],c='r',marker='+')
    c2 = pl.scatter(clusterB[:,0],clusterB[:,1],c='g',marker='o')
    pl.legend([c1, c2], ['ClusterA', 'ClusterB'])
    pl.show()

In [5]:
def embeddings_vis(embeddings,labels):
    
    sentence_embed = generate_sentences(embeddings)
    kmeans = KMeans(n_clusters=2, random_state=0).fit(projected)
    first_cluster_index = np.where(kmeans.labels_ == 0)[0]
    second_cluster_index = np.where(kmeans.labels_ == 1)[0]
    clusterA = projected[first_cluster]
    clusterB = projected[second_cluster]
    c1 = pl.scatter(clusterA[:,0],clusterA[:,1],c='r',marker='+')
    c2 = pl.scatter(clusterB[:,0],clusterB[:,1],c='g',marker='o')
    pl.legend([c1, c2], ['ClusterA', 'ClusterB'])
    pl.show()
    

# PREDICTION 

First module : normalize count vector and perform pca on it (resulting in uncorrelated features) then feed it to a linear model(logistic regression) , non-linear model(svm) 

Second module: apply all previously mentioned models on the full matrix

In [6]:
count,labels = createCountVector("../vocabulary/train_pos_vocab.txt","../vocabulary/train_neg_vocab.txt")
#visualization(count,labels)

In [7]:
embed = np.load('../sample_code/embeddings.npy')

In [12]:
print(count.shape)
print(embed.shape)
       

(196970, 85249)
(21161, 20)
(21161, 21161)


In [7]:
def normalize(a):
    
    col_sum = np.array(a.sum(axis=0).squeeze())[0]
    col_nonzero = a.getnnz(axis=0)
    col_nonzero[np.where(col_nonzero ==0)] = 1
    col_avg = col_sum/col_nonzero
    diagonal_matrix = sp.diags(col_avg, 0)
    b = a.copy()
    b.data = np.ones_like(b.data)
    normalized_matrix = a - b*diagonal_matrix
    return normalized_matrix

In [8]:
def train_test_split(data,labels):
    
    indices = np.arange(0,data.shape[0])
    np.random.shuffle(indices)
    data_split = int(0.8 * indices.shape[0])
    train_index = indices[:data_split]
    test_index = indices[data_split:]

    train_data = data[train_index,:]
    train_label = labels[train_index]
    test_data =  data[test_index,:]
    test_label =  labels[test_index] 
    
    return train_data,train_label,test_data,test_label

In [9]:
def prediction(data,modeltype):
    
    res = normalize(data)
       
    for component in range(10,100,10):
        pca = TruncatedSVD(n_components=component)
        projected = pca.fit_transform(res)
        train_data,train_label,test_data,test_label = train_test_split(projected,labels)
        if modeltype == "linear":
            logistic = LogisticRegression(C=1e5)
            model = logistic.fit(train_data,train_label)
            Y_pred = model.predict(test_data)
            
        elif modeltype == "nonlinear":    
        
            rbf_svc = svm.SVC(kernel='rbf')
            model = rbf_svc.fit(train_data, train_label) 
            Y_pred = model.predict(test_data)
        
        accuracy = accuracy_score(test_label,Y_pred)
        print(str(component) + " " + str(accuracy))           

In [19]:
prediction(count)    

(21161, 21161)


In [14]:
train_data,train_label,test_data,test_label = train_test_split(count,labels)
logistic = LogisticRegression(C=1e5)
model = logistic.fit(train_data,train_label)
Y_pred = model.predict(test_data)
accuracy = accuracy_score(test_label,Y_pred)


0.774128039803


In [None]:
train_data,train_label,test_data,test_label = train_test_split(count,labels)
rbf_svc = svm.SVC(kernel='rbf')
model = rbf_svc.fit(train_data, train_label) 
Y_pred = model.predict(test_data)
accuracy = accuracy_score(test_label,Y_pred)