In [17]:
# !pip install joblib

import numpy as np
import matplotlib.pyplot as plt

In [18]:
def load_data(filename):
    import csv
    data=[]
    dataNames=[]
    with open(filename) as csv_file:
        csv_reader=csv.reader(csv_file,delimiter=',')
        load_number=0
        for row in csv_reader:
            if load_number==0:
                dataNames=row
            else:
                data.append(row)
            load_number+=1
            
    inputs=[data[i][0] for i in range(len(data))]
    outputs=[data[i][1] for i in range(len(data))]
    labelNames=list(set(outputs))
    
    return inputs,outputs,labelNames

def load_dataset(filename):
    import re
    data=[]
    dataNames=[]
    with open(filename) as file:
        row=file.readline()
        while row!='':
            row=row.split('>')
            row[1]=str(row[1].split('<')[0])
            row[0]=row[0].split('<')[1]
#             print(row)
            data.append(row)
            row=file.readline()

    inputs=[data[i][1] for i in range(len(data))]
    outputs=[data[i][0] for i in range(len(data))]
    labelNames=list(set(outputs))
    return inputs,outputs,labelNames

In [21]:
# load data
import os

# crtDir=os.getcwd()
# fileName=os.path.join(crtDir,'data','reviews_mixed.csv')
inputs,outputs,labelNames=load_data('../input/kmeans-diana/data/reviews_mixed.csv')

print(labelNames)


['negative', 'positive']


In [22]:
# split data
def split_data(inputs,outputs):
    np.random.seed(5)
    
    indexes=[i for i in range(len(inputs))]
    trainSample=np.random.choice(indexes,int(0.8*len(indexes)),replace=False)
    testSample=[i for i in indexes if i not in trainSample]
    
    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]
    testInputs = [inputs[i] for i in testSample]
    testOutputs = [outputs[i] for i in testSample]
    
    return trainInputs,trainOutputs,testInputs,testOutputs

trainInput,trainOutput,testInput,testOutput=split_data(inputs,outputs)        

In [23]:
#normalise the data using the 3 representations
# print(trainInput)
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_bag=CountVectorizer()

trainNormalisedInput_bag = vectorizer_bag.fit_transform(trainInput)
testNormalisedInput_bag=vectorizer_bag.transform(testInput)

#vocab
# print('vocab: ', vectorizer_bag.get_feature_names()[:5])
#features
# print('features: ' , testNormalisedInput_bag.toarray()[:5])

In [24]:
#Td-idf
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tf=TfidfVectorizer(max_features=150)

trainNormalisedInput_tf = vectorizer_tf.fit_transform(trainInput)
testNormalisedInput_tf=vectorizer_tf.transform(testInput)

#vocab=
# print('vocab: ', vectorizer_tf.get_feature_names()[:5])
#features
# print('features: ' , trainNormalisedInput_tf.toarray()[:5])

In [None]:
!pip install wget
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [31]:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

word2vecmodel=KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)



In [32]:
def featureComputation(model, data):
    features = []
    phrases = [ phrase.split() for phrase in data]
    for phrase in phrases:
        # compute the embeddings of all the words from a phrase (words of more than 2 characters) known by the model
        vectors = [model[word] for word in phrase if (len(word) > 2) and (word in model.vocab.keys())]
        if len(vectors) == 0:
            result = [0.0] * model.vector_size
        else:
            result = np.sum(vectors, axis=0) / len(vectors)
        features.append(result)
    return features

trainNormalisedInput_word2vec = featureComputation(word2vecmodel, trainInput)
testNormalisedInput_word2vec = featureComputation(word2vecmodel, testInput)

# trainNormalisedInput_word2vec=hybrid_normalise(trainInput,trainNormalisedInput_word2vec)
# testNormalisedInput_word2vec=hybrid_normalise(trainInput,testNormalisedInput_word2vec)


In [33]:
def evalMLP(real_labels,computed_labels,label_names):
    from sklearn.metrics import confusion_matrix
    
    cm=confusion_matrix(real_labels,computed_labels)
    
    acc=sum([cm[i][i] for i in range(len(label_names))])/len(real_labels)
    prediction={}
    recall={}
    for i in range(len(label_names)):
        prediction[label_names[i]]=cm[i][i]/sum([cm[j][i] for j in range(len(label_names))])
        recall[label_names[i]]=cm[i][i]/sum(cm[i][j] for j in range(len(label_names)))
    
    return acc,prediction,recall,cm

def plot_confusion_matrix(cm,class_names,title):
    import itertools
    
    plt.figure(figsize=(12,8))
    plt.imshow(cm,interpolation='nearest', cmap='Blues')
    plt.title('The Confusion matrix of '+title)
    plt.colorbar()
    
    tick_marks=np.arange(len(class_names))
    plt.xticks(tick_marks,class_names,rotation=45)
    plt.yticks(tick_marks,class_names)
    
    #itertools face produse cartezian
    #daca ar fi itertools.product('ABCD',2)=['AA','AB','AC','AD'..etc]
    text_format='d'
    thresh=cm.max()/2
    for row,column in itertools.product(range(cm.shape[0]),range(cm.shape[1])):
        plt.text(column,row,format(cm[row,column],text_format),horizontalalignment='center', color='white' if cm[row,column]>thresh else 'black')
        
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

    plt.show()

In [41]:
#train and test the data
import warnings
warnings.filterwarnings('ignore')

#supervised
from sklearn import neural_network
print('Supervised:')

#BoW
print('BoW')
supervised_classifier= neural_network.MLPClassifier(hidden_layer_sizes=(20,2),max_iter=1000,activation='relu',solver='sgd',random_state=1,verbose=0,learning_rate_init=0.1)
# X_flat=np.array(trainInput)
supervised_classifier.fit(trainNormalisedInput_bag,trainOutput)
predicted_labels=supervised_classifier.predict(testNormalisedInput_bag)
# print(predicted_labels)
acc,prediction,recall,cm= evalMLP(np.array(testOutput),predicted_labels,labelNames)

# plot_confusion_matrix(cm,labelNames,'Text classification [negative/positive]')

print('acc: ',acc)
print('prediction: ',prediction)
print('recall: ',recall)

# acc:  0.8571428571428571
# prediction:  {'negative': 0.8518518518518519, 'positive': 0.8666666666666667}
# recall:  {'negative': 0.92, 'positive': 0.7647058823529411}

print('Tf-idf')
supervised_classifier= neural_network.MLPClassifier(hidden_layer_sizes=(25,2),max_iter=1000,activation='relu',solver='sgd',random_state=1,verbose=0,learning_rate_init=0.1)
# X_flat=np.array(trainInput)
supervised_classifier.fit(trainNormalisedInput_tf,trainOutput)
predicted_labels=supervised_classifier.predict(testNormalisedInput_tf)
acc,prediction,recall,cm= evalMLP(np.array(testOutput),predicted_labels,labelNames)

# plot_confusion_matrix(cm,labelNames,'Text classification [negative/positive]')

print('acc: ',acc)
print('prediction: ',prediction)
print('recall: ',recall)

# acc:  0.8571428571428571
# prediction:  {'negative': 0.9130434782608695, 'positive': 0.7894736842105263}
# recall:  {'negative': 0.84, 'positive': 0.8823529411764706}

print('Word2vec pre-trained')
supervised_classifier= neural_network.MLPClassifier(hidden_layer_sizes=(25,2),max_iter=1000,activation='relu',solver='sgd',random_state=1,verbose=0,learning_rate_init=0.03)
# supervised_classifier= neural_network.MLPClassifier(hidden_layer_sizes=(25,2),max_iter=1000,activation='relu',solver='sgd',random_state=1,verbose=50,learning_rate_init=0.03)
# X_flat=np.array(trainInput)
supervised_classifier.fit(trainNormalisedInput_word2vec,trainOutput)
predicted_labels=supervised_classifier.predict(testNormalisedInput_word2vec)
acc,prediction,recall,cm= evalMLP(np.array(testOutput),predicted_labels,labelNames)

# plot_confusion_matrix(cm,labelNames,'Text classification [negative/positive]')

print('acc: ',acc)
print('prediction: ',prediction)
print('recall: ',recall)

# acc:  0.7857142857142857
# prediction:  {'negative': 0.8636363636363636, 'positive': 0.7}
# recall:  {'negative': 0.76, 'positive': 0.8235294117647058}

Supervised:
BoW
acc:  0.8571428571428571
prediction:  {'negative': 0.8518518518518519, 'positive': 0.8666666666666667}
recall:  {'negative': 0.92, 'positive': 0.7647058823529411}
Tf-idf
acc:  0.8571428571428571
prediction:  {'negative': 0.9130434782608695, 'positive': 0.7894736842105263}
recall:  {'negative': 0.84, 'positive': 0.8823529411764706}
Word2vec pre-trained
acc:  0.7857142857142857
prediction:  {'negative': 0.8636363636363636, 'positive': 0.7}
recall:  {'negative': 0.76, 'positive': 0.8235294117647058}


In [42]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

clusters=len(labelNames)
print('Unsupervised ')

print('Bag of words')
unsupervisedClassifier = KMeans(n_clusters=clusters, random_state=0)
unsupervisedClassifier.fit(trainNormalisedInput_bag)
computedTestIndexes = unsupervisedClassifier.predict(testNormalisedInput_bag)
computedTestOutputs = [labelNames[value] for value in computedTestIndexes]

print("acc: ", accuracy_score(testOutput, computedTestOutputs))

print('TF-IDF')
unsupervisedClassifier = KMeans(n_clusters=clusters, random_state=0)
unsupervisedClassifier.fit(trainNormalisedInput_tf)
computedTestIndexes = unsupervisedClassifier.predict(testNormalisedInput_tf)
computedTestOutputs = [labelNames[value] for value in computedTestIndexes]

print("acc: ", accuracy_score(testOutput, computedTestOutputs))

print('Word2vec pre-trained')
unsupervisedClassifier = KMeans(n_clusters=clusters, random_state=0)
unsupervisedClassifier.fit(trainNormalisedInput_word2vec)
computedTestIndexes = unsupervisedClassifier.predict(testNormalisedInput_word2vec)
computedTestOutputs = [labelNames[value] for value in computedTestIndexes]

print("acc: ", accuracy_score(testOutput, computedTestOutputs))

# Unsupervised 
# Bag of words
# acc:  0.5714285714285714
# TF-IDF
# acc:  0.23809523809523808
# Word2vec pre-trained
# acc:  0.40476190476190477

#Hybrid
# Unsupervised 
# Bag of words
# acc:  0.42857142857142855
# TF-IDF
# acc:  0.7619047619047619
# Word2vec pre-trained
# acc:  0.5952380952380952
    

Unsupervised 
Bag of words
acc:  0.42857142857142855
TF-IDF
acc:  0.6904761904761905
Word2vec pre-trained
acc:  0.35714285714285715
