# POS embedings


In [9]:
# Libraries import
import pandas as pd
import numpy as np
import re # For regular expressions
import nltk

pd.set_option('max_colwidth',80) # set max column width

In [10]:
dataset = pd.read_csv("TrainingData.csv", sep=",", header=None, skipfooter=1, engine="python")
dataset.columns = ["id","Review","Sugg_Class"]
dataset.shape


(8052, 3)

In [11]:
##### Submission data
test_data_for_subtaskA_predictions = pd.read_csv("TrialData_SubtaskA_Test.csv",header = None, sep=",", skipfooter=1, engine="python")



In [12]:
## Modeling data
modeling_data = dataset.append(test_data_for_subtaskA_predictions)
modeling_data.shape

  result = result.union(other)


(8643, 6)

In [15]:
# This function converts a text to a sequence of words.
def review_wordlist(review, remove_stopwords=False):
    # 2. Removing non-letter
    review_text = re.sub("[^a-zA-Z]"," ",review)
    # 3. Converting to lower case and splitting
    words = review_text.lower().split()
    # 4. Optionally remove stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        words = [w for w in words if not w in stops]
    
    ## POS Tagging
    POS = nltk.pos_tag(tokens=words)
    POS_cat = []
    for pair in POS:
        POS_cat.append(pair[1]) 
          
    return(POS_cat)

In [16]:
review_wordlist("Ola mundo")

['NNS', 'NN']

In [17]:
# word2vec expects a list of lists.
# Using punkt tokenizer for better splitting of a paragraph into sentences.

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [18]:
# This function splits a review into sentences
def review_sentences(review, tokenizer, remove_stopwords=False):
    # 1. Using nltk tokenizer
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    # 2. Loop for each sentence
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_wordlist(raw_sentence,\
                                            remove_stopwords))

    # This returns the list of lists
    return sentences

In [19]:
sentences = []
print("Parsing sentences from training set")
for review in modeling_data["Review"]:
    sentences += review_sentences(review, tokenizer)


Parsing sentences from training set


AttributeError: 'float' object has no attribute 'strip'

In [None]:
sentences[0]

In [201]:
## POS Tagging
sentences_POS = []
for sentence in sentences:
    
    POS = nltk.pos_tag(tokens=sentence)
   
    POS_cat = []
    for pair in POS:
        POS_cat.append((pair[0] + "/" + pair[1]))
    
    sentences_POS.append(POS_cat)

In [202]:
sentences_POS[0]

['please/NN/NN',
 'enable/JJ/NN',
 'removing/VBG/NN',
 'language/NN/NN',
 'code/NN/NN',
 'from/IN/NN',
 'the/DT/NN',
 'dev/NN/NN',
 'center/NN/NN',
 'language/NN/NN',
 'history/NN/NN',
 'for/IN/NN',
 'example/NN/NN',
 'if/IN/NN',
 'you/PRP/NN',
 'ever/RB/NN',
 'selected/VBN/NN',
 'ru/NN/NN',
 'and/CC/NN',
 'ru/NN/NN',
 'ru/NN/NN',
 'laguages/NNS/NN',
 'and/CC/NN',
 'you/PRP/NN',
 'published/VBP/NN',
 'this/DT/NN',
 'xap/NN/NNP',
 'to/TO/NN',
 'the/DT/NN',
 'store/NN/NN',
 'then/RB/NN',
 'it/PRP/NN',
 'causes/VBZ/NN',
 'tile/JJ/NN',
 'localization/NN/NN',
 'to/TO/NN',
 'show/VB/NN',
 'the/DT/NN',
 'en/FW/NN',
 'us/PRP/JJ',
 'default/VBP/NN',
 'tile/JJ/NN',
 'localization/NN/NN',
 'which/WDT/NN',
 'is/VBZ/NN',
 'bad/JJ/NN']

In [None]:
len(sentences)

In [None]:
# Importing the built-in logging module
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Creating the model and setting values for the various parameters
num_features = 300  # Word vector dimensionality
min_word_count = 10 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 10        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

# Initializing the train model
from gensim.models import word2vec
print("Training model....")
model = word2vec.Word2Vec(sentences_POS,\
                          workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

# To make the model memory efficient
model.init_sims(replace=True)

# Saving the model for later use. Can be loaded using Word2Vec.load()
model_name = "joinedTrainANDtestdataset"
model.save(model_name)

In [None]:
# This will print the most similar words present in the model
model.wv.most_similar("enable/JJ/NN")

In [None]:
# This will give the total number of words in the vocabolary created from this dataset
model.wv.vectors.shape

In [None]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [None]:
#Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [None]:
# Calculating average feature vector for training set
clean_train_reviews = []
for review in dataset['Review']:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=False))
    
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

In [None]:
# Fitting a random forest classifier to the training data
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)

In [None]:
clean_train_reviews[0]

In [None]:
y = dataset.iloc[:,2].values
 # Splitting the dataset into the Training set and Test set
    
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainDataVecs, y, test_size = 0.2, random_state = 0)   


In [None]:
# remove nan values
X_train = np.nan_to_num(X_train)
y_train = np.nan_to_num(y_train)
X_test = np.nan_to_num(X_test)
y_test = np.nan_to_num(y_test)


np.any(np.isnan(X_train))


In [None]:
print("Fitting random forest to training data....")    
forest = forest.fit(X_train, y_train)


In [None]:
# Predicting the Test set results
y_pred = forest.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [None]:
# k fold validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

accuracy = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=3, scoring="accuracy")
precision = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=3, scoring="precision")
recall = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=3, scoring="recall")
f1 = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=3, scoring="f1")

results = {"accuracy" :accuracy, "precision" :precision, "recall": recall, "f1" : f1 }
results

In [189]:
cm

array([[1203,    0],
       [ 408,    0]])

In [None]:
##### Submission dataset
# Calculating average feature vector for training set
clean_submission_reviews = []
for review in test_data_for_subtaskA_predictions[1]:
    clean_submission_reviews.append(review_wordlist(review, remove_stopwords=False))
    
submissionDataVecs = getAvgFeatureVecs(clean_submission_reviews, model, num_features)

# remove nan values
submissionDataVecs = np.nan_to_num(submissionDataVecs)

np.any(np.isnan(submissionDataVecs))


In [79]:
# Predicting the sentiment values for test data and saving the results in a csv file 
result = forest.predict(submissionDataVecs)
submission= pd.DataFrame(data={1:test_data_for_subtaskA_predictions[0],2:test_data_for_subtaskA_predictions[1], 3:result})



Unnamed: 0,1,2,3
0,13101,"""I'm not asking Microsoft to Gives permission like Android so any app can ta...",0
1,13121,"""somewhere between Android and iPhone.""",0
2,13131,"""And in the Windows Store you can flag the App [Requires Trust] for example.""",0
3,13132,"""Many thanks Sameh Hi, As we know, there is a lot of limitations is WP8 OS d...",0
4,13133,"""The idea is that we can develop a regular app and we request our permission...",0
5,13161,"""Please add this simple and extremely helpful feature.""",1
6,13182,"""Why not let us have several pages that we can put tiles on and name whateve...",0
7,13191,"""The Idea is if user running our app and by any reason he rebooted the phone...",0
8,13192,"""Add support for the biometric framework from windows 7/8 to allow hardware ...",1
9,13196,"""It would be more natural to allow the physical search key to activate the s...",1


In [87]:
# SVM
# Fitting SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [88]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [90]:
# k fold validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

accuracy = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=5, scoring="accuracy")
precision = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=5, scoring="precision")
recall = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=5, scoring="recall")
f1 = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=5, scoring="f1")

results = {"accuracy" :accuracy, "precision" :precision, "recall": recall, "f1" : f1 }
results

{'accuracy': array([0.82777347, 0.83242824, 0.82001552, 0.81973582, 0.82439782]),
 'precision': array([0.715     , 0.74178404, 0.72164948, 0.72115385, 0.72769953]),
 'recall': array([0.44936709, 0.49683544, 0.4335443 , 0.4952381 , 0.46666667]),
 'f1': array([0.56031128, 0.58691589, 0.5234375 , 0.56133829, 0.58691589])}

In [80]:
submission.to_csv( "submission.csv", index=False, quoting=3 , escapechar=",")

In [81]:
submission

Unnamed: 0,1,2,3
0,13101,"""I'm not asking Microsoft to Gives permission like Android so any app can ta...",0
1,13121,"""somewhere between Android and iPhone.""",0
2,13131,"""And in the Windows Store you can flag the App [Requires Trust] for example.""",0
3,13132,"""Many thanks Sameh Hi, As we know, there is a lot of limitations is WP8 OS d...",0
4,13133,"""The idea is that we can develop a regular app and we request our permission...",0
5,13161,"""Please add this simple and extremely helpful feature.""",1
6,13182,"""Why not let us have several pages that we can put tiles on and name whateve...",0
7,13191,"""The Idea is if user running our app and by any reason he rebooted the phone...",0
8,13192,"""Add support for the biometric framework from windows 7/8 to allow hardware ...",1
9,13196,"""It would be more natural to allow the physical search key to activate the s...",1


NameError: name 'filename' is not defined