In [9]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

# Others
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re # For regular expressions
from sklearn.manifold import TSNE

In [12]:
dataset = pd.read_csv("TrainingData.csv", sep=",", header=None, skipfooter=1, engine="python")
dataset.columns = ["id","Review","Sugg_Class"]


In [13]:
# This function converts a text to a sequence of words.
def review_wordlist(review, remove_stopwords=False):
    # 2. Removing non-letter
    review_text = re.sub("[^a-zA-Z]"," ",review)
    # 3. Converting to lower case and splitting
    words = review_text.lower().split()
    # 4. Optionally remove stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        words = [w for w in words if not w in stops]
    
    return(words)

In [14]:
# word2vec expects a list of lists.
# Using punkt tokenizer for better splitting of a paragraph into sentences.

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [15]:
# This function splits a review into sentences
def review_sentences(review, tokenizer, remove_stopwords=False):
    # 1. Using nltk tokenizer
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    # 2. Loop for each sentence
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_wordlist(raw_sentence,\
                                            remove_stopwords))

    # This returns the list of lists
    return sentences

In [16]:
sentences = []
print("Parsing sentences from training set")
for review in dataset["Review"]:
    sentences += review_sentences(review, tokenizer)

Parsing sentences from training set


In [7]:
embeddings_index = dict()
f = open('glove.6B/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [34]:
len(sentences)

8307

In [38]:
WordsList = []
for i in range(len(sentences)):
    WordsList = WordsList + sentences[i]
    
len(WordsList)

147095

In [39]:
len(set(WordsList))

8714

In [8]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.keys())
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [9]:
#Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [10]:
# Calculating average feature vector for training set
clean_train_reviews = []
for review in dataset['Review']:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=False))
    


In [11]:
num_features = 300  # Word vector dimensionality
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, embeddings_index, num_features)

Review 0 of 8052


  app.launch_new_instance()


Review 1000 of 8052
Review 2000 of 8052
Review 3000 of 8052
Review 4000 of 8052
Review 5000 of 8052
Review 6000 of 8052
Review 7000 of 8052
Review 8000 of 8052


In [12]:
# Fitting a random forest classifier to the training data
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)

In [13]:
y = dataset.iloc[:,2].values
 # Splitting the dataset into the Training set and Test set
    
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainDataVecs, y, test_size = 0.2, random_state = 0)   



In [14]:
# remove nan values
X_train = np.nan_to_num(X_train)
y_train = np.nan_to_num(y_train)
X_test = np.nan_to_num(X_test)
y_test = np.nan_to_num(y_test)


np.any(np.isnan(X_train))

False

In [15]:
print("Fitting random forest to training data....")    
forest = forest.fit(X_train, y_train)


Fitting random forest to training data....


In [16]:
# k fold validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

accuracy = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=3, scoring="accuracy")
precision = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=3, scoring="precision")
recall = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=3, scoring="recall")
f1 = cross_val_score(estimator=forest, X=X_train,y=y_train, cv=3, scoring="f1")

results = {"accuracy" :accuracy, "precision" :precision, "recall": recall, "f1" : f1 }
results

{'accuracy': array([0.80950163, 0.80158361, 0.80763857]),
 'precision': array([0.88965517, 0.83225806, 0.81012658]),
 'recall': array([0.23003802, 0.25285171, 0.27376426]),
 'f1': array([0.38208955, 0.35465116, 0.3960114 ])}

In [17]:
# Predicting the Test set results
y_pred = forest.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[1179,   24],
       [ 288,  120]])

In [43]:
## Network architecture
model = Sequential()
model.add(Embedding(8307, 100, input_length=50))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [44]:
sentences

[['please',
  'enable',
  'removing',
  'language',
  'code',
  'from',
  'the',
  'dev',
  'center',
  'language',
  'history',
  'for',
  'example',
  'if',
  'you',
  'ever',
  'selected',
  'ru',
  'and',
  'ru',
  'ru',
  'laguages',
  'and',
  'you',
  'published',
  'this',
  'xap',
  'to',
  'the',
  'store',
  'then',
  'it',
  'causes',
  'tile',
  'localization',
  'to',
  'show',
  'the',
  'en',
  'us',
  'default',
  'tile',
  'localization',
  'which',
  'is',
  'bad'],
 ['note',
  'in',
  'your',
  'csproj',
  'file',
  'there',
  'is',
  'a',
  'supportedcultures',
  'entry',
  'like',
  'this',
  'supportedcultures',
  'de',
  'de',
  'ru',
  'ru',
  'ru',
  'supportedcultures',
  'when',
  'i',
  'removed',
  'the',
  'ru',
  'language',
  'code',
  'and',
  'published',
  'my',
  'new',
  'xap',
  'version',
  'the',
  'old',
  'xap',
  'version',
  'still',
  'remains',
  'in',
  'the',
  'store',
  'with',
  'replaced',
  'and',
  'unpublished'],
 ['wich',
  'mean

In [54]:
## Fit the model
data = pad_sequences(sentences, maxlen=100, truncating="pre")


ValueError: invalid literal for int() with base 10: 'please'

In [52]:
model.fit(data, np.array(dataset["Sugg_Class"]) , validation_split=0.4, epochs=3)

NameError: name 'data' is not defined