# Data load

In [106]:
# Libraries import
import pandas as pd
import numpy as np

pd.set_option('max_colwidth',80) # set max column width

In [107]:
dataset = pd.read_csv("TrainingData.csv",sep=",",header=None,skipfooter=1, engine="python")
dataset.columns = ["id","Review","Sugg_Class"]
dataset = dataset.iloc[:,1:]

In [108]:
dataset

Unnamed: 0,Review,Sugg_Class
0,"""Please enable removing language code from the Dev Center ""language history""...",1
1,"""Note: in your .csproj file, there is a SupportedCultures entry like this: <...",0
2,"""Wich means the new version not fully replaced the old version and this caus...",0
3,"""Some of my users will still receive the old xap version of my app.""",0
4,"""The store randomly gives the old xap or the new xap version of my app.""",0
5,"""My app has a WP7 version and a WP8 version XAP in the same submission.""",0
6,"""The wp7 xap works only on WP7 and the wp8 xap works only for WP8.""",0
7,"""Sometimes the Store gives the wrong wp7 xap version of my app to Windows Ph...",0
8,"""It should be an option to remove the ""ru"" language code from my app 'langua...",1
9,"""Currently if you ever mistakenly selected a ""ru"" language than you will be ...",0


In [134]:
(sum(dataset["Sugg_Class"]) / len(dataset)) * 100


24.66467958271237

In [109]:
## Clean text -----------------------------------------------------------------
import re
import nltk 
from nltk.corpus import stopwords
nltk.download("stopwords") # downloads or updates stopword list
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tiagocabo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [110]:
corpus = []

for i in range(len(dataset)):
    # keep only letters
    review = re.sub('[^a-zA-Z]'," ",dataset["Review"][i])

    # Ensure every word lowercase
    review = review.lower()

    # remove stopwords & stemming (keeps the word's root

    review = review.split()

    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words("english"))]

    review = ' '.join(review) 
        
    corpus.append(review)

In [111]:
corpus

['pleas enabl remov languag code dev center languag histori exampl ever select ru ru ru laguag publish xap store caus tile local show en us default tile local bad',
 'note csproj file supportedcultur entri like supportedcultur de de ru ru ru supportedcultur remov ru languag code publish new xap version old xap version still remain store replac unpublish',
 'wich mean new version fulli replac old version caus seriou problem',
 'user still receiv old xap version app',
 'store randomli give old xap new xap version app',
 'app wp version wp version xap submiss',
 'wp xap work wp wp xap work wp',
 'sometim store give wrong wp xap version app window phone user',
 'option remov ru languag code app languag histori option completli replac fulli replac prevoiu es xap',
 'current ever mistakenli select ru languag bad situat forev live wrong tile display russian languag store english one publish updat remov ru languag updat fulli replac old version trubl',
 'store randomli deliv old wrong version 

In [112]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,1].values

In [113]:
y

array([1, 0, 0, ..., 1, 1, 1])

In [114]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [115]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None)

In [116]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [117]:
cm

array([[532, 671],
       [115, 293]])

In [120]:
(532+293) / (532+671+115+293)

0.5121042830540037

In [135]:
# Precision
Precision = 532 / (532 + 671)
Precision


0.44222776392352453

In [136]:
# Recall
Recall = 532 / (532 + 115)
Recall

0.8222565687789799

In [138]:
# F1
F1 = 2 * (Precision * Recall) / (Precision + Recall)
F1

0.5751351351351351