# In [0]:
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn import utils
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import KFold
import re
from collections import OrderedDict
from sklearn.model_selection import GridSearchCV 

import nltk

# Fetch the NLTK resources needed at runtime. 'stopwords' backs the
# stopwords.words('english') call made further down in this script.
# NOTE(review): 'punkt' and 'perluniprops' are presumably needed by
# word_tokenize and MosesDetokenizer respectively — confirm.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('perluniprops')

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# NOTE(review): nltk.tokenize.moses appears to be missing from newer NLTK
# releases — confirm the installed NLTK version still provides it.
from nltk.tokenize.moses import MosesDetokenizer
from nltk.tokenize import RegexpTokenizer

from sklearn.metrics import confusion_matrix


# For Google Colab: mount Google Drive at /content/drive so the review files
# read below (paths under '/content/drive/My Drive/...') are reachable.

from google.colab import drive
drive.mount('/content/drive')


def readData(filepath):
  '''
  Read a labelled-sentence file and return its sentences and labels.

  Each non-blank line is expected to look like "<sentence>\t<label>", where
  <label> is an integer. The line is split on its LAST tab, so a sentence
  that itself contains a tab is kept intact instead of breaking the parse.
  Blank lines are skipped.

  Parameters:
    filepath: path of the tab-separated text file (read as UTF-8).

  Returns:
    dict with two parallel lists: 'data' (sentence strings, exactly as they
    appear before the tab) and 'targets' (labels converted to int).

  Raises:
    ValueError: if a non-blank line has no tab separator or its label is
      not an integer.
  '''
  data=[]
  targets=[]
  # explicit encoding so the result does not depend on the platform default
  with open(filepath, encoding='utf-8') as fp:
    for line_no, line in enumerate(fp, start=1):
      line=line.rstrip('\r\n')
      if not line.strip():
        # tolerate empty / trailing blank lines
        continue

      # split on the last tab only: everything before it is the sentence
      sentence, sep, label = line.rpartition('\t')
      if not sep:
        raise ValueError(
          "Line {} of {!r} has no tab-separated label".format(line_no, filepath))

      data.append(sentence)
      targets.append(int(label))

  # return the data and the targets in a dict
  return {'data':data,'targets':targets}

# Load the three labelled review files from the mounted Drive folder.
yelp=readData('/content/drive/My Drive/google_colab/reviews/yelp_labelled.txt')
imdb=readData('/content/drive/My Drive/google_colab/reviews/imdb_labelled.txt')
amazon=readData("/content/drive/My Drive/google_colab/reviews/amazon_cells_labelled.txt")


# we make one list from the 3 datasets
X=yelp['data']+imdb['data']+amazon['data']
y=yelp['targets']+imdb['targets']+amazon['targets']

# For a quick experiment on a smaller sample, use a subset instead, e.g.:
#   X=imdb['data'][0:500]
#   y=imdb['targets'][0:500]

data={'Sentence':X,'Class':y}

# we create a dataframe
df = pd.DataFrame(data, columns = ['Sentence', 'Class'])

# we remove the duplicates (rows where both Sentence and Class repeat)
numOfnoDup = df.drop_duplicates(keep='first',inplace=False)
print('No of duplicate rows that are dropped:',len(df)-len(numOfnoDup))

# Actually continue with the de-duplicated rows: previously the result was
# only used for the count above and `df` still contained every duplicate.
# The index is rebuilt so it is contiguous for the loops that follow.
df = numOfnoDup.reset_index(drop=True)

# a set with the English stop words
stop_words = set(stopwords.words('english'))

porter = PorterStemmer()
# a regexp tokenizer that keeps runs of word characters, i.e. drops punctuation
tokenizer = RegexpTokenizer(r'\w+')
# built once and reused; previously a new detokenizer was created per row
detokenizer = MosesDetokenizer()

new_sentence=[]
all_words=[]

for ind in df.index:
  # we take each row
  row=df['Sentence'][ind]
  # we split the row to words and take only the words from the sentence
  token=tokenizer.tokenize(row)
  # also remove all stop words; compare in lower case so capitalised stop
  # words ("The", "I", ...) are removed too
  # NOTE(review): assumes the NLTK stop-word entries are lowercase — confirm
  token=[w for w in token if w.lower() not in stop_words]
  # then we stem each word
  token=[porter.stem(w) for w in token]
  all_words.extend(token)
  # we create the stemmed sentence from the token list
  new_sentence.append(detokenizer.detokenize(token, return_str=True))


# vocabulary: every distinct stem, in order of first appearance
unique_words=list(OrderedDict.fromkeys(all_words))


# we create a new column named stem_Sentence
df['stem_Sentence']=pd.Series(new_sentence, index=df.index)

# Local import: Counter is only needed for the bag-of-words counts below.
from collections import Counter

# One bag-of-words count vector per sentence; column j holds how many times
# unique_words[j] occurs in that sentence.
vector_Sentence=[]

for i in df.index:
  row=df['stem_Sentence'][i]
  # Count whole words. The previous `row.count(x)` counted substrings of the
  # sentence string, so a short stem such as "he" was also counted inside
  # longer words such as "the". The \w+ pattern matches the tokenizer used
  # to build the vocabulary.
  word_counts=Counter(re.findall(r'\w+', row))
  # a Counter returns 0 for words that are absent from this sentence
  vector_Sentence.append([word_counts[x] for x in unique_words])

def _safe_ratio(numerator, denominator):
  '''Return numerator/denominator, or 0.0 when the denominator is zero.'''
  return numerator / denominator if denominator else 0.0


print(len(df['Class']))

#split the data to train and test set
X_train, X_test, y_train, y_test = train_test_split(vector_Sentence,df['Class'], test_size=0.3)

# Disabled hyper-parameter search, kept for reference (presumably how the
# SVC settings used below were chosen):
#
# tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-2,1e-3, 1e-4],
#                      'C': [1, 10, 100, 1000]},
#                     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
# grid = GridSearchCV(svm.SVC(), tuned_parameters)
# grid.fit(X_train,y_train)
# print(grid.best_params_)       # best parameters after tuning
# print(grid.best_estimator_)    # model after hyper-parameter tuning

model= svm.SVC(C=1000,kernel='rbf',gamma=0.0001)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

# Pass the full label set explicitly so the matrix keeps one row/column per
# class even when a class is missing from y_test / y_pred; otherwise the
# four-way unpack below fails on a smaller matrix.
class_labels=sorted(set(df['Class']))
tn, fp, fn, tp =  confusion_matrix(y_test,y_pred,labels=class_labels).ravel()

print("tn:{} fp:{} fn:{} tp:{}".format(tn,fp,fn,tp))

# Every ratio goes through _safe_ratio so an empty denominator (e.g. no
# positive predictions) yields 0.0 instead of a division-by-zero result.
accuracy=_safe_ratio(tn+tp, tn+tp+fn+fp)
precision=_safe_ratio(tp, tp+fp)
recall=_safe_ratio(tp, tp+fn)
fmeasure=_safe_ratio(precision*recall*2, precision+recall)
sensitivity=_safe_ratio(tp, tp+fn)
specificity=_safe_ratio(tn, tn+fp)


print("Test set")
print("Mean Accuracy :",accuracy)
print("Mean Precision :",precision)
print("Mean recall :",recall)   
print("Mean Fmeasure :",fmeasure)
print("Mean Sensitivity :",sensitivity)
print("Mean Specificity :",specificity)

