### Step 1: import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn import svm

  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as libops
  from pandas._libs.interval import (
  from pandas._libs import internals as libinternals
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import (lib, reduction,
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from pandas._libs import algos, lib, writers as libwriters
  from .murmurhash import murmurhash3_32
  from . import libsvm, liblinear
  from . import libsvm_sparse
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .sparsefuncs_fast import csr_row_norms
  from ..utils.seq_dataset im

### Step 2: Load the data

In [2]:
# Load the feedback spreadsheet; header=0 takes the column names from the first row.
# NOTE(review): `delimiter` and `quoting` are CSV-parser options — presumably they
# have no effect on an Excel file, and newer pandas releases may reject them.
# Confirm before upgrading pandas.
df = pd.read_excel('modified dataset for sentiment analysis.xlsx', header=0, delimiter="\t", quoting=3)
# Drop every row that contains a missing value (mutates df in place).
df.dropna(inplace=True)
# Call form of print: same output, and consistent with the other cells.
print(df.shape)
df.head(10)

(1105, 2)


Unnamed: 0,target,Feedback
0,-1.0,bad
1,1.0,Good
2,1.0,Excellent lectures are delivered by teachers a...
3,1.0,Good
4,1.0,teachers give us all the information required ...
5,1.0,Yes
6,-1.0,our college aptitude class was very bored and ...
7,1.0,It is good
8,0.0,waste of time
9,1.0,Good


### Step 3: Split the loaded data into training and test sets

In [3]:
from sklearn.model_selection import train_test_split

# Binary label: +1 where target is strictly positive, -1 otherwise.
# NOTE(review): TARGET is not used by the split below, which keeps the original
# `target` column as the label — confirm which one is intended.
df["TARGET"] = np.where(df['target'].gt(0), 1, -1)

# Hold out a test split of the feedback text and its labels; the fixed seed
# keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.loc[:, 'Feedback'],
    df.loc[:, 'target'],
    random_state=0,
)

In [4]:
#X = df.iloc[ : , 1:-10]
#Y = df.iloc[:, 0:1].values

In [5]:
#from sklearn.cross_validation import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split( X , Y , test_size = 0.2, random_state = 0)


### Step 4: count repeated words in the training set

In [6]:
from collections import Counter

# Tally every token of the training texts in a single pass.
# Splitting on one literal ' ' (not on whitespace runs) produces '' tokens
# wherever spaces repeat, which is why '' shows up among the frequent "words".
count_vocab = Counter(
    token
    for feedback in X_train
    for token in feedback.split(' ')
)

count_vocab.most_common(10)

[(u'good', 337),
 (u'is', 251),
 (u'the', 180),
 (u'are', 149),
 (u'and', 134),
 (u'of', 131),
 (u'', 121),
 (u'to', 108),
 (u'in', 102),
 (u'not', 93)]

## Step 5: stopwords
A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore.
#### Our Input here:
['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
#### Our output here:
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']

In [7]:
import nltk
# Download the NLTK 'stopwords' corpus so that stopwords.words() can load it in the next cell.
nltk.download('stopwords')

  from . import _hashing
  from ._svmlight_format import _load_svmlight_file


[nltk_data] Downloading package stopwords to /home/asus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Keep only the vocabulary entries that are not in the English stop-word list.
# The membership test is an exact, case-sensitive string match.
vocab_reduced = Counter({
    token: freq
    for token, freq in count_vocab.items()
    if token not in stop
})
vocab_reduced.most_common(14)

[(u'good', 337),
 (u'', 121),
 (u'Good', 93),
 (u'university', 39),
 (u'students', 36),
 (u'excellent', 32),
 (u'books', 31),
 (u'library', 29),
 (u'good.', 28),
 (u'course', 27),
 (u'teaching', 25),
 (u'time', 25),
 (u'pattern', 24),
 (u'teachers', 23)]

### Step 6: re - regular expression 
    A regular expression (or RE) specifies a set of strings that matches it; the functions in this module let you check if a particular string matches a given regular expression (or if a given regular expression matches a particular string, which comes down to the same thing).

In [9]:
import re

def preprocessor(text):
    """Normalize raw feedback text before vectorization.

    Strips HTML tags, lower-cases the text, collapses every run of non-word
    characters into a single space, and appends the emoticons found in the
    HTML-stripped text (with the '-' nose removed) at the end.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text, one space, then the space-joined emoticons. When
        no emoticon is present the result therefore ends with that space.
    """
    # Remove HTML markup.
    text = re.sub(r'<[^>]*>', '', text)
    # Collect emoticons such as :) ;-( =D :P before punctuation is stripped.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Replace non-word characters with spaces, convert to lower case, and
    # append the emoticons, dropping the nose character for standardization.
    # Raw strings keep the regex escapes (\W, \), \() from being read as
    # invalid string escapes.
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    text = cleaned + ' ' + ' '.join(emoticons).replace('-', '')
    return text

print(preprocessor('This!! twit man :) is <b>nice</b>'))

this twit man is nice :)


### Step 7: PorterStemmer
Stemming is a normalization method: many variations of a word carry the same meaning, apart from differences in tense.

#####  The reason why we stem is to shorten the lookup, and normalize sentences.
#### Example 1:
I was taking a ride in the car.

I was riding in the car.
#### Example 2:
loving - love

love - loving

In [10]:
from nltk.stem import PorterStemmer
# Single shared stemmer instance, used by tokenizer_porter below.
porter = PorterStemmer()

def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

# Compare plain whitespace tokens with their Porter-stemmed forms on the same sentence.
print(tokenizer('Hi there, I am loving this, like with a lot of love'))
print(tokenizer_porter('Hi there, I am loving this, like with a lot of love'))

['Hi', 'there,', 'I', 'am', 'loving', 'this,', 'like', 'with', 'a', 'lot', 'of', 'love']
['Hi', 'there,', 'I', 'am', u'love', 'this,', 'like', 'with', 'a', 'lot', 'of', 'love']


### Step 8: Pipelining of models

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with its built-in lower-casing and accent stripping turned off;
# text normalization is instead one of the searched options (vect__preprocessor).
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Settings explored by both grids.
common_grid = {
    'vect__ngram_range': [(1, 9)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'vect__preprocessor': [None, preprocessor],
    'clf__C': [1.0, 10.0, 100.0],
}
# Second grid: the same settings with IDF weighting and normalization disabled.
no_idf_grid = dict(common_grid)
no_idf_grid.update({'vect__use_idf': [False], 'vect__norm': [None]})
param_grid = [common_grid, no_idf_grid]

# Vectorizer followed by a linear SVM classifier.
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LinearSVC(random_state=0)),
])

# 5-fold cross-validated search scored by accuracy, using all available cores.
gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

### Step 9: training our model with dataset

In [12]:
# Run the cross-validated grid search on the training split (the slow step of the notebook).
gs_lr_tfidf.fit(X_train, Y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru..., max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 9)], 'vect__tokenizer': [<function tokenizer at 0x7fae9fdcc938>, <function tokenizer_porter at 0x7fae9fdcca28>], 'vect__preprocessor': [None, <function preprocessor at 0x7faea3e38f50>], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'm...ction tokenizer_porter at 0x7fae9fdcca28>], 'vect__use_idf': [False], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_tr

### Step 10: Find the best parameters and best accuracy of the trained model

In [13]:
# Report the winning hyper-parameter combination and its cross-validation score
# (the search was configured with scoring='accuracy').
print('Best parameter set: ' + str(gs_lr_tfidf.best_params_))
print('Best accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'vect__ngram_range': (1, 9), 'vect__tokenizer': <function tokenizer at 0x7fae9fdcc938>, 'vect__preprocessor': <function preprocessor at 0x7faea3e38f50>, 'clf__C': 10.0, 'vect__stop_words': None}
Best accuracy: 0.866


### Step 11: Accuracy of model prediction

In [14]:
# Take the pipeline with the best parameters found by the search and score it on the held-out test split.
clf = gs_lr_tfidf.best_estimator_
print('Accuracy in test: %.3f' % clf.score(X_test, Y_test))

Accuracy in test: 0.856


### Step 12: manual testing

In [15]:
# raw_input exists only on Python 2; fall back to input on Python 3 so the
# cell runs on either interpreter.
try:
    read_feedback = raw_input
except NameError:
    read_feedback = input

# Read one piece of feedback from the user and classify it.
twits = []
user_feedback = read_feedback("give ur feedback: ")
twits.append(user_feedback)

preds = clf.predict(twits)

# Map each predicted label to a message: -1 negative, 1 positive, anything
# else neutral. The call form of print produces the same text as the former
# print statements (including the double space after "{BAD}=").
for pred in preds:
    if pred == -1:
        print("Feedback Negative: {BAD}=  %s" % pred)
    elif pred == 1:
        print("Feedback Positive: {GOOD}= %s" % pred)
    else:
        print("Feedback neutral: {AVERAGE}= %s" % pred)

give ur feedback: that class was really good
Feedback Positive: {GOOD}= 1.0


# Done 😍😍😍😍