In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import os
import re,nltk

In [2]:
dataset=pd.read_csv(r"C:\Users\ajaym\Downloads\twitter-airline-sentiment\Tweets2.csv")

In [3]:
list(dataset.columns.values)

['tweet_id', 'airline', 'text']

In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

def normalizer(tweet):
    only_letters = re.sub("[^a-zA-Z]", " ",tweet) 
    tokens = nltk.word_tokenize(only_letters)[2:]
    lower_case = [l.lower() for l in tokens]
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    return lemmas

In [5]:
pd.set_option('display.max_colwidth', -1)
dataset['normalized_tweet'] = dataset.text.apply(normalizer)
dataset[['text','normalized_tweet']].head()

Unnamed: 0,text,normalized_tweet
0,@VirginAmerica What @dhepburn said.,"[dhepburn, said]"
1,@VirginAmerica plus you've added commercials to the experience... tacky.,"[added, commercial, experience, tacky]"
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[today, must, mean, need, take, another, trip]"
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[really, aggressive, blast, obnoxious, entertainment, guest, face, amp, little, recourse]"
4,@VirginAmerica and it's a really big bad thing about it,"[really, big, bad, thing]"


In [6]:
from nltk import ngrams
def ngrams(input_list):
    bigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:]))]
    trigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:], input_list[2:]))]
    return bigrams+trigrams
dataset['grams'] = dataset.normalized_tweet.apply(ngrams)
dataset[['grams']].head()

Unnamed: 0,grams
0,[dhepburn said]
1,"[added commercial, commercial experience, experience tacky, added commercial experience, commercial experience tacky]"
2,"[today must, must mean, mean need, need take, take another, another trip, today must mean, must mean need, mean need take, need take another, take another trip]"
3,"[really aggressive, aggressive blast, blast obnoxious, obnoxious entertainment, entertainment guest, guest face, face amp, amp little, little recourse, really aggressive blast, aggressive blast obnoxious, blast obnoxious entertainment, obnoxious entertainment guest, entertainment guest face, guest face amp, face amp little, amp little recourse]"
4,"[really big, big bad, bad thing, really big bad, big bad thing]"


In [7]:
pip install -U textblob 

Requirement already up-to-date: textblob in e:\anaconda3\lib\site-packages (0.15.3)
Note: you may need to restart the kernel to use updated packages.


In [7]:
from textblob import TextBlob, Word, Blobber

In [8]:
print(dataset.shape)
dataset.head()

(14640, 5)


Unnamed: 0,tweet_id,airline,text,normalized_tweet,grams
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.,"[dhepburn, said]",[dhepburn said]
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials to the experience... tacky.,"[added, commercial, experience, tacky]","[added commercial, commercial experience, experience tacky, added commercial experience, commercial experience tacky]"
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[today, must, mean, need, take, another, trip]","[today must, must mean, mean need, need take, take another, another trip, today must mean, must mean need, mean need take, need take another, take another trip]"
3,5.7e+17,Virgin America,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[really, aggressive, blast, obnoxious, entertainment, guest, face, amp, little, recourse]","[really aggressive, aggressive blast, blast obnoxious, obnoxious entertainment, entertainment guest, guest face, face amp, amp little, little recourse, really aggressive blast, aggressive blast obnoxious, blast obnoxious entertainment, obnoxious entertainment guest, entertainment guest face, guest face amp, face amp little, amp little recourse]"
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing about it,"[really, big, bad, thing]","[really big, big bad, bad thing, really big bad, big bad thing]"


In [9]:
dataset['text'][:10].apply(lambda x: TextBlob(x).sentiment)

0    (0.0, 0.0)                               
1    (0.0, 0.0)                               
2    (-0.390625, 0.6875)                      
3    (0.0062500000000000056, 0.35)            
4    (-0.3499999999999999, 0.3833333333333333)
5    (-0.2083333333333333, 0.6333333333333333)
6    (0.4666666666666666, 0.7666666666666666) 
7    (0.2, 0.2)                               
8    (1.0, 1.0)                               
9    (0.4666666666666666, 0.6)                
Name: text, dtype: object

In [10]:
dataset['sentiment_score'] = dataset['text'].apply(lambda x: TextBlob(x).sentiment[0])
dataset.head()

Unnamed: 0,tweet_id,airline,text,normalized_tweet,grams,sentiment_score
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.,"[dhepburn, said]",[dhepburn said],0.0
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials to the experience... tacky.,"[added, commercial, experience, tacky]","[added commercial, commercial experience, experience tacky, added commercial experience, commercial experience tacky]",0.0
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[today, must, mean, need, take, another, trip]","[today must, must mean, mean need, need take, take another, another trip, today must mean, must mean need, mean need take, need take another, take another trip]",-0.390625
3,5.7e+17,Virgin America,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[really, aggressive, blast, obnoxious, entertainment, guest, face, amp, little, recourse]","[really aggressive, aggressive blast, blast obnoxious, obnoxious entertainment, entertainment guest, guest face, face amp, amp little, little recourse, really aggressive blast, aggressive blast obnoxious, blast obnoxious entertainment, obnoxious entertainment guest, entertainment guest face, guest face amp, face amp little, amp little recourse]",0.00625
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing about it,"[really, big, bad, thing]","[really big, big bad, bad thing, really big bad, big bad thing]",-0.35


In [11]:
sentiment = pd.Series([]) 
for x in range(len(dataset)):
    if dataset["sentiment_score"][x] > 0.1 :
        sentiment[x] = "positive"
        
    elif dataset["sentiment_score"][x] < (-0.1) :
        sentiment[x] = "negative"
        
    else :
        sentiment[x] = "neutral"
        
dataset.insert(6,"Sentiment", sentiment)

In [12]:
dataset.head()

Unnamed: 0,tweet_id,airline,text,normalized_tweet,grams,sentiment_score,Sentiment
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.,"[dhepburn, said]",[dhepburn said],0.0,neutral
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials to the experience... tacky.,"[added, commercial, experience, tacky]","[added commercial, commercial experience, experience tacky, added commercial experience, commercial experience tacky]",0.0,neutral
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[today, must, mean, need, take, another, trip]","[today must, must mean, mean need, need take, take another, another trip, today must mean, must mean need, mean need take, need take another, take another trip]",-0.390625,negative
3,5.7e+17,Virgin America,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[really, aggressive, blast, obnoxious, entertainment, guest, face, amp, little, recourse]","[really aggressive, aggressive blast, blast obnoxious, obnoxious entertainment, entertainment guest, guest face, face amp, amp little, little recourse, really aggressive blast, aggressive blast obnoxious, blast obnoxious entertainment, obnoxious entertainment guest, entertainment guest face, guest face amp, face amp little, amp little recourse]",0.00625,neutral
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing about it,"[really, big, bad, thing]","[really big, big bad, bad thing, really big bad, big bad thing]",-0.35,negative


In [13]:
import collections
def count_words(input):
    cnt = collections.Counter()
    for row in input:
        for word in row:
            cnt[word] += 1
    return cnt

In [14]:
dataset[['grams']].apply(count_words)['grams'].most_common(20)

[('http co', 1205),
 ('customer service', 549),
 ('cancelled flightled', 484),
 ('late flight', 242),
 ('cancelled flighted', 217),
 ('flight cancelled', 201),
 ('late flightr', 156),
 ('cancelled flight', 154),
 ('booking problem', 148),
 ('fleet fleek', 146),
 ('fleek http', 144),
 ('fleet fleek http', 144),
 ('fleek http co', 144),
 ('flightled flight', 131),
 ('flight cancelled flightled', 129),
 ('hold hour', 129),
 ('flight delayed', 122),
 ('cancelled flightled flight', 115),
 ('call back', 106),
 ('gate agent', 101)]

In [15]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,2))

In [16]:
vectorized_data = count_vectorizer.fit_transform(dataset.text)

In [17]:
def sentiment2target(sentiment):
    return {
        'negative': 0,
        'neutral': 1,
        'positive' : 2
    }[sentiment]
targets = dataset.Sentiment.apply(sentiment2target)

In [18]:
targets[:5]

0    1
1    1
2    0
3    1
4    0
Name: Sentiment, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(vectorized_data,targets, test_size = 0.25, random_state = 1)
X_train=X_train[:,[3,5]]
X_test=X_test[:,[3,5]]

In [20]:
X_train

<10980x2 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn import svm
svc = svm.SVC(kernel = 'rbf') 
svc.fit(X_train,Y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [22]:
Y_pred = svc.predict(X_test)

In [23]:
Y_train.head()

1041     2
1989     0
14309    0
1711     0
11542    2
Name: Sentiment, dtype: int64

In [24]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test,Y_pred)
accuracy = float(cm.diagonal().sum())/len(Y_test)
print("\nAccuracy Of SVM For The Given Dataset : ", accuracy*100)


Accuracy Of SVM For The Given Dataset :  48.63387978142077
