# 05. Working with text data

## 02. Pandas string functions 

In [39]:
import pandas as pd
import numpy as np

s=pd.Series(['0', 'John Wood', 'Colin Welsh', 'my list', '02456', np.nan, 'HELLO WORLD', 'water%'])

In [40]:
s.str.lower()

0              0
1      john wood
2    colin welsh
3        my list
4          02456
5            NaN
6    hello world
7         water%
dtype: object

In [41]:
s.str.upper()

0              0
1      JOHN WOOD
2    COLIN WELSH
3        MY LIST
4          02456
5            NaN
6    HELLO WORLD
7         WATER%
dtype: object

In [42]:
s.str.len()

0     1.0
1     9.0
2    11.0
3     7.0
4     5.0
5     NaN
6    11.0
7     6.0
dtype: float64

In [43]:
s.str.split(' ')

0               [0]
1      [John, Wood]
2    [Colin, Welsh]
3        [my, list]
4           [02456]
5               NaN
6    [HELLO, WORLD]
7          [water%]
dtype: object

In [56]:
substrings = s.str.split(' ', expand=True)
substrings

Unnamed: 0,0,1
0,0,
1,John,Wood
2,Colin,Welsh
3,my,list
4,02456,
5,,
6,HELLO,WORLD
7,water%,


In [57]:
s.str.replace('%',' percent ')

0                 0
1         John Wood
2       Colin Welsh
3           my list
4             02456
5               NaN
6       HELLO WORLD
7    water percent 
dtype: object

In [58]:
s.str[0:2]

0      0
1     Jo
2     Co
3     my
4     02
5    NaN
6     HE
7     wa
dtype: object

In [59]:
    s.str.slice_replace(0,2, '___')

0             ___
1      ___hn Wood
2    ___lin Welsh
3        ___ list
4          ___456
5             NaN
6    ___LLO WORLD
7         ___ter%
dtype: object

In [61]:
flag = s.str.contains('0', na=False)
flag

0     True
1    False
2    False
3    False
4     True
5    False
6    False
7    False
dtype: bool

In [45]:
s[flag]

0        0
4    02456
dtype: object

## 03. Example: cleaning up the movies dataset

In [46]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [47]:
genres = movies.genres
genres.head()

0    [{"id": 28, "name": "Action"}, {"id": 12, "nam...
1    [{"id": 12, "name": "Adventure"}, {"id": 14, "...
2    [{"id": 28, "name": "Action"}, {"id": 12, "nam...
3    [{"id": 28, "name": "Action"}, {"id": 80, "nam...
4    [{"id": 28, "name": "Action"}, {"id": 12, "nam...
Name: genres, dtype: object

In [48]:
import json

json_obj = json.loads(genres[0]) # Load json string
names = [x['name'] for x in json_obj] # ['Action', 'Adventure', 'Fantasy', 'Science Fiction']
', '.join(names) # 'Action, Adventure, Fantasy, Science Fiction'

'Action, Adventure, Fantasy, Science Fiction'

In [49]:
def transform(s):
    s=s.str.strip('[]')
    s=s.str.replace('{','')
    s=s.str.replace('}','')
    s=s.str.replace(',','')
    s=s.str.replace('\"id\":','')
    s=s.str.replace('\"name\":','')
    s=s.str.replace('"','')
    s=s.str.replace('0','')
    s=s.str.replace('1','')
    s=s.str.replace('2','')
    s=s.str.replace('3','')
    s=s.str.replace('4','')
    s=s.str.replace('5','')
    s=s.str.replace('6','')
    s=s.str.replace('7','')
    s=s.str.replace('8','')
    s=s.str.replace('9','')
    s=s.str.replace('    ',', ')
    s=s.str.replace('   ','')
    return(s)

In [50]:
genres = transform(genres)  
genres.head()

0    Action, Adventure, Fantasy, Science Fiction
1                     Adventure, Fantasy, Action
2                       Action, Adventure, Crime
3                 Action, Crime, Drama, Thriller
4             Action, Adventure, Science Fiction
Name: genres, dtype: object

In [52]:
movies.genres=genres
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"Action, Adventure, Fantasy, Science Fiction",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"Adventure, Fantasy, Action",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"Action, Adventure, Crime",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"Action, Crime, Drama, Thriller",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"Action, Adventure, Science Fiction",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


## 04. Exercise: further practice with the movies dataset
same for the keywords column

In [54]:
keywords = movies.keywords
keywords.head()

0    [{"id": 1463, "name": "culture clash"}, {"id":...
1    [{"id": 270, "name": "ocean"}, {"id": 726, "na...
2    [{"id": 470, "name": "spy"}, {"id": 818, "name...
3    [{"id": 849, "name": "dc comics"}, {"id": 853,...
4    [{"id": 818, "name": "based on novel"}, {"id":...
Name: keywords, dtype: object

In [63]:
keywords=transform(keywords)
keywords.head()

0    culture clash, future, space war, space colony...
1    ocean, drug abuse, exotic island, east india t...
2    spy, based on novel, secret agent, sequel, mi,...
3    dc comics, crime fighter, terrorist, secret id...
4    based on novel, mars, medallion, space travel,...
Name: keywords, dtype: object

In [66]:
keywords_df = keywords.str.split(',', expand=True)
keywords_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,culture clash,future,space war,space colony,society,space travel,futuristic,romance,space,alien,...,,,,,,,,,,
1,ocean,drug abuse,exotic island,east india trading company,love of one's life,traitor,shipwreck,strong woman,ship,alliance,...,,,,,,,,,,
2,spy,based on novel,secret agent,sequel,mi,british secret service,united kingdom,,,,...,,,,,,,,,,
3,dc comics,crime fighter,terrorist,secret identity,burglar,hostage drama,time bomb,gotham city,vigilante,cover-up,...,,,,,,,,,,
4,based on novel,mars,medallion,space travel,princess,alien,steampunk,martian,escape,edgar rice burroughs,...,,,,,,,,,,


In [69]:
keywords =  keywords_df[0]+', '+keywords_df[1]+', '+keywords_df[2]
keywords.head()

0       culture clash,  future,  space war
1       ocean,  drug abuse,  exotic island
2      spy,  based on novel,  secret agent
3    dc comics,  crime fighter,  terrorist
4        based on novel,  mars,  medallion
dtype: object

In [70]:
movies.keywords = keywords
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"Action, Adventure, Fantasy, Science Fiction",http://www.avatarmovie.com/,19995,"culture clash, future, space war",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"Adventure, Fantasy, Action",http://disney.go.com/disneypictures/pirates/,285,"ocean, drug abuse, exotic island",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"Action, Adventure, Crime",http://www.sonypictures.com/movies/spectre/,206647,"spy, based on novel, secret agent",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"Action, Crime, Drama, Thriller",http://www.thedarkknightrises.com/,49026,"dc comics, crime fighter, terrorist",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"Action, Adventure, Science Fiction",http://movies.disney.com/john-carter,49529,"based on novel, mars, medallion",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


## 05. Regular expressions

* [a-z] - match any lowercase letter
* [A-Z] - match any uppercase letter
* [0-9] - match any digit
* [a-zA-Z0-9] - match any letter or digit

Adding the ^ symbol inside the square brackets matches any characters NOT in the set. So we have
* [^a-z] - match any character that is not a lowercase letter
* [^A-Z] - match any character that is not an uppercase letter
* [^0-9] - match any character that is not a digit
* [^a-zA-Z0-9] - match any character that is not a letter or digit

On top of this, we can use certain shorthand for specifying common sequences:
* \d - match any digit
* \D - match any non-digit
* \w - match any alphanumeric character (letter or digit) or an underscore (_)
* \W - match any character that is not alphanumeric or an underscore as described above
* \s - match whitespace (spaces, tabs, newlines, etc.)
* \S - match non-whitespace* 

In [72]:
s

0              0
1      John Wood
2    Colin Welsh
3        my list
4          02456
5            NaN
6    HELLO WORLD
7         water%
dtype: object

In [78]:
s[s.str.contains('[0-9]', na=False)]

0        0
4    02456
dtype: object

In [74]:
s.str.contains('John') | s.str.contains('Colin') #pareil que
s.str.contains('John|Colin')

0    False
1     True
2     True
3    False
4    False
5      NaN
6    False
7    False
dtype: object

In [75]:
s2 = pd.Series(['bar', 'sugar', 'cartoon', 'argon'])

In [76]:
s2.str.contains('.ar')

0     True
1     True
2     True
3    False
dtype: bool

In [77]:
# Matching sets of characters
s2.str.contains('[bc]ar') 

0     True
1    False
2     True
3    False
dtype: bool

In [80]:
s2[s2.str.contains('^[bc]', na=False)]

0        bar
2    cartoon
dtype: object

In [79]:
s2[s2.str.contains('ar$', na=False)]

0      bar
1    sugar
dtype: object

### Matching preceding characters

Often we want to mention a certain character and then ask to match one or more copies of this character. We can do this using the following metacharacters
* \* - match zero or more copies of the preceding character
* ? - match zero or 1 copy of the preceding character
* \+ - match 1 or more copies of the preceding character

Or we can use curly braces to specify how many time we want to match the given character. We have the following choices
* {m} - match the preceding element m times
* {m,} - match the preceding element m times or more
* {m,n} - match the preceding element between m and n times

In [81]:
s3= pd.Series(['forest', 'o', 'ff', 'foo', 'fof'])
s3.str.contains('f+o?f+')

0    False
1    False
2     True
3    False
4     True
dtype: bool

### Grouping

In [83]:
s4= pd.Series(['Monday5km', 'Wednesday10km', 'Saturday25km'])

In [86]:
s4.str.extract('(\w+day)',expand=True)


Unnamed: 0,0
0,Monday
1,Wednesday
2,Saturday


## 06. Exercise: using regular expressions in pandas

In [89]:
meal_plan = ['Monday: 9:12am – Omelet,  3:30pm– Apple slices with almond butter', 
             'Tuesday: 9:35am – Banana bread, 11:00am –Sauteed veggies, 7:02pm– Taco pie',
             'Wednesday: 9:00am – Banana pancakes',  
             'Thursday: 7:23pm– Slow cooker pulled pork', 'Friday: 3:30pm – Can of tuna', 
             'Saturday: 9:11am: Eggs and sweet potato hash browns, 3:22pm: Almonds', 
             'Sunday: 11:00am: Meat and veggie stir fry'] 

In [90]:
df = pd.DataFrame(meal_plan, columns=['text'])
df

Unnamed: 0,text
0,"Monday: 9:12am – Omelet, 3:30pm– Apple slices..."
1,"Tuesday: 9:35am – Banana bread, 11:00am –Saute..."
2,Wednesday: 9:00am – Banana pancakes
3,Thursday: 7:23pm– Slow cooker pulled pork
4,Friday: 3:30pm – Can of tuna
5,Saturday: 9:11am: Eggs and sweet potato hash b...
6,Sunday: 11:00am: Meat and veggie stir fry


In [98]:
sol = df['text'].str.extractall('(\d?\d):(\d\d) ?([ap]m)')
sol

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,9,12,am
0,1,3,30,pm
1,0,9,35,am
1,1,11,0,am
1,2,7,2,pm
2,0,9,0,am
3,0,7,23,pm
4,0,3,30,pm
5,0,9,11,am
5,1,3,22,pm


In [None]:
days= []

## 07. Sentiment analysis

In [101]:
df = pd.read_csv('data/tweets.csv', header=None)
df.columns = ['sentiment','text']
df.head()

Unnamed: 0,sentiment,text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...


In [103]:
#total number of tweets
df.shape[0]
#498
#number of positive tweets
df[df['sentiment']==4].shape[0]
#182
#number of neutral tweets
df[df['sentiment']==2].shape[0]
#139
#number of negative tweets
df[df['sentiment']==0].shape[0]
#177
pos_tweets = df.loc[df['sentiment']==4,'text']
neg_tweets = df.loc[df['sentiment']==0,'text']

In [106]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import string
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\valen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [108]:
stopwords_english = stopwords.words('english')
# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])
    # Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

In [110]:
# all emoticons (happy + sad)
emoticons = emoticons_happy.union(emoticons_sad)

def clean_tweets(tweet):

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/[^\s]+', '', tweet)

    # remove hashtags
    tweet = re.sub(r'#', '', tweet)


    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
            word not in emoticons and # remove emoticons
            word not in string.punctuation): # remove punctuation
            tweets_clean.append(word)   
    return tweets_clean

In [111]:
sample = pos_tweets.iloc[4]
sample
#"@mikefish  Fair enough. But i have the Kindle2 and I think it's perfect  :)"

clean_tweets(sample)
#['fair', 'enough', 'kindle', '2', 'think', 'perfect']

['fair', 'enough', 'kindle', '2', 'think', 'perfect']

In [112]:
def bag_of_words(tweet):
    words = clean_tweets(tweet)
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary

In [113]:
bag_of_words(sample)

{'fair': True,
 'enough': True,
 'kindle': True,
 '2': True,
 'think': True,
 'perfect': True}

In [120]:
# positive tweets feature set
pos_tweets_set = []
for tweet in pos_tweets:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))    

#negative tweets feature set
neg_tweets_set = []
for tweet in neg_tweets:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))

tweets = pos_tweets_set + neg_tweets_set 

### Implementation

In [121]:
from random import shuffle 
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)

test_set = pos_tweets_set[:36] + neg_tweets_set[:36]
train_set = pos_tweets_set[36:] + neg_tweets_set[36:]

In [122]:
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_set)

In [124]:
accuracy = classify.accuracy(classifier, test_set)
#0.7916666666666666

In [125]:
classifier.show_most_informative_features(10) 

Most Informative Features
                       2 = True              pos : neg    =     11.3 : 1.0
                  kindle = True              pos : neg    =      8.0 : 1.0
                    hate = True              neg : pos    =      7.9 : 1.0
                    love = True              pos : neg    =      7.1 : 1.0
                    time = True              neg : pos    =      6.8 : 1.0
                 awesome = True              pos : neg    =      6.8 : 1.0
                   phone = True              neg : pos    =      5.2 : 1.0
                   still = True              neg : pos    =      5.2 : 1.0
                     one = True              pos : neg    =      4.8 : 1.0
                    want = True              pos : neg    =      4.2 : 1.0


In [126]:
from collections import defaultdict
from nltk.metrics import ConfusionMatrix

actual_set = defaultdict(set)
predicted_set = defaultdict(set)

actual_set_cm = []
predicted_set_cm = []

for index, (feature, actual_label) in enumerate(test_set):
    actual_set[actual_label].add(index)
    actual_set_cm.append(actual_label)

    predicted_label = classifier.classify(feature)

    predicted_set[predicted_label].add(index)
    predicted_set_cm.append(predicted_label)

print(ConfusionMatrix(actual_set_cm, predicted_set_cm))  

    |  n  p |
    |  e  o |
    |  g  s |
----+-------+
neg |<31> 5 |
pos |  2<34>|
----+-------+
(row = reference; col = test)

