In [7]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
from nltk.probability import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy


def process_lang(x):
    """Turn a raw tweet into a list of normalised, stop-word-free tokens.

    Steps, in order:
      1. lower-case the text;
      2. strip web addresses (``http://``, ``https://``, ``www.``) and
         ``@mentions``;
      3. replace every run of non-word characters and digits with a space;
      4. tokenise with ``nltk.word_tokenize``;
      5. drop English stop words;
      6. lemmatise, then stem, each remaining token;
      7. drop any token whose stem is itself a stop word.

    Parameters
    ----------
    x : str
        The raw text of one tweet.

    Returns
    -------
    list of str
        The processed tokens, in their original order (duplicates kept).
    """
    # Lower-case first so that upper-case prefixes such as "HTTP://" or
    # "WWW." are matched by the (case-sensitive) address pattern as well.
    text = re.sub(r"(https?://|www\.|@)\S*", "", x.lower())
    text = re.sub(r"[\W\d]+", " ", text)

    # Build the stop-word set once per call instead of re-reading the list for
    # every single token; a set also makes each membership test O(1).
    stop_words = set(stopwords.words('english'))
    lem, ste = WordNetLemmatizer(), EnglishStemmer()

    # Stop words must be removed BEFORE normalisation: lemmatising/stemming
    # can alter them (e.g. "was" -> "wa", "very" -> "veri") so that they no
    # longer match the stop-word list and would leak into the features.
    tokens = (t for t in nltk.word_tokenize(text) if t not in stop_words)
    stems = (ste.stem(lem.lemmatize(t)) for t in tokens)

    # Filter once more afterwards, because a normalised form may itself be a
    # stop word even though the original token was not.
    return [t for t in stems if t not in stop_words]


def create_classifier(f, n=0, random_state=None, vocab_size=5000, train_frac=0.8):
    """Train a Naive Bayes classifier on a CSV of labelled tweets.

    The CSV must provide a ``text`` column (raw tweet) and a ``target`` column
    (label). Each tweet is turned into a bag-of-words feature dict
    ``{word: bool}`` over the most common training words.

    Parameters
    ----------
    f : str, path or file-like
        Anything ``pandas.read_csv`` accepts.
    n : int, default 0
        Number of rows to sample. ``0`` (or a value >= the number of rows in
        the file) means "use every row".
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` so that the sample/shuffle, and
        therefore the train/test split, can be reproduced.
    vocab_size : int, default 5000
        How many of the most common words are used as features.
    train_frac : float, default 0.8
        Fraction of the rows used for training; the rest is the test split.

    Returns
    -------
    tuple
        ``(classifier, train, test)`` where ``train`` and ``test`` are lists of
        ``(feature_dict, label)`` pairs.
    """
    s = pd.read_csv(f)

    # Always shuffle, even when every row is used: the split below is
    # positional, so if the file is ordered (e.g. grouped by label) an
    # unshuffled split would put different label mixes in train and test.
    n_rows = n if 0 < n < len(s) else len(s)
    s = s.sample(n=n_rows, random_state=random_state).reset_index(drop=True)

    # Clean, tokenise, stem/lemmatise and stop-word-filter every tweet.
    token_lists = [process_lang(text) for text in s["text"]]
    labels = list(s["target"])
    split = int(len(s) * train_frac)

    # Build the vocabulary from the training rows only, so that the held-out
    # rows have no influence on which features the model gets to see.
    wf = FreqDist(
        tok for toks in token_lists[:split] for tok in toks
    ).most_common(vocab_size)
    words = [w for w, _ in wf]

    # Feature table: for each tweet, which vocabulary words does it contain?
    # A per-tweet set makes each of the vocab_size membership tests O(1).
    feat = []
    for toks, label in zip(token_lists, labels):
        present = set(toks)
        feat.append(({w: w in present for w in words}, label))

    # Positional split into train and test data.
    tr, te = feat[:split], feat[split:]
    return NaiveBayesClassifier.train(tr), tr, te

# Relative path to the CSV that supplies the `text` and `target` columns.
location = "../../../../Other/Large Data/Sentiment140.csv"
# Build the classifier from a random sample of 5000 rows; `train` and `test`
# are the (feature_dict, label) lists produced by the positional split.
classifier, train, test = create_classifier(f=location, n=5000)

In [8]:
# Check accuracy: score the trained classifier against the held-out `test`
# split returned by create_classifier (rows the model was not trained on).
accuracy(classifier, test)

0.709

In [10]:
# Check most informative features: print the top 50. In the output below each
# row shows a word feature, the two labels being compared (0 and 4) and the
# likelihood ratio between them.
classifier.show_most_informative_features(50)

Most Informative Features
                    hurt = True                0 : 4      =     15.2 : 1.0
                  welcom = True                4 : 0      =     13.5 : 1.0
                    wont = True                0 : 4      =     11.8 : 1.0
                     sad = True                0 : 4      =     10.2 : 1.0
                    poor = True                0 : 4      =      9.8 : 1.0
                     ugh = True                0 : 4      =      9.5 : 1.0
                    sick = True                0 : 4      =      9.3 : 1.0
                    alon = True                0 : 4      =      9.1 : 1.0
                 congrat = True                4 : 0      =      8.2 : 1.0
                    gone = True                0 : 4      =      7.9 : 1.0
                    shit = True                0 : 4      =      7.8 : 1.0
                  father = True                0 : 4      =      7.8 : 1.0
                    lost = True                0 : 4      =      7.7 : 1.0