### Imports

In [10]:
import pandas
import numpy
pd = pandas
np = numpy

#the classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#utilities
from nltk import word_tokenize
from nltk.corpus import stopwords
EnglishStopwords = stopwords.words('english')
import string
from nltk.stem import PorterStemmer
PStemmer = PorterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer
TFIDFV = TfidfVectorizer(stop_words  = 'english', use_idf =True)
from sklearn.metrics import precision_score, recall_score


In [11]:
TweetsFile = pandas.read_csv("./Tweets.csv")

Let's just take a quick look at the file

In [12]:
TweetsFile

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
5,570300767074181121,negative,1.0000,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)
6,570300616901320704,positive,0.6745,,0.0000,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2015-02-24 11:13:57 -0800,San Francisco CA,Pacific Time (US & Canada)
7,570300248553349120,neutral,0.6340,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,,2015-02-24 11:12:29 -0800,Los Angeles,Pacific Time (US & Canada)
8,570299953286942721,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D",,2015-02-24 11:11:19 -0800,San Diego,Pacific Time (US & Canada)
9,570295459631263746,positive,1.0000,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)


In [13]:
TweetsFile.shape

(14640, 15)

### Extracting the text and the sentiment.

In [14]:
Text = numpy.array(TweetsFile['text']) # a numpy array of strings

Sentiments = numpy.array(TweetsFile['airline_sentiment']) # a numpy array of sentiments

### Converting sentiments to numbers.

In [15]:
# Lookup table from the dataset's sentiment labels to integer class ids.
switcher = dict(positive=1, neutral=0, negative=-1)
# Translate every label in order; a label missing from the table raises KeyError.
NumericSentiments = numpy.array(list(map(switcher.__getitem__, Sentiments)))

### Tokenization

In [16]:
TokenizedText = [word_tokenize(t) for t in Text]
# TokenizedText

### Case Folding

In [17]:
Input = TokenizedText
CaseFolding = [ [t.lower() for t in TokenArray] for TokenArray in Input]
# CaseFolding

### Filtering Stopwords

In [18]:
Input = CaseFolding
FilterList = EnglishStopwords
# Testing membership against a list scans the whole list for every token.
# A frozenset gives the same answers with constant-time lookups.
StopwordLookup = frozenset(FilterList)
# Keep only the tokens that are not stopwords, preserving token order per tweet.
FilteredStopwords = [[t for t in TokenArray if t not in StopwordLookup] for TokenArray in Input]
# FilteredStopwords

### Filtering Symbols

In [19]:
Input = FilteredStopwords
# Characters that disqualify a token: ASCII punctuation plus the curly quotes.
# Built once here instead of being rebuilt for every token.
SymbolCharacters = frozenset(string.punctuation) | {"”", "“", "’"}
# A token is kept only when none of its characters is a symbol character.
FilterFunction = lambda t: not any(ch in SymbolCharacters for ch in t)
FilteredSymbols = [ list(filter(FilterFunction, TokenArray)) for TokenArray in Input]
# FilteredSymbols

### Stemming

In [22]:
#stemming
Input = FilteredSymbols
StemmedTokens = [ [PStemmer.stem(w) for w in TokenArray] for TokenArray in Input]
# StemmedTokens

### Removing URLs and Special Symbols

In [23]:
Input = StemmedTokens
FilterFunction = lambda t: ("'" not in t) and ("/" not in t)
NoURLsSymbols = [ list(filter(FilterFunction, TokenArray)) for TokenArray in Input]
# NoURLsSymbols

Function to split the data into training and testing sets

In [24]:
#split(Text, NumericSentiments, 0.8)
def split(xData, yData, portions):
    """Split paired samples and labels sequentially into training and testing parts.

    The first ``int(len(xData) * portions)`` items become the training part and
    the remaining items the testing part. Order is preserved; nothing is shuffled.

    Parameters
    ----------
    xData : sliceable sequence
        The samples (for example a list of strings).
    yData : sliceable sequence
        The labels, one per sample, in the same order as ``xData``.
    portions : float
        Fraction of the data used for training, between 0 and 1 inclusive.

    Returns
    -------
    tuple
        ``(xTraining, xTesting, yTraining, yTesting)``.

    Raises
    ------
    ValueError
        If ``xData`` and ``yData`` differ in length, or ``portions`` is outside
        the range [0, 1].
    """
    #from Saleh's work
    if len(xData) != len(yData):
        # Slicing both at the same index would silently misalign samples and labels.
        raise ValueError(
            "xData and yData must have the same length (got %d and %d)"
            % (len(xData), len(yData))
        )
    if not 0 <= portions <= 1:
        raise ValueError("portions must be between 0 and 1 (got %r)" % (portions,))

    trainingPortion = int(len(xData)*portions)
    xTraining = xData[:trainingPortion]
    xTesting = xData[trainingPortion:]
    yTraining = yData[:trainingPortion]
    yTesting = yData[trainingPortion:]
    return (xTraining, xTesting, yTraining, yTesting)

Rejoin each tweet's tokens into a single string, then split the tweets into training and testing sets.

In [25]:
TweetsRejoined = [ " ".join(t) for t in NoURLsSymbols ]
xtr, xts, ytr, yts = split(TweetsRejoined, NumericSentiments, 0.8)

Vectorizing the input data

In [26]:
xtrTransformed = TFIDFV.fit_transform(xtr)
xtsTransformed = TFIDFV.transform(xts)
#shorthands
xtrt = xtrTransformed
xtst = xtsTransformed

# Naive Bayes Classifier

In [27]:
classifier1 = MultinomialNB()
classifier1fitted = classifier1.fit(xtrTransformed, ytr)

In [28]:
classifier1predictions = classifier1fitted.predict(xtsTransformed)

In [29]:
# Naive Bayes precision and recall on the held-out tweets, macro- then micro-averaged.
NBPrecisionMacro = precision_score(yts, classifier1predictions, average='macro')
NBRecallMacro = recall_score(yts, classifier1predictions, average='macro')
NBPrecisionMicro = precision_score(yts, classifier1predictions, average='micro')
NBRecallMicro = recall_score(yts, classifier1predictions, average='micro')

# One value per line: macro precision, macro recall, micro precision, micro recall.
print(NBPrecisionMacro, NBRecallMacro, NBPrecisionMicro, NBRecallMicro, sep="\n")

0.8349194948733448
0.3643708318404049
0.7254098360655737
0.7254098360655737


In [30]:
def calculate_f1(p,r):
    """Return the F1 score, the harmonic mean of precision ``p`` and recall ``r``.

    When both ``p`` and ``r`` are zero the harmonic mean is undefined; 0.0 is
    returned in that case instead of dividing by zero.
    """
    if r + p == 0:
        return 0.0
    return 2*r*p/(r+p)

In [31]:
print(calculate_f1(NBPrecisionMacro, NBRecallMacro))
print(calculate_f1(NBPrecisionMicro, NBRecallMicro))

0.5073338858662929
0.7254098360655736


# K nearest neighbors classifier with 2 neighbors

In [32]:
NN2 = KNeighborsClassifier(n_neighbors=2)

In [33]:
NN2Fitted = NN2.fit(xtrt, ytr)

In [34]:
NN2Predictions = NN2Fitted.predict(xtst)

In [35]:
def calculate_metrics(predictions, truths):
    """Score ``predictions`` against ``truths`` with precision and recall.

    Both metrics are computed with macro averaging first and micro averaging
    second.

    Returns
    -------
    tuple
        ``(macro precision, macro recall, micro precision, micro recall)``.
    """
    scores = []
    for averaging in ('macro', 'micro'):
        scores.append(precision_score(truths, predictions, average=averaging))
        scores.append(recall_score(truths, predictions, average=averaging))
    return tuple(scores)

In [36]:
NN2PMacro, NN2RMacro, NN2PMicro, NN2RMicro = calculate_metrics(NN2Predictions, yts)

In [37]:
print(NN2PMacro)
print(NN2RMacro)
print(NN2PMicro)
print(NN2RMicro)

0.5755261745448245
0.4685154994072047
0.6912568306010929
0.6912568306010929


In [38]:
calculate_f1(NN2PMacro, NN2RMacro)

0.5165367241866934

In [39]:
calculate_f1(NN2PMicro, NN2RMicro)

0.6912568306010929

# Random Forest Classifier

In [40]:
classifier3 = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

In [41]:
classifier3fitted = classifier3.fit(xtrt, ytr)

In [42]:
classifier3predictions = classifier3fitted.predict(xtst)

In [43]:
c3PMacro, c3RMacro, c3PMicro, c3RMicro = calculate_metrics(classifier3predictions, yts)

  'precision', 'predicted', average, warn_for)


In [44]:
print(c3PMacro)
print(c3RMacro)
print(c3PMicro)
print(c3RMicro)

0.2375910746812386
0.3333333333333333
0.7127732240437158
0.7127732240437158


In [45]:
calculate_f1(c3PMacro, c3RMacro)

0.2774343635759388

In [46]:
calculate_f1(c3PMicro, c3RMicro)

0.7127732240437158

# <s>Using the Sentiment140 dataset</s>

In [6]:
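# NOTE: this load is commented out, so the cells below that use `Sentiment140` fail on a fresh kernel; uncomment it to run them.
# Sentiment140 = pandas.read_csv("./training.1600000.processed.noemoticon.csv", encoding='Latin-1')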

In [459]:
Sentiment140

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
7,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
8,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?
9,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break in plain city... it's snowing


In [460]:
Sentiment140.shape

(1599999, 6)

In [468]:
S140Tweets = Sentiment140[Sentiment140.columns[5]]
S140Tweets

0          is upset that he can't update his Facebook by ...
1          @Kenichan I dived many times for the ball. Man...
2            my whole body feels itchy and like its on fire 
3          @nationwideclass no, it's not behaving at all....
4                              @Kwesidei not the whole crew 
5                                                Need a hug 
6          @LOLTrish hey  long time no see! Yes.. Rains a...
7                       @Tatiana_K nope they didn't have it 
8                                  @twittera que me muera ? 
9                spring break in plain city... it's snowing 
10                                I just re-pierced my ears 
11         @caregiving I couldn't bear to watch it.  And ...
12         @octolinz16 It it counts, idk why I did either...
13         @smarrison i would've been the first, but i di...
14         @iamjazzyfizzle I wish I got to watch it with ...
15         Hollis' death scene will hurt me severely to w...
16                      

In [469]:
S140Sentiments = Sentiment140[Sentiment140.columns[0]]
S140Sentiments

0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
24         0
25         0
26         0
27         0
28         0
29         0
          ..
1599969    4
1599970    4
1599971    4
1599972    4
1599973    4
1599974    4
1599975    4
1599976    4
1599977    4
1599978    4
1599979    4
1599980    4
1599981    4
1599982    4
1599983    4
1599984    4
1599985    4
1599986    4
1599987    4
1599988    4
1599989    4
1599990    4
1599991    4
1599992    4
1599993    4
1599994    4
1599995    4
1599996    4
1599997    4
1599998    4
Name: 0, Length: 1599999, dtype: int64

In [470]:
NPS140x = numpy.array(S140Tweets)
NPS140y = numpy.array(S140Sentiments)

In [471]:
NPS140x

array(["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
       '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
       'my whole body feels itchy and like its on fire ', ...,
       'Are you ready for your MoJo Makeover? Ask me for details ',
       'Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur ',
       'happy #charitytuesday @theNSPCC @SparksCharity @SpeakingUpH4H '],
      dtype=object)

In [472]:
NPS140y

array([0, 0, 0, ..., 4, 4, 4])

Function to preprocess S140

In [480]:
def preprocessingS140(X):
    """Run the tweet preprocessing pipeline over an iterable of raw tweet strings.

    Each tweet goes through the same stages, in this order:

    1. tokenize with ``word_tokenize``;
    2. lower-case every token;
    3. drop tokens found in ``EnglishStopwords``;
    4. drop tokens containing any ASCII punctuation character or curly quote;
    5. stem the remaining tokens with ``PStemmer``;
    6. drop stemmed tokens containing ``'`` or ``/``.

    Parameters
    ----------
    X : iterable of str
        The raw tweets.

    Returns
    -------
    list of list of str
        One list of processed tokens per input tweet, in input order.
    """
    # Build the lookups once. The original scanned the stopword list and rebuilt
    # the punctuation list for every token.
    stopword_lookup = frozenset(EnglishStopwords)
    symbol_characters = frozenset(string.punctuation) | {"”", "“", "’"}

    def has_symbol(token):
        # True when at least one character of the token is a symbol character.
        return any(ch in symbol_characters for ch in token)

    def keep_stem(token):
        # Final guard applied to the stemmed tokens.
        return ("'" not in token) and ("/" not in token)

    processed = []
    for tweet in X:
        tokens = [t.lower() for t in word_tokenize(tweet)]
        tokens = [t for t in tokens if t not in stopword_lookup]
        tokens = [t for t in tokens if not has_symbol(t)]
        stems = [PStemmer.stem(w) for w in tokens]
        processed.append([s for s in stems if keep_stem(s)])
    return processed

In [None]:
PPS140x = preprocessingS140(NPS140x)

In [8]:
# zzzzzzzzzzzzzzzz
# testing with a smaller portion
# testx = NPS140x[:10000]
# preprocessingS140(testx)

# Using cosine similarity to eliminate similar documents

In [47]:
# using 'TweetsRejoined'
TweetsRejoined

['virginamerica dhepburn said',
 'virginamerica plu ad commerci experi tacki',
 'virginamerica today must mean need take anoth trip',
 'virginamerica realli aggress blast obnoxi entertain guest face amp littl recours',
 'virginamerica realli big bad thing',
 'virginamerica serious would pay 30 flight seat play realli bad thing fli va',
 'virginamerica ye nearli everi time fli vx ear worm go away',
 'virginamerica realli miss prime opportun men without hat parodi http',
 'virginamerica well',
 'virginamerica amaz arriv hour earli good',
 'virginamerica know suicid second lead caus death among teen',
 'virginamerica lt 3 pretti graphic much better minim iconographi',
 'virginamerica great deal alreadi think 2nd trip australia amp even gone 1st trip yet p',
 'virginamerica virginmedia fli fabul seduct sky u take stress away travel http',
 'virginamerica thank',
 'virginamerica schedul still mia',
 'virginamerica excit first cross countri flight lax mco heard noth great thing virgin americ

In [48]:
type(TweetsRejoined)

list

In [None]:
def FilterSimilarTweets(T, threshold=0.9):
    """Return the tweets of ``T`` with near-duplicates removed.

    Tweets are visited in order. A tweet is kept unless its cosine similarity
    to an already kept tweet is at least ``threshold``, so the first tweet of
    each group of similar tweets survives. Similarity is computed on
    term-count vectors of the whitespace-separated tokens.

    Unlike the earlier stub, ``T`` is not modified.

    Parameters
    ----------
    T : iterable of str
        The tweets, e.g. the rejoined, preprocessed tweet strings.
    threshold : float, optional
        Minimum cosine similarity at which two tweets count as duplicates.
        Defaults to 0.9.

    Returns
    -------
    list of str
        The kept tweets, in their original order. Tweets without any token
        have similarity 0 to everything and are therefore always kept.

    Notes
    -----
    Every tweet is compared against every kept tweet, so the running time
    grows quadratically with the number of distinct tweets.
    """
    import math

    def term_counts(text):
        # Bag-of-words vector stored as {term: occurrences}.
        counts = {}
        for term in text.split():
            counts[term] = counts.get(term, 0) + 1
        return counts

    def squared_norm(counts):
        return sum(c * c for c in counts.values())

    def cosine(a, a_sq, b, b_sq):
        if a_sq == 0 or b_sq == 0:
            return 0.0
        if len(a) > len(b):
            # Iterate over the smaller vector; the dot product is symmetric.
            a, b = b, a
        dot = sum(c * b.get(term, 0) for term, c in a.items())
        # Taking the root of the integer product keeps identical vectors at exactly 1.0.
        return dot / math.sqrt(a_sq * b_sq)

    output = []
    kept_vectors = []  # (term counts, squared norm) for every kept tweet
    for tweet in T:
        counts = term_counts(tweet)
        sq = squared_norm(counts)
        if any(cosine(counts, sq, kc, ksq) >= threshold for kc, ksq in kept_vectors):
            continue
        output.append(tweet)
        kept_vectors.append((counts, sq))
    return output

In [56]:
# Scratch check: chained assignment binds both names to the SAME list object,
# so popping through `b` is visible through `a` as well.
a = b = [1,2,3,4]
b.pop(0)  # removes the first element of the shared list in place
print(a)
print(b)

[2, 3, 4]
[2, 3, 4]
