In [1]:
%matplotlib inline

First we need to import the data into a pandas DataFrame.

In [2]:
import pandas
# Tab-separated, Latin-1 encoded training file; the ID column becomes the index.
df = pandas.read_csv(
    'trainingData.txt',
    sep='\t',
    encoding='latin1',
    index_col='ID',
)
df.head()

Unnamed: 0_level_0,Target,Tweet,Stance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101,Atheism,dear lord thank u for all of ur blessings forg...,AGAINST
102,Atheism,"Blessed are the peacemakers, for they shall be...",AGAINST
103,Atheism,I am not conformed to this world. I am transfo...,AGAINST
104,Atheism,Salah should be prayed with #focus and #unders...,AGAINST
105,Atheism,And stay in your houses and do not display you...,AGAINST


Then extract the list of topics (for now we focus only on Hillary Clinton).


In [3]:
# Distinct topics in order of first appearance. Series.unique() keeps that
# order and avoids re-scanning a growing list for every row.
listOfTopics = df['Target'].unique().tolist()
listOfTopics

['Atheism',
 'Climate Change is a Real Concern',
 'Feminist Movement',
 'Hillary Clinton',
 'Legalization of Abortion']

Create a function to turn the stance label into a numeric value (AGAINST becomes -1, FAVOR becomes 1, anything else becomes 0).

In [4]:
def yVec(text):
    """Map a stance label to a numeric class.

    Returns -1 for 'AGAINST', 1 for 'FAVOR', and 0 for any other value.
    The comparison is exact, so differently-cased labels fall through to 0.
    """
    if text == 'AGAINST':
        return -1
    if text == 'FAVOR':
        return 1
    # Everything else is treated as the neutral class.
    return 0

Extract the rows for the chosen topic (Hillary Clinton).

In [5]:
# Keep only the rows whose Target is exactly 'Hillary Clinton'.
trainDF = df.loc[df['Target'].eq('Hillary Clinton')]
trainDF.describe()

Unnamed: 0,Target,Tweet,Stance
count,639,639,639
unique,1,639,3
top,Hillary Clinton,Hillary is on her way to #Baltimore to be thei...,AGAINST
freq,639,1,361


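Make a corpus from the tweets (dropping the last six characters of each tweet) together with the matching numeric stance labels.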

In [6]:
# Tweet text with its final six characters sliced off, one entry per training row.
# NOTE(review): presumably the slice removes a fixed-length trailing tag - confirm against the raw file.
corpus = [tweet[:-6] for tweet in trainDF['Tweet']]
# Numeric stance label for each row, in the same order as corpus.
Y = [yVec(stance) for stance in trainDF['Stance']]

Generate word n-grams of size 1 to 7 (English stop words removed). Drop any n-gram that appears in fewer than 20 tweets.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer

# Build the tokenizer once and pass its bound method to the vectorizer. The
# previous lambda constructed a new TweetTokenizer for every tweet and, being
# a lambda, also prevented the fitted vectorizer from being pickled.
tweet_tokenizer = TweetTokenizer()

# Count word n-grams of 1 to 7 tokens, skipping English stop words.
# min_df=20 keeps only n-grams that occur in at least 20 tweets.
ngram_vectorizer = CountVectorizer(
    ngram_range=(1, 7),
    analyzer='word',
    stop_words='english',
    min_df=20,
    tokenizer=tweet_tokenizer.tokenize,
)

# Dense (n_tweets, n_features) count matrix for the training corpus.
X = ngram_vectorizer.fit_transform(corpus).toarray()

So what does a single tweet look like now? Let's follow one through the steps we've done so far:

In [8]:
# Walk the first tweet through each stage: raw text, tokens, vocabulary, count vector.

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back on older installs.
# The names are fetched once and converted to plain str so they print as a list.
_get_names = getattr(ngram_vectorizer, 'get_feature_names_out', None) or ngram_vectorizer.get_feature_names
feature_names = [str(name) for name in _get_names()]

print('Tweet: \n', corpus[0])
print(TweetTokenizer().tokenize(corpus[0]))
print('---------------------------------------------------------------------------------\n')
print('Total features from corpus: ', len(feature_names))
print('First 20: \n', feature_names[:20])
print('---------------------------------------------------------------------------------\n')
print('Our tweet represented as a vector: \n')
print(X[0])
print('---------------------------------------------------------------------------------\n')
print('\nOur X: \n')
print('Shape: ', X.shape)

Tweet: 
 RT @GunnJessica: Because I want young American women to be able to be proud of the 1st woman president 
['RT', '@GunnJessica', ':', 'Because', 'I', 'want', 'young', 'American', 'women', 'to', 'be', 'able', 'to', 'be', 'proud', 'of', 'the', '1st', 'woman', 'president']
---------------------------------------------------------------------------------

Total features from corpus:  29
First 20: 
 ['!', '! !', '! ! !', '"', '#hillaryclinton', '#tcot', '#wakeupamerica', '&', ',', '-', '.', '...', ':', '?', '@hillaryclinton', "can't", 'clinton', 'hillary', 'hillary clinton', "i'm"]
---------------------------------------------------------------------------------

Our tweet represented as a vector: 

[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0]
---------------------------------------------------------------------------------


Our X: 

Shape:  (639, 29)


Time to apply an SVM. First, load the test data.

In [12]:
# Test file, read with the same settings as the training file: tab-separated,
# Latin-1 encoded, indexed by the ID column.
testFullDF = pandas.read_csv(
    'test.txt',
    sep='\t',
    encoding='latin1',
    index_col='ID',
)
testFullDF.head()

Unnamed: 0_level_0,Target,Tweet,Stance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Hillary Clinton,"@tedcruz And, #HandOverTheServer she wiped cle...",AGAINST
2,Hillary Clinton,Hillary is our best choice if we truly want to...,FAVOR
3,Hillary Clinton,@TheView I think our country is ready for a fe...,AGAINST
4,Hillary Clinton,I just gave an unhealthy amount of my hard-ear...,AGAINST
5,Hillary Clinton,@PortiaABoulger Thank you for adding me to you...,NONE


In [14]:
# Same preparation as the training corpus: slice off the last six characters
# of each tweet and convert the stance label to a number.
# NOTE(review): unlike the training rows, these are not filtered by Target -
# confirm test.txt only contains the topic being modelled.
test_corpus = [tweet[:-6] for tweet in testFullDF['Tweet']]
test_Y = [yVec(stance) for stance in testFullDF['Stance']]

# Reuse the vocabulary fitted on the training corpus (transform, not fit_transform).
test_X = ngram_vectorizer.transform(test_corpus).toarray()

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn import svm
# Polynomial-kernel SVM with a fixed seed, scored by 15-fold cross-validation
# on the training matrix X and labels Y.
# NOTE(review): the recorded fold scores (about 0.56-0.57) are close to the
# share of AGAINST rows (361 of 639), so the model may be predicting a single
# class - worth confirming with a confusion matrix.
clf = svm.SVC(
    kernel='poly',
    C=1,
    random_state=1,
)
scores = cross_val_score(
    clf,
    X,
    Y,
    cv=15,
)


And our cross-validation scores on the training data (the test set loaded above is not used here):

In [16]:
# Per-fold accuracies, followed by their mean and a spread of two standard deviations.
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

[ 0.55555556  0.55813953  0.55813953  0.55813953  0.55813953  0.55813953
  0.55813953  0.57142857  0.57142857  0.57142857  0.57142857  0.57142857
  0.57142857  0.57142857  0.57142857]
Accuracy: 0.57 (+/- 0.01)


Trying AdaBoost with an SVM as the base estimator

In [17]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost (algorithm='SAMME') over polynomial-kernel SVMs with up to 50
# estimators, scored by 5-fold cross-validation on the same X and Y.
# The base estimator is passed positionally, as in the original call.
test = AdaBoostClassifier(
    svm.SVC(probability=True, kernel='poly'),
    n_estimators=50,
    learning_rate=1.0,
    algorithm='SAMME',
)
scores = cross_val_score(test, X, Y, cv=5)

# Per-fold accuracies, followed by their mean and a spread of two standard deviations.
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

[ 0.56153846  0.5625      0.56692913  0.56692913  0.56692913]
Accuracy: 0.56 (+/- 0.00)
