In [13]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split as sk_split
from sklearn import linear_model

np.set_printoptions(threshold='nan')

In [20]:
training_data = pd.read_csv('datasets/tweets.csv', delimiter=',')
testing_data = pd.read_csv('datasets/tweets2.csv', delimiter=',')

clinton_data_1 = training_data[training_data.handle == "HillaryClinton"]["text"].as_matrix()
trump_data_1 = training_data[training_data.handle == "realDonaldTrump"]["text"].as_matrix()

clinton_data_2 = testing_data[testing_data.handle == "HillaryClinton"]["text"].as_matrix()
trump_data_2 = testing_data[testing_data.handle == "realDonaldTrump"]["text"].as_matrix()

clinton_data = np.concatenate((clinton_data_1, clinton_data_2), axis=0)
trump_data = np.concatenate((trump_data_1, trump_data_2), axis=0)

clinton_dataset = []
trump_dataset = []

def loadCharsFromTxt(text, dataset):
    image = np.zeros((140, 37))
    words = text.lower().split()
    
    i = 0
    while i < len(words): 
        if "t.co" in words[i]:
            del words[i]
        i += 1
    
    text = " ".join(words)
            
    charCount = 0
    for char in text:
        index = None
        if char.isalpha():
            index = ord(char) - ord('a') + 1
        elif char.isdigit():
            index = ord(char) - ord('0') + 27
        elif char == " ":
            index = 0
                             
        if index:
            image[charCount, index] = 1
            charCount += 1
                    
    dataset.append(image)

for text in clinton_data:
    loadCharsFromTxt(text, clinton_dataset)

for text in trump_data:
    loadCharsFromTxt(text, trump_dataset)
    
clinton_y = np.zeros(len(clinton_dataset))
trump_y = np.full(len(trump_dataset), 1)

x_data = np.concatenate((clinton_dataset, trump_dataset), axis=0)
y_data = np.concatenate((clinton_y, trump_y), axis=0)

x_train, x_test, y_train, y_test = sk_split(x_data, y_data, test_size = 0.25, random_state = 42)

print x_train.shape
print y_train.shape

(9666, 140, 37)
(9666,)


In [15]:
def loadDictionary(text, wc):
    words = text.lower().split()
    for word in words:
        if "t.co" not in word:
            i = 0
            editedWord = word
            while i < len(word):
                if not word[i].isalpha() and not word[i].isdigit():
                    editedWord = word[:i] + word[i+1:]
                i += 1
                
            if word not in wordDictionary:
                wordDictionary[word] = wc
                wc += 1
    
    return wc
                
def bagOfWords(text, dataset, wc):
    bag = np.zeros(wc)
    for word in text.lower().split():
        # ignore t.co links
        if "t.co" not in word:
            i = 0
            
            # ignore punctuation
            editedWord = word
            while i < len(word):
                if not word[i].isalpha() and not word[i].isdigit():
                    editedWord = word[:i] + word[i+1:]
                i += 1

            if word in wordDictionary:
                bag[wordDictionary[word]] += 1
    dataset.append(bag)
    
clinton_data = np.array([np.array([text, 0]) for text in clinton_data])
trump_data = np.array([np.array([text, 1]) for text in trump_data])

data = np.concatenate((clinton_data, trump_data), axis=0)

x_train, x_test, y_train, y_test = sk_split(data[:, 0], data[:, 1], test_size = 0.25, random_state = 42)

wordDictionary = {}
wordCount = 0
            
for text in x_train:
    wordCount = loadDictionary(text, wordCount)

trainingBags = []
testingBags = []
for text in x_train:
    bagOfWords(text, trainingBags, wordCount)
    
for text in x_test:
    bagOfWords(text, testingBags, wordCount)

model = linear_model.LogisticRegression()
model.fit(trainingBags, y_train)
print model.score(trainingBags, y_train)
print model.score(testingBags, y_test)

0.997413614732
0.98417132216


In [21]:
def bagOfChars(text, dataset, cc):
    bag = np.zeros(cc)
    words = text.lower().split()
    
    i = 0
    while i < len(words): 
        if "t.co" in words[i]:
            del words[i]
        i += 1
    
    text = " ".join(words)
            
    charCount = 0
    for char in text:
        index = None
        if char.isalpha():
            index = ord(char) - ord('a') + 1
        elif char.isdigit():
            index = ord(char) - ord('0') + 27
        elif char == " ":
            index = 0
                             
        if index:
            bag[index] += 1
    
    dataset.append(bag)

clinton_data = np.array([np.array([text, 0]) for text in clinton_data])
trump_data = np.array([np.array([text, 1]) for text in trump_data])

data = np.concatenate((clinton_data, trump_data), axis=0)

x_train, x_test, y_train, y_test = sk_split(data[:, 0], data[:, 1], test_size = 0.25, random_state = 42)

charCount = 37

trainingBags = []
testingBags = []
for text in x_train:
    bagOfChars(text, trainingBags, charCount)
    
for text in x_test:
    bagOfChars(text, testingBags, charCount)

model = linear_model.LogisticRegression()
model.fit(trainingBags, y_train)
print model.score(trainingBags, y_train)
print model.score(testingBags, y_test)

0.689116490792
0.690564866543
