In [1]:
%load_ext autoreload

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.tokenize import TweetTokenizer
from sklearn.svm import LinearSVC
import helpers



In [2]:
# Load training set
pos_filename = 'twitter-datasets/train_pos_full_u.txt'
neg_filename = 'twitter-datasets/train_neg_full_u.txt'
pos_tweets = helpers.txt_to_list(pos_filename)
neg_tweets = helpers.txt_to_list(neg_filename)

# Create a labeled dataset 
all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets)

# Split into train and validation sets
training_fraction = 0.8
train, val, y_train, y_val = helpers.split_dataset(training_fraction, all_tweets, y)

### Compute TF-IDF

In [3]:
# Build training vectorization 
tfidf_vectorizer=TfidfVectorizer(use_idf=True, ngram_range = (1,2), tokenizer = TweetTokenizer().tokenize) 
X_train = tfidf_vectorizer.fit_transform(train)
X_val = tfidf_vectorizer.transform(val)

In [4]:
first_vector_tfidfvectorizer=X_train[0] 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)


Unnamed: 0,tfidf
continue tomorrow,0.245828
go can,0.241714
we continue,0.230877
8 so,0.229880
exam at,0.213982
...,...
dances round,0.000000
dances out,0.000000
dances on,0.000000
dances of,0.000000


### Test a few classifiers on validation set

In [5]:
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0, tol=1e-9, loss = 'squared_hinge', dual = True, C = 0.03)
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 86.47% / validation set: 85.01%


In [6]:
from sklearn.naive_bayes import MultinomialNB


    

clf = MultinomialNB()
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 86.71% / validation set: 81.65%


In [7]:
from sklearn.naive_bayes import BernoulliNB


    

clf = BernoulliNB()
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [8]:
from sklearn.naive_bayes import BernoulliNB


clf = BernoulliNB()
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [9]:
from sklearn import linear_model

clf =  linear_model.SGDClassifier(loss = 'log', max_iter=int(1e7), tol=1e-5, verbose = False)
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 78.49% / validation set: 78.48%


In [10]:
from sklearn import linear_model

clf =  linear_model.Perceptron()
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 96.80% / validation set: 81.79%


In [12]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=0)  
clf.fit(X_train, y_train)


helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

MemoryError: Unable to allocate 397. MiB for an array with shape (52029241,) and data type float64

### Predictions

In [None]:
# Prepare test set
test_tweets = []
with open('twitter-datasets/test_data.txt', encoding = 'utf-8') as f:
    for line in f:
        sp = line.split(',')
        index = sp[0]
        test_tweets.append(','.join(sp[1:]))
        
# Compute tf-idf on full training set      
tfidf_vectorizer=TfidfVectorizer(use_idf=True, ngram_range = (1,2), tokenizer = TweetTokenizer().tokenize) 
X_train_final = tfidf_vectorizer.fit_transform(all_tweets)
X_test = tfidf_vectorizer.transform(test_tweets)

# Check training accuracy
clf = LinearSVC(random_state=0, tol=1e-9, loss = 'squared_hinge', dual = True, C = 0.03)
clf.fit(X_train_final, y)

train_acc = (clf.predict(X_train_final) == y).mean()
print('Training set accuracy: {:.2f}%'.format(100*train_acc))

In [None]:

# Save predictions
save_filename = 'submissions/submission_tfidf.csv'
predictions = clf.predict(X_test)
helpers.save_pred(save_filename, predictions)

In [None]:
np.save('embeddings/tfidf_unique_full_train.npy', X_train_final)
np.save('embeddings/tfidf_unique_full_labels.npy', y)
np.save('embeddings/tfidf_unique_full_test.npy', X_test)

In [None]:
X_train_final = np.load('embeddings/tfidf_unique_full_train.npy', allow_pickle = True)
y = np.load('embeddings/tfidf_unique_full_labels.npy', allow_pickle = True)
X_test = np.load('embeddings/tfidf_unique_full_test.npy', allow_pickle = True)

In [None]:
X_train_final