# Term frequency–inverse document frequency (TF-IDF)
## Classification with Linear SVC

## Reproducibility
After running this notebook, you will obtain the model used for Submission **#109900** on AIcrowd

| Accuracy | F1 |
|:---:|:---:|
| 86.2% | 86.5% |

### Import modules and download dataset

In [2]:
%load_ext autoreload

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.svm import LinearSVC
from nltk.tokenize import TweetTokenizer
import helpers
from preprocessing import process_sentence, to_vec, split_hashtag, remove_repeats, remove_informal_contractions
import pickle
import os 
import wget
# Directory that receives the downloaded corpora and the pickled artifacts.
root = 'data/'

os.makedirs(root, exist_ok=True)

# Seed used for the stochastic steps of this notebook.
seed = 0


def _download_if_missing(url, filename):
    """ Download a file unless it is already present on disk.

    The `wget` package does not overwrite an existing target: a repeated
    download is stored under a numbered name next to it, so re-running the
    notebook would fetch everything again without refreshing the file that
    is actually read. Skipping the download keeps the cell idempotent.

    Parameters
    ----------
        url: string
            location of the remote file
        filename: string
            local path the file is stored at

    Returns
    -------
        filename: string
            the local path, unchanged
    """
    if not os.path.exists(filename):
        wget.download(url, filename)
    return filename


# Download negative full
neg_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQ0eDZMdDI5WXBlVXYyZGc_ZT1ZZDJn/root/content'
neg_filename = root + 'train_neg_full_u.txt'
_download_if_missing(neg_url, neg_filename)
neg_tweets = helpers.txt_to_list(neg_filename)

# Download positive full
pos_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQzcTc3QmNPbUdIWHQ3TXc_ZT01ejdG/root/content'
pos_filename = root + 'train_pos_full_u.txt'
_download_if_missing(pos_url, pos_filename)
pos_tweets = helpers.txt_to_list(pos_filename)

# Create a labeled dataset (shuffled with the notebook-level seed)
all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets, seed = seed)

# Prepare test set
test_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDR5Q3hoWXM4T2FJd1JLenc_ZT1hSXh0/root/content'
test_filename = root + 'test.txt'
_download_if_missing(test_url, test_filename)

# Every test line starts with an index field followed by a comma; keep only
# the text after that first comma (the tweet itself may contain commas).
with open(test_filename, encoding = 'utf-8') as f:
    test_tweets = [line.partition(',')[2] for line in f]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
100% [............................................................................] 817297 / 817297

### Set up the TF-IDF vectorizer and pre-processing

In [3]:
# Clean-up steps handed to `process_sentence`, listed in the order
# hashtag splitting first, repeat removal second.
preproc_pipeline = [
    to_vec(split_hashtag),
    to_vec(remove_repeats),
]
# Built once and shared by every call: `tk` is the vectorizer's tokenizer and
# runs once per tweet, so there is no need to construct a new TweetTokenizer
# for each of them.
_tweet_tokenizer = TweetTokenizer()


def tk(sent):
    """ Tokenize a tweet.
    
    The tweet is split with NLTK's TweetTokenizer and the resulting tokens
    are then passed through `preproc_pipeline` by `process_sentence`.
    
    Parameters
    ----------
        sent: string
            a tweet
        
    Returns
    -------
        tokens: list of strings
            a tokenized version of the string
    """
    tokens = _tweet_tokenizer.tokenize(sent)
    tokens = process_sentence(tokens, preproc_pipeline)
    return tokens

# TF-IDF over unigrams and bigrams of the tokens produced by `tk`,
# with IDF weighting and sublinear term-frequency scaling enabled.
vect = TfidfVectorizer(
    tokenizer=tk,
    ngram_range=(1, 2),
    use_idf=True,
    sublinear_tf=True,
)

### Compute TF-IDF on the full training set and transform the test set

In [4]:
# Fit the vectorizer on the training tweets only, then map the test tweets
# into the feature space it learned.
X_train_final = vect.fit_transform(all_tweets)
X_test = vect.transform(test_tweets)

# Pickle the fitted vectorizer next to the downloaded data so it can be
# reloaded later without refitting.
save_filename = root + 'tf-idf_fitted_vectorizer.pkl'
with open(save_filename, 'wb') as pkl_out:
    pickle.dump(vect, pkl_out)

### Train a Linear Support Vector Classifier

In [5]:
# Linear SVM on the TF-IDF features. `random_state` reuses the notebook-level
# `seed` instead of repeating the literal, so one value controls both the
# shuffle and the classifier.
clf = LinearSVC(
    C = 0.03,
    loss = 'squared_hinge',
    dual = True,
    tol = 1e-9,
    random_state = seed,
)
clf.fit(X_train_final, y)

# Save the trained classifier
save_filename = root + 'tf-idf_trained_linearSVC.pkl'
with open(save_filename, 'wb') as file:
    pickle.dump(clf, file)

### Check training accuracy

In [6]:
# Share of training tweets whose predicted label equals the true label.
# This is measured on the data the model was fitted on, so it is not an
# estimate of performance on unseen tweets.
train_predictions = clf.predict(X_train_final)
is_correct = train_predictions == y
train_acc = is_correct.mean()
print('Training set accuracy: {:.2f}%'.format(100 * train_acc))

Training set accuracy: 86.72%


### Save predictions

In [7]:
# Predict a label for every test tweet and hand the result to
# `helpers.save_pred` together with the submission file name.
predictions = clf.predict(X_test)
save_filename = 'submission_tfidf.csv'
helpers.save_pred(save_filename, predictions)