In [2]:
import preprocessing as pp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import pos_tag, word_tokenize
from nltk.probability import FreqDist
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

def load_data(full = True):
    """
    Loads the Twitter data.
    
    Args:
    full (bool): if True, reads the training files whose names carry the
        '_full' suffix (train_pos_full.txt / train_neg_full.txt); if False,
        reads the smaller train_pos.txt / train_neg.txt
    
    Returns:
    tweets (pandas dataframe): positive tweets (label 1) followed by negative
        tweets (label -1), in columns 'body' and 'label'; the index is not
        reset after concatenation, so index values repeat across the two halves
    test_data (pandas dataframe): unlabelled tweets from test_data.txt in a
        single 'body' column, with the leading id removed
    
    Raises:
    FileNotFoundError: if one of the expected files is missing from the
        current working directory
    IndexError: if a line of test_data.txt contains no comma
    """
    # The suffix must actually be part of the file names; without it the
    # `full` flag is ignored and both settings read the same (small) files.
    suffix = '_full' if full else ''
    pos_path = 'train_pos' + suffix + '.txt'
    neg_path = 'train_neg' + suffix + '.txt'
    test_path = 'test_data.txt'

    def _read_lines(path):
        # One entry per line with trailing whitespace/newline stripped.
        # Decode as UTF-8 explicitly instead of relying on the platform
        # default encoding (assumes the data files are UTF-8 -- confirm).
        with open(path, encoding='utf-8') as file:
            return [line.rstrip() for line in file]

    pos_tweets = pd.DataFrame(_read_lines(pos_path), columns=['body'])
    pos_tweets['label'] = 1

    neg_tweets = pd.DataFrame(_read_lines(neg_path), columns=['body'])
    neg_tweets['label'] = -1

    # Each test line is "<id>,<tweet>": split on the first comma only, so
    # commas inside the tweet text are kept.
    test_rows = [line.split(',', 1)[1] for line in _read_lines(test_path)]
    test_data = pd.DataFrame(test_rows, columns=['body'])

    # merge positive and negative datasets (row order: all positives first)
    tweets = pd.concat([pos_tweets, neg_tweets], axis = 0)

    return tweets, test_data

# Load the smaller (non-full) training files plus the unlabelled test set.
tweets_raw, test_data_raw = load_data(full = False)

# Last expression of the cell: shows the raw labelled tweets as the cell output.
tweets_raw

Unnamed: 0,body,label
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,""" <user> just put casper in a box ! "" looved t...",1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1
...,...,...
99995,can't wait to fake tan tonight ! hate being pale,-1
99996,<user> darling i lost my internet connection ....,-1
99997,kanguru defender basic 4 gb usb 2.0 flash driv...,-1
99998,rizan is sad now,-1


In [5]:
# Work on a copy so that tweets_raw keeps the unprocessed text.
tweets = tweets_raw.copy()
# preprocess_data comes from the local `preprocessing` module; its exact
# cleaning steps are not visible in this notebook.
tweets['body'] = pp.preprocess_data(tweets['body'])
print(tweets)

                                                    body  label
0      not know justin read mention not only justin g...      1
1                logic dumb not even crop name photo tsk      1
2      put casper box ! looved battle ! <hashtag> cra...      1
3      thanks sir not trip lil mama .. keep doin ya t...      1
4      visit brother tmr bestest birthday gift eveerr...      1
...                                                  ...    ...
99995           cannot wait fake tan tonight ! hate pale     -1
99996  darling lose internet connection .. seem not c...     -1
99997  kanguru defender basic <number> gb usb <number...     -1
99998                                          rizan sad     -1
99999                             no text back ? yea mad     -1

[200000 rows x 2 columns]


In [6]:
# logistic regression using tf-idf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression

def cv(data):
    """
    Builds bag-of-words count features.

    Args:
    data: the documents passed to CountVectorizer.fit_transform

    Returns:
    emb: result of fitting a default-configured CountVectorizer on data
        and transforming data with it
    count_vectorizer: the fitted CountVectorizer, reusable on other data
        through its transform method
    """
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(data), vectorizer

def tfidf(data):
    """
    Builds TF-IDF features from single-word terms.

    Args:
    data: the documents passed to TfidfVectorizer.fit_transform

    Returns:
    train: result of fitting the vectorizer on data and transforming data
    tfidf_vectorizer: the fitted TfidfVectorizer, reusable on other data
        through its transform method
    """
    # Word-level unigrams only, vocabulary capped at 10000 features.
    vectorizer = TfidfVectorizer(
        analyzer = 'word',
        ngram_range = (1, 1),
        min_df = 1,
        max_features = 10000,
    )
    features = vectorizer.fit_transform(data)
    return features, vectorizer

def get_metrics(y_test, y_predicted):
    """
    Scores predictions against the true labels.

    Precision, recall and F1 are requested with average='weighted'
    (and pos_label=None), i.e. as weighted averages over the classes
    rather than as scores of a single positive class.

    Args:
    y_test: true labels
    y_predicted: predicted labels, aligned with y_test

    Returns:
    (accuracy, precision, recall, f1), in that order
    """
    # Keyword arguments shared by the three averaged scores.
    averaging = {'pos_label': None, 'average': 'weighted'}

    # fraction of predictions that match the true label
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, **averaging)
    recall = recall_score(y_test, y_predicted, **averaging)
    # F1 is the harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, **averaging)

    return accuracy, precision, recall, f1

In [7]:
# Texts and labels as plain Python lists, taken from the same dataframe so
# they stay aligned row by row.
list_corpus = tweets["body"].tolist()
list_labels = tweets["label"].tolist()

# Hold out 20% of the tweets for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2, 
                                                                                random_state=40)
# Bag-of-words counts: fitted on the training split only, then applied to the
# test split. No model in the cells shown here consumes these two matrices.
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# TF-IDF features, again fitted on the training split only.
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Logistic regression on the TF-IDF features.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before upgrading.
clf_tfidf = LogisticRegression(C=1.0, class_weight='balanced', solver='newton-cg', 
                         multi_class='multinomial', n_jobs=-1, random_state=40)
clf_tfidf.fit(X_train_tfidf, y_train)

# Predict on the held-out split and score the predictions.
y_predicted_tfidf = clf_tfidf.predict(X_test_tfidf)

accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = get_metrics(y_test, y_predicted_tfidf)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_tfidf, precision_tfidf, 
                                                                       recall_tfidf, f1_tfidf))

accuracy = 0.797, precision = 0.797, recall = 0.797, f1 = 0.797


In [8]:
# random forest classifier
# accuracy obtained with 10000 features: 0.802

from sklearn.ensemble import RandomForestClassifier
# Random forest with 1000 estimators, trained on the TF-IDF features produced
# by the logistic-regression cell (X_train_tfidf / X_test_tfidf must exist).
rf_tfidf = RandomForestClassifier(criterion='gini', n_estimators=1000,
                               random_state=1, n_jobs=-1)
rf_tfidf.fit(X_train_tfidf, y_train)

# Predict on the held-out split and score the predictions.
y_predicted_rf_tfidf = rf_tfidf.predict(X_test_tfidf)

accuracy_rf_tfidf, precision_rf_tfidf, recall_rf_tfidf, f1_rf_tfidf = get_metrics(y_test, y_predicted_rf_tfidf)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_rf_tfidf, precision_rf_tfidf, 
                                                                       recall_rf_tfidf, f1_rf_tfidf))

accuracy = 0.802, precision = 0.803, recall = 0.802, f1 = 0.802


In [None]:
# accuracy obtained with 10000 features: 0.815
# takes some time to run
from sklearn.svm import SVC

# RBF-kernel SVM (C=10, gamma=2) on the same TF-IDF features as the models
# above; this cell has no recorded output in the notebook.
svm_tfidf = SVC(kernel = 'rbf', C = 10.0, random_state=1, gamma=2)

svm_tfidf.fit(X_train_tfidf, y_train)
# Predict on the held-out split and score the predictions.
y_predicted_svm_tfidf = svm_tfidf.predict(X_test_tfidf)

accuracy_svm_tfidf, precision_svm_tfidf, recall_svm_tfidf, f1_svm_tfidf = get_metrics(y_test, y_predicted_svm_tfidf)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_svm_tfidf, precision_svm_tfidf, 
                                                                       recall_svm_tfidf, f1_svm_tfidf))

In [None]:
# takes too much time to run, even with 500 features instead of 10000
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Pipeline: scaling followed by an SVM. with_mean=False scales without
# centering -- presumably because the TF-IDF input is sparse; confirm.
pipe_svc = make_pipeline(StandardScaler(with_mean=False),
                        SVC(random_state = 1))
# Candidate values shared by C and gamma.
param_range = [0.1, 1.0, 10.0, 100.0]
# Two grids: linear kernel over C (4 settings) and RBF kernel over
# C x gamma (16 settings). The 'svc__' prefix addresses the SVC pipeline step.
param_grid = [{'svc__C': param_range,
              'svc__kernel': ['linear']},
              {'svc__C': param_range,
             'svc__gamma': param_range,
             'svc__kernel': ['rbf']}]  
# Grid search scored by accuracy with cv = 10, using all available cores.
gs = GridSearchCV(estimator = pipe_svc,
                 param_grid = param_grid,
                 scoring = 'accuracy',
                 cv = 10,
                 n_jobs=-1)
# Fit on the training-split TF-IDF features, then report the best
# cross-validated score found.
gs = gs.fit(X_train_tfidf, y_train)
print(gs.best_score_)