In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import svm

# import nltk
# import matplotlib.pyplot as plt
# from torchtext.vocab import GloVe
# from wordcloud import WordCloud

In [2]:
data_type = {"date": "string", "id": "string", "coordinates": "string", "full_text":"string", "sentiment": "int", "key_words":"string", "hashtags":"string"}
data = pd.read_csv("tweet_data.csv", dtype=data_type)

X, y = data["full_text"], data["sentiment"]
y = y.replace({1: 0, -2: -1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

In [18]:
steps = [('tfidf', TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_df=0.6, min_df=5)), 
         ('clf', svm.SVC(kernel='sigmoid', C=1, gamma=1))]
model = Pipeline(steps)
model.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.6, min_df=5, ngram_range=(1, 2),
                                 stop_words='english')),
                ('clf', SVC(C=1, gamma=1, kernel='sigmoid'))])

In [19]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

def print_accuracy(name, test, pred):
    print("{:17} {:.4}%".format(name + " F1:", f1_score(test, pred, average='macro') * 100))
    print("{:17} {:.4}%".format(name + " accuracy:", accuracy_score(test, pred) * 100))
    
print_accuracy("train", y_train, y_train_pred)
print_accuracy("test", y_test, y_test_pred)

train F1:         82.37%
train accuracy:   82.89%
test F1:          72.94%
test accuracy:    73.43%


In [13]:
# parameter tuning

steps = [('tfidf', TfidfVectorizer()), 
         ('clf', svm.SVC())]
pipeline = Pipeline(steps)
parameters = {'tfidf__ngram_range': ((1,2),),
              'tfidf__max_df': (0.3, 0.4, 0.5, 0.6),
              'tfidf__min_df': (5, 7, 9),
              'clf__C': (1,),
              'clf__gamma': (1,)}

clf = GridSearchCV(pipeline, parameters, verbose=1)
clf.fit(X_train, y_train)

clf.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  2.5min finished


{'clf__C': 1,
 'clf__gamma': 1,
 'tfidf__max_df': 0.6,
 'tfidf__min_df': 5,
 'tfidf__ngram_range': (1, 2)}

In [9]:
clf.best_params_

{'clf__C': 1,
 'clf__gamma': 1,
 'tfidf__max_df': 0.49,
 'tfidf__min_df': 7,
 'tfidf__ngram_range': (1, 2)}