In [27]:
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
import pandas as pd

# Read the tweet dataset from the current working directory and preview
# the first five rows to sanity-check the columns.
data = pd.read_csv(filepath_or_buffer=r"./Tweets.csv")
data.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [28]:
# Fixed seed so the 80/20 split (and every metric computed from it) is
# reproducible under Restart Kernel -> Run All. Without it, train_test_split
# shuffles differently on every execution.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["sentiment"],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Show the number of rows in each split.
len(X_train), len(X_test)


(21984, 5497)

In [29]:
# Shared NLP resources, built once and reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership check is a hash lookup.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize one raw text value into a space-joined string of lemmas.

    Non-string values and blank/whitespace-only strings yield "".
    Otherwise the text is tokenized with ``word_tokenize``, tokens whose
    lowercase form is in ``stop_words`` are dropped, and each remaining
    token is lowercased and lemmatized with ``lemmatizer``.

    Args:
        text: The raw value to clean (expected to be a ``str``).

    Returns:
        str: The surviving lemmas joined by single spaces, or "".
    """
    # Guard clause: nothing to tokenize for missing or empty input.
    if not isinstance(text, str) or not text.strip():
        return ""

    lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return " ".join(lemmas)

# Clean both splits with the same function; missing texts are replaced by
# empty strings before cleaning.
X_train, X_test = (
    split.fillna("").apply(preprocess) for split in (X_train, X_test)
)
print(X_train.head(5))

# TF-IDF features with sublinear term-frequency scaling. The vocabulary and
# IDF weights are learned from the training split only and then reused to
# transform the test split.
vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)

X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Shape of the training feature matrix.
X_train_vectors.shape

24280    got talent u nvr hear sing dance call mini breezy
26679                               lot noisy peep outside
18690                             going hate around baby ?
14709                           oh boy going loooooong day
7717                                       ` stop coughing
Name: text, dtype: object


(21984, 21399)

In [30]:
# Perform classification with SVM, kernel=linear, timing the training and
# prediction phases separately.
# time.perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, so the durations
# cannot be skewed by system clock adjustments.
classifier_linear = svm.SVC(kernel='linear')
t0 = time.perf_counter()
classifier_linear.fit(X_train_vectors, y_train)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(X_test_vectors)
t2 = time.perf_counter()
time_linear_train = t1 - t0
time_linear_predict = t2 - t1

# results
print(f"Training time: {time_linear_train}s; Prediction time: {time_linear_predict}s")
# output_dict=True returns the report as a dict, indexed below by class label.
report = classification_report(y_test, prediction_linear, output_dict=True)
for label in ('positive', 'negative', 'neutral'):
    print(f'{label}: ', report[label])

Training time: 22.039151191711426s; Prediction time: 2.6135427951812744s
positive:  {'precision': 0.7925925925925926, 'recall': 0.7278911564625851, 'f1-score': 0.7588652482269503, 'support': 1764.0}
negative:  {'precision': 0.740916271721959, 'recall': 0.615485564304462, 'f1-score': 0.6724014336917563, 'support': 1524.0}
neutral:  {'precision': 0.6487935656836461, 'recall': 0.7668628338614758, 'f1-score': 0.7029045643153526, 'support': 2209.0}
