In [8]:
import numpy as np
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords,wordnet
from nltk import pos_tag
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [9]:
df_train = pd.read_csv("training_twitter_x_y_train.csv")
df_test = pd.read_csv("test_twitter_x_test.csv")

In [10]:
Y_train = np.array(df_train["airline_sentiment"])
X_train = np.array(df_train["text"])

X_test = np.array(df_test["text"])

In [11]:
stop_words = set(stopwords.words('english'))
punctuations_list = list(string.punctuation)
stop_words.update(punctuations_list)

In [23]:
## Remember it as an example...Its Important 
from nltk import pos_tag
w = "Amazing"
print(pos_tag(w))
print(pos_tag([w]))

[('A', 'DT'), ('m', 'NN'), ('a', 'DT'), ('z', 'NN'), ('i', 'NN'), ('n', 'VBP'), ('g', 'NN')]
[('Amazing', 'VBG')]


In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
def get_simple_pos(w):
    if w.startswith('J'):
        return wordnet.ADJ
    elif w.startswith('V'):
        return wordnet.VERB
    elif w.startswith('N'):
        return wordnet.NOUN
    elif w.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_the_data(text):
    tokenized_words = word_tokenize(text)
    output_words_list =[]
    for word in tokenized_words:
        if word.lower() not in stop_words:
            pos = pos_tag([word])  ## Here, we should pass the word in the form of list as discussed above
            clean_word = lemmatizer.lemmatize(word,pos = get_simple_pos(pos[0][1]))
            output_words_list.append(clean_word.lower())
    return " ".join(output_words_list)

In [14]:
X_train = [clean_the_data(review) for review in X_train]

In [15]:
X_test = [clean_the_data(review) for review in X_test]

In [16]:
count_vec = CountVectorizer(max_features = 3000)
X_train_transformed = count_vec.fit_transform(X_train)
# print(X_train_transformed.todense())
# print(count_vec.get_feature_names())

In [17]:
x1,x2,y1,y2 = train_test_split(X_train_transformed,Y_train)

In [18]:
## Using grid_search to find best C & gamma values
clf = SVC()
grid = {'C':[1e2, 1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [1e-3, 5e-4, 1e-4,5e-3]}
cv_object = GridSearchCV(clf,grid)
cv_object.fit(x1, y1)
cv_object.best_estimator_

SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [19]:
clf = SVC(C=100,gamma=0.001)  ## From the above grid_search, we see that C=100 & gamma=0.001 will work best for us
clf.fit(x1, y1)
Y_pred = clf.predict(x2)
print("Accuracy Score : ",clf.score(x2,y2))
print("Confusion Matrix : ",confusion_matrix(y2,Y_pred))
print("Classification Report : ",classification_report(y2,Y_pred))

Accuracy Score :  0.7868852459016393
Confusion Matrix :  [[1537  141   50]
 [ 185  327   54]
 [  82   73  296]]
Classification Report :                precision    recall  f1-score   support

    negative       0.85      0.89      0.87      1728
     neutral       0.60      0.58      0.59       566
    positive       0.74      0.66      0.70       451

    accuracy                           0.79      2745
   macro avg       0.73      0.71      0.72      2745
weighted avg       0.78      0.79      0.78      2745



   ## Now predicting for the given test dataset

In [20]:
count_vec = CountVectorizer(max_features = 3000)
X_train_transformed = count_vec.fit_transform(X_train)
X_test_transformed = count_vec.transform(X_test)

In [21]:
clf = SVC(C=100,gamma=0.001)
clf.fit(X_train_transformed,Y_train)
Y_pred = clf.predict(X_test_transformed)
#print(Y_pred)

In [22]:
## Saving the predicions to the external csv file
file = open('Twitter_sentiment_analysis_predictions_file_using_SVM.csv', 'w') 
for i in range(len(Y_pred)):
    file.write(f"{Y_pred[i]}\n")