## Import libraries

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import re
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from sklearn.base import BaseEstimator
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

## Load and clean the data

In [8]:
from os import name  # NOTE(review): `name` is not referenced in the visible cells — confirm before removing
# Column labels are passed explicitly through `names=` below
column_names = ['popularity', 'tweet_id', 'tweet_date', 'query', 'user', 'text']
df = pd.read_csv(
    '/content/sentiment140.csv',
    encoding='latin-1',
    engine='python',
    on_bad_lines='skip',  # malformed rows are dropped instead of raising
    names=column_names,
)
df.head()

Unnamed: 0,popularity,tweet_id,tweet_date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


### Extract only the features and label and check for nulls

In [9]:
# Keep only the label column and the tweet body
label_and_text_cols = ['popularity', 'text']
df = df[label_and_text_cols]
# Number of missing values in each remaining column
df.isnull().sum()

Unnamed: 0,0
popularity,0
text,0


### Create the spaCy tokenizer and the text-cleaning transformer

In [11]:
# spaCy's English stop-word set, consulted by the tokenizer when filtering tokens
stopwords = spacy.lang.en.stop_words.STOP_WORDS
# Pretrained small English pipeline, used by the tokenizer to split and lemmatize text
nlp = spacy.load('en_core_web_sm')

def spacy_tokenizer(sentence):
  """Tokenize ``sentence`` with spaCy and return lemmas of its non-stop-word tokens.

  Args:
    sentence: A single text string to tokenize.

  Returns:
    A list of lemma strings, one for every token whose lower-cased text is
    not in the module-level ``stopwords`` set.
  """
  doc = nlp(sentence)
  # Compare the token's text, not the Token object itself: a spaCy Token is
  # never equal to a plain string, so `token not in stopwords` is always True
  # and no stop word would ever be filtered out.
  return [token.lemma_ for token in doc if token.lower_ not in stopwords]

class features(TransformerMixin):
  """Pipeline step that runs ``clean_text`` on every document; it learns nothing from the data."""

  def fit(self, X, y=None, **fit_params):
    """No-op fit: there is nothing to learn, so the transformer itself is returned."""
    return self

  def transform(self, X, **transform_params):
    """Return a list holding ``clean_text(item)`` for each item of ``X``, in order."""
    cleaned = []
    for text in X:
      cleaned.append(clean_text(text))
    return cleaned

  def get_params(self, deep=True):
    """Report that this step has no parameters (always an empty dict)."""
    return {}
#define clean text function
def clean_text(text):
  """Normalize a raw tweet into lower-case, space-separated alphanumeric tokens.

  In a single regex pass the following are replaced by a space:
    * the standalone word ``nan`` (what a missing value becomes under ``str()``),
    * @-mentions, including handles that contain underscores,
    * URLs of the form ``scheme://...``,
    * every character that is not an ASCII letter, digit, space or tab.
  Runs of whitespace are then collapsed and the result is lower-cased.

  Args:
    text: The raw text. ``None`` is treated as empty; any other non-string
      value (e.g. a float NaN) is converted with ``str()`` first.

  Returns:
    The cleaned string (possibly empty).
  """
  if text is None:
    return ""
  if not isinstance(text, str):
    text = str(text)
  # Raw string so the regex escapes are not treated as (invalid) Python string
  # escapes. `\bnan\b` only matches the whole word, so "bananas" stays intact;
  # `@\w+` also consumes underscores, so no part of a handle leaks through.
  noise = r"(\bnan\b)|(@\w+)|([^0-9A-Za-z \t])|(\w+://\S+)"
  text = ' '.join(re.sub(noise, " ", text).split())
  text = text.strip().lower()
  return text

## Encode the labels and split into training and test sets

In [12]:
# Replace the raw label values with consecutive integer codes
le = LabelEncoder()
df['popularity'] = le.fit_transform(df['popularity'])
X = df['text']
y = df['popularity']

# Hold out 20% for testing; stratify so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
print(f'X_train dimension: {X_train.shape}; y_train dimension: {y_train.shape}')
# The second value printed here is the *test* label shape, so label it y_test
print(f'X_test dimension: {X_test.shape}; y_test dimension: {y_test.shape}')

X_train dimension: (1225320,); y_train dimension: (1225320,)
X_test dimension: (306331,); y_train dimension: (306331,)


# Build the models

## Model 1: Use pipeline and SVM

In [None]:
# Start the wall-clock timer for this cell
t0 = time()

# TF-IDF over unigrams, tokenized by the spaCy-based tokenizer
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
# Support-vector classifier with scikit-learn's default settings
# NOTE(review): per the scikit-learn SVC docs, fit time grows at least
# quadratically with the number of samples; the training split printed above
# has ~1.2M rows, so confirm this fit is feasible on the full data.
classifier = SVC()

# Chain the stages: clean -> vectorize -> classify
steps = [
    ('cleaner', features()),
    ('vectorizer', tfidf),
    ('classifier', classifier),
]
pipeline = Pipeline(steps)

# Train every stage on the training split
pipeline.fit(X_train, y_train)

# Report how long the cell took
print(f'Time taken: {np.round(time()-t0,2)} seconds')



## Look at results

In [None]:
# Derive the display names from the classes the LabelEncoder actually saw, so
# the number of names always matches the number of classes in the data
# (a hard-coded three-name list breaks the plot and the classification report
# whenever the data does not contain exactly three classes).
# NOTE(review): assumes the raw labels use the Sentiment140 polarity codes
# 0 = negative, 2 = neutral, 4 = positive — confirm against the dataset.
# Unknown codes fall back to their string form.
sentiment_names = {0: 'negative', 2: 'neutral', 4: 'positive'}
target_names = [sentiment_names.get(code, str(code)) for code in le.classes_]

# One raw-count matrix and one row-normalized ('true') matrix
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(pipeline, X_test, y_test,
                                 display_labels= target_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

In [None]:
# Score the held-out tweets with the fitted pipeline
y_pred = pipeline.predict(X_test)
# Build the per-class text report, then show it
report = classification_report(y_test, y_pred, target_names = target_names)
print(report)

## Results
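**TODO:** summarize how the model performed (accuracy and per-class precision/recall) once the cells above have been run.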

## Model 2: Pipeline with a grid search over the TF-IDF and SVC hyperparameters


In [None]:
#time the code
t0 = time()
#tokenizer
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer)
#create classifier
classifier = SVC()
#create the pipeline
pipeline = Pipeline(
    [
        ('cleaner', features()),
        ('vectorizer', tfidf),
        ('classifier', classifier)
    ]
)
#parameters: keys are '<step name>__<parameter>' so GridSearchCV can route
#each value to the matching pipeline step
parameters = {
    'vectorizer__max_df': (0.5, 1.0),
    'vectorizer__ngram_range': ((1, 1), (1,2)),  # unigrams or bigrams
    'vectorizer__use_idf': (True, False),
    'classifier__C': (0.1, 1, 10),
    'classifier__kernel': ('linear', 'rbf'),
    'classifier__gamma': ('scale', 'auto')
}

#perform gridsearch
#the keyword is `n_jobs` (plural); `n_job` is not a GridSearchCV argument and
#raises TypeError before any fitting starts
grid_search = GridSearchCV(pipeline, parameters, n_jobs=None, verbose=1)
grid_search.fit(X_train, y_train)
#time taken
print(f'Time taken: {np.round(time()-t0,2)} seconds')

### Best parameters


In [None]:
# Mean cross-validated score of the winning parameter combination
print(f"Best score= {np.round(grid_search.best_score_,2)}")
print("Best parameters set:")
# Look up the winning value of every parameter that was part of the search
best_parameters = grid_search.best_estimator_.get_params()
searched_names = sorted(parameters)
for param_name in searched_names:
    chosen_value = best_parameters[param_name]
    print("\t%s: %r" % (param_name, chosen_value))

### Look at results

In [None]:
# Derive the display names from the classes the LabelEncoder actually saw, so
# the number of names always matches the number of classes in the data.
# NOTE(review): assumes the raw labels use the Sentiment140 polarity codes
# 0 = negative, 2 = neutral, 4 = positive — confirm against the dataset.
# Unknown codes fall back to their string form.
sentiment_names = {0: 'negative', 2: 'neutral', 4: 'positive'}
target_names = [sentiment_names.get(code, str(code)) for code in le.classes_]

# GridSearchCV fits clones of `pipeline`, so `pipeline` itself is still
# unfitted here. Evaluate the refitted winner of the search instead.
best_model = grid_search.best_estimator_

# One raw-count matrix and one row-normalized ('true') matrix
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test,
                                 display_labels= target_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()