# Sentiment Analysis Classifier
by A4Ayub Notebooks (http://www.a4ayub.me)

### Import Libraries

In [1]:
# Standard Imports
import pandas as pd
import numpy as np
import os 
import json
import warnings

# Data Cleaning Imports
from bs4 import BeautifulSoup
import re

# Modeling Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

# Algorithms to be used
from sklearn.svm import LinearSVC

# Testing
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import classification_report,accuracy_score,precision_score, recall_score, f1_score, confusion_matrix


In [2]:
# Import the data
data_df = pd.read_csv("datasets/sentiments.csv",engine="python");

In [3]:
# Explore the data
data_df.sample(5)

Unnamed: 0,ï»¿Sentiment,SentimentText,Unnamed: 2
7085,0,#hosteurope offline,
34795,0,@ALOliver it's apparently its national i'm a p...,
27160,1,@aarongillespie i am determined to meet you at...,
8918,1,#3wordsaftersex &quot;my husband's home!&quot;...,
11848,0,*almost* breathing well enough to go for a run...,


In [4]:
# check the data types
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39894 entries, 0 to 39893
Data columns (total 3 columns):
ï»¿Sentiment     39894 non-null int64
SentimentText    39894 non-null object
Unnamed: 2       11 non-null object
dtypes: int64(1), object(2)
memory usage: 935.1+ KB


In [5]:
# rename columns
column_dict = {"ï»¿Sentiment":"sentiment","SentimentText":"message"}
data_df.rename(columns=column_dict,inplace=True)

In [6]:
# confirm the changes and explore sample data
data_df = data_df[["sentiment","message"]]
data_df.sample(5)

Unnamed: 0,sentiment,message
26473,0,@a_lecklr I wanna be there! Boo!
33845,0,@allegra0 what?
39760,1,@anddog1 why limit your story to 140 ch. Tell ...
31650,0,@Adrienne_Bailon Aww no! Thats not good
30471,1,@AdiBecic congrats


In [7]:
# Check proportions
data_df["sentiment"].value_counts()

1    20887
0    19007
Name: sentiment, dtype: int64

In [8]:
# Function to clean data
# Perform Data Cleaning on the text
def cleaning(message):
    import nltk
    
    # 1. Remove HTML.
    html_text = BeautifulSoup(message,"html.parser")
    .get_text()
    
    # 2. Remove non-letters.
    letters = re.sub("[^a-zA-Z]", " ", html_text)    
    letters = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', html_text)
    letters = re.sub(' +',' ', html_text)
    letters = re.sub(r"\n", "", letters)    
    letters = re.sub(r"\r", "", letters) 
    letters = re.sub(r"[0-9]", "", letters)
    letters = re.sub(r"\'", "", letters)    
    letters = re.sub(r"\"", "", letters)  
    
    # 3. Remove Any Emails that may be present
    letters = re.sub('\S*@\S*\s?', '', letters)
    
    # 4. Remove any newline characters
    letters = re.sub('\s+', ' ', letters)
    
    # 5. Remove the distracting single quotes
    letters = re.sub(r"\'", "", letters) 
        
    # 6. Convert to lower case.
    letters = letters.lower()
    
    # 7. Tokenize.
    tokens = nltk.word_tokenize(letters)
    
    # 8. Convert the stopwords list to "set" data type.
    #stops = set(nltk.corpus.stopwords.words("english"))
    
    # 9. Remove stop words. 
    #words = [w for w in tokens if not w in stops]
    
    # 10. Stemming
    #words = [nltk.stem.SnowballStemmer('english').stem(w) for w in words]
    
    # 11. Join the words back into one string separated by space, and return the result.
    return " ".join(tokens)

In [9]:
# Perform iterative data cleaning on each of the statements
data_df['cleaned_query'] = data_df['message'].apply(cleaning)
data_df.sample(5)

  ' Beautiful Soup.' % markup)


Unnamed: 0,sentiment,message,cleaned_query
29341,0,@actevil if we ever started a band we should'v...,if we ever started a band we shouldve been cal...
7076,0,#HIN1 #SanLuis streets empty http://www.oem.c...,# hin # sanluis streets empty http : //www.oem...
8576,0,wants to go on Ajax Experience 2009 http://bi...,wants to go on ajax experience http : //bit.ly...
15325,0,*hugs* @Searock_ we get moments like that som...,*hugs* we get moments like that sometimes at t...
24237,0,@21andAMBITIOUS shit tell me the story too! I ...,shit tell me the story too ! i had an outta bo...


In [10]:
# One Hot Encode the target variable
data_df['sentiment_id'], sentiment_mappings = data_df['sentiment'].factorize()
data_df.sample(2)

Unnamed: 0,sentiment,message,cleaned_query,sentiment_id
22978,1,@_spell - you can do it! keep on truckin',- you can do it ! keep on truckin,1
6,1,Juuuuuuuuuuuuuuuuussssst Chillin!!,juuuuuuuuuuuuuuuuussssst chillin ! !,1


In [11]:
# Initialize Term Frequency and Inverse Document Frequency to be used
sentiment_tfidf = TfidfVectorizer(
    sublinear_tf=True, min_df=5, norm='l2', 
    encoding='latin-1', ngram_range=(1, 2), 
    stop_words='english')

In [12]:
# Extract the features from the text
sentiment_features = sentiment_tfidf.
fit_transform(data_df['message'])

In [13]:
print(sentiment_features)

  (0, 6891)	0.6396745431181894
  (0, 3165)	0.7686458735246913
  (1, 5388)	0.33220054519149167
  (1, 5607)	0.27017696154676596
  (1, 5453)	0.41620455985250066
  (1, 8164)	0.438994505690865
  (1, 5618)	0.44697814581861317
  (1, 5454)	0.5009144964119193
  (2, 5845)	0.6566682222194433
  (2, 138)	0.7541795846661163
  (3, 4119)	0.3310383585144315
  (3, 7355)	0.29482378322308933
  (3, 3612)	0.3760688103242043
  (3, 8434)	0.21940341889172538
  (3, 2338)	0.3654774980484365
  (3, 36)	0.32551207994069087
  (3, 4392)	0.16124860934195148
  (3, 2155)	0.4111076852715473
  (3, 4130)	0.42353792397724305
  (4, 7930)	0.2510617513721877
  (4, 5324)	0.4600475910947654
  (4, 1424)	0.4334024559145169
  (4, 1824)	0.514779631823133
  (4, 7730)	0.5220042651101026
  (5, 4392)	0.4500785966897305
  :	:
  (39891, 8627)	0.242550105591907
  (39891, 1758)	0.2866907111182115
  (39891, 4935)	0.18970644528951536
  (39891, 2090)	0.3620282215078353
  (39891, 3271)	0.25248728959462996
  (39891, 1081)	0.38293758393201005
  (

In [14]:
# Extract the labels
sentiment_labels = data_df['sentiment_id']

In [15]:
print(sentiment_labels)

0        0
1        0
2        1
3        0
4        0
5        0
6        1
7        0
8        1
9        1
10       0
11       1
12       0
13       0
14       0
15       0
16       0
17       1
18       0
19       0
20       0
21       0
22       1
23       0
24       0
25       0
26       0
27       0
28       1
29       0
        ..
39864    0
39865    0
39866    1
39867    1
39868    1
39869    0
39870    0
39871    1
39872    0
39873    1
39874    0
39875    0
39876    1
39877    1
39878    0
39879    1
39880    0
39881    1
39882    1
39883    0
39884    0
39885    0
39886    1
39887    1
39888    0
39889    0
39890    1
39891    0
39892    1
39893    1
Name: sentiment_id, Length: 39894, dtype: int64


In [16]:
# View the shape
sentiment_features.shape

(39894, 9077)

In [17]:
# Initialize the algorithm that you will use
sentiment_lsvc_model = LinearSVC()

In [18]:
# split the dataset
X_train, X_test, y_train, y_test, indices_train, 
indices_test = train_test_split(
    sentiment_features, sentiment_labels, data_df.index, 
    test_size=0.33, random_state=12)

In [19]:
# Train the model
sentiment_lsvc_model.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [20]:
# Make predictions
y_pred = sentiment_lsvc_model.predict(X_test)

In [21]:
# View the classification report
print((classification_report(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.71      0.69      0.70      6122
           1       0.74      0.75      0.74      7044

    accuracy                           0.72     13166
   macro avg       0.72      0.72      0.72     13166
weighted avg       0.72      0.72      0.72     13166



In [22]:
# Checking accuracy
accuracy_score(y_test, y_pred)

0.722391007139602

In [23]:
# View the confusion matrix
pd.DataFrame(confusion_matrix(y_test, y_pred))

Unnamed: 0,0,1
0,4224,1898
1,1757,5287


In [24]:
# Test message
test_message = ["I am not so very good but i am ok"]

In [25]:
# Transform the message to a sparse matrix
transformed_message=sentiment_tfidf.transform(test_message)

In [26]:
# Make the prediction
predicted_sentiment = sentiment_lsvc_model.predict(transformed_message)

In [27]:
# echo the prediction
print(predicted_sentiment)

[1]


In [28]:
print(sentiment_mappings.take(predicted_sentiment)[0])

1


## End