In [None]:
# This may require a runtime restart in order to work
!pip install 'pandas==1.3.0'

In [None]:
!pip install config

In [None]:
!pip install 'tweepy==4.4.0'

In [None]:
!pip install pyspellchecker

In [70]:
# Standard Packages
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import re
from wordcloud import WordCloud

# NLTKPackages
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
from nltk.corpus import stopwords

# Nearal Network packages
#import keras
import tensorflow as tf
from sklearn import preprocessing, model_selection
#from keras.models import Sequential, load_model 
#from keras.layers import Dense, Dropout, Activation
from tensorflow.keras.callbacks import ModelCheckpoint

# Sklearn pacckages 
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#Loading the dataset for mac users
FILE_PATH = '/content/drive/MyDrive/ML/GBC/DL1/Project/'
RANDOM_SEED = 42

# Load the data set
data = pd.read_csv(
    FILE_PATH + 'sentiment_analysis_dataset.csv',
    sep=',',
    on_bad_lines='skip',
    encoding='latin-1'
    )

print(data.shape)
data.head(5)

In [71]:
# Loading the dataset for PC users
data = pd.read_csv('sentiment analysis dataset.csv', sep=',', on_bad_lines='skip', encoding='latin-1')
print(data.shape)
data.head(5)

(1578612, 4)


Unnamed: 0,ï»¿ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [72]:
# check for null data
data.isna().sum()

ï»¿ItemID          0
Sentiment          0
SentimentSource    0
SentimentText      0
dtype: int64

In [73]:
# The Item ID column is not useful for us, drop it
# We also tested using 1/3 of the dataset and ran out of memory, 1/4 seems to be our limit
data = data.drop(['ï»¿ItemID'], axis=1)
data = data.sample(frac=1/15).reset_index(drop=True)
data.shape

(105241, 3)

In [5]:
# Clean text
import re
from string import punctuation
from collections import Counter

from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

def clean_text_for_tfidf_vectorizer(text):
    spell = SpellChecker()
    lemmatizer = WordNetLemmatizer()
    tk = TweetTokenizer()

    stopword_list = stopwords.words('english')
    new_stop_words=['i', 'im', 'http', 'ive', 'rt']
    for i in new_stop_words:
        stopword_list.append(i)

    cleaned_text = []
    punctuation_counts = []

    for sentence in text:
        cleaned_words = []

        punctuation_count = lambda l1,l2: sum([1 for x in l1 if x in l2])
        punctuation_counts.append(punctuation_count(sentence,set(punctuation)))

        for word in tk.tokenize(sentence):
            # Spell check
            word = spell.correction(word.lower())

            # Remove stop words
            if word in stopword_list:
              continue

            # Remove numbers and punctuation
            word = re.sub('[^a-zA-Z]+', '', word)

            if (word == ''):
              continue

            # Lemmatize
            word = lemmatizer.lemmatize(word)

            cleaned_words.append(word)

        cleaned_text.append(' '.join(cleaned_words))
      
    return cleaned_text, punctuation_counts

In [6]:
# Example of cleaning code in action
text = ['Hello, my name is Daniel.', 'This is my example sentence!', 'I am exxaggerating and have mispelled a word!!!!!']
cleaned_text, punctuation_counts = clean_text_for_tfidf_vectorizer(text)

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
tfidf_vect_w_mat = tfidf_vectorizer.fit_transform(cleaned_text)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame (tfidf_vect_w_mat.todense())
tfidf_df.columns = tfidf_feature_names # This may be a very expensive operation, consider commenting out
tfidf_df['punctuation_count'] = punctuation_counts
print('Original sentences:')
print(text)
print()
print('Cleaned sentences:')
print(cleaned_text)
print()
print('tfidf feature names:')
print(tfidf_feature_names)
print()
print('tfidf df with punctuation count:')
tfidf_df

Original sentences:
['Hello, my name is Daniel.', 'This is my example sentence!', 'I am exxaggerating and have mispelled a word!!!!!']

Cleaned sentences:
['hello name daniel', 'example sentence', 'exaggerating misspelled word']

tfidf feature names:
['daniel' 'exaggerating' 'exaggerating misspelled'
 'exaggerating misspelled word' 'example' 'example sentence' 'hello'
 'hello name' 'hello name daniel' 'misspelled' 'misspelled word' 'name'
 'name daniel' 'sentence' 'word']

tfidf df with punctuation count:


Unnamed: 0,daniel,exaggerating,exaggerating misspelled,exaggerating misspelled word,example,example sentence,hello,hello name,hello name daniel,misspelled,misspelled word,name,name daniel,sentence,word,punctuation_count
0,0.408248,0.0,0.0,0.0,0.0,0.0,0.408248,0.408248,0.408248,0.0,0.0,0.408248,0.408248,0.0,0.0,2
1,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,1
2,0.0,0.408248,0.408248,0.408248,0.0,0.0,0.0,0.0,0.0,0.408248,0.408248,0.0,0.0,0.0,0.408248,5


In [12]:
# Checking the run time of the script on a small subset of the dataset
text = data['SentimentText']
text = text.loc[1:500]

print('cleaned text started')
cleaned_text, punctuation_counts = %time clean_text_for_tfidf_vectorizer(text)
print('cleaned text completed')
 
tfidf_vectorizer = %time TfidfVectorizer(ngram_range=(1,3))

print('tfidf vectorizer started')
tfidf_vect_w_mat = %time tfidf_vectorizer.fit_transform(cleaned_text)
print('tfidf vectorizer completed')

tfidf_feature_names = %time tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame (tfidf_vect_w_mat.todense())

print('column names started')
%time tfidf_df.columns = tfidf_feature_names # This may be a very expensive operation, consider commenting out
print('column names completed')

tfidf_df['punctuation_count'] = punctuation_counts

cleaned text started
Wall time: 1min 53s
cleaned text completed
Wall time: 0 ns
tfidf vectorizer started
Wall time: 14 ms
tfidf vectorizer completed
Wall time: 1.02 ms
column names started
Wall time: 0 ns
column names completed


We tried this function to clean the dataset and benchmarked it's performance.  Only 500 tweets took almost 2 minutes, checked to see if we can reduce the time

In [7]:
from spellchecker import SpellChecker
text = data['SentimentText']
text = text.loc[1:500]
processedText = []
punctuation_counts = []

def preprocess(textdata):
    
    # Create Lemmatizer and Stemmer.
    wordLemm = WordNetLemmatizer()
    spell = SpellChecker()
    
    for tweet in textdata:
        punctuation_count = lambda l1,l2: sum([1 for x in l1 if x in l2])
        punctuation_counts.append(punctuation_count(tweet,set(punctuation)))
        tweet = tweet.lower()
        
        # Replace all URls with 'URL'
        tweet = re.sub('[^a-zA-Z]+', ' ', tweet)      
        tweetwords = ''
        for word in tweet.split():
            # Checking if the word is a stopword.
            #if word not in stopwordlist:
            if len(word)>1:
                # Spell check the word
                word = spell.correction(word)
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

print('Starting Function')
%time preprocess(text)
print('Ending Function')

tfidf_vectorizer = %time TfidfVectorizer(ngram_range=(1,3))

print('tfidf vectorizer started')
tfidf_vect_w_mat = %time tfidf_vectorizer.fit_transform(processedText)
print('tfidf vectorizer completed')

tfidf_feature_names = %time tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame (tfidf_vect_w_mat.todense())

print('column names started')
%time tfidf_df.columns = tfidf_feature_names # This may be a very expensive operation, consider commenting out
print('column names completed')

tfidf_df['punctuation_count'] = punctuation_counts

Starting Function
Wall time: 57.3 s
Ending Function
Wall time: 0 ns
tfidf vectorizer started
Wall time: 27.2 ms
tfidf vectorizer completed
Wall time: 2.99 ms
column names started
Wall time: 997 µs
column names completed


After various testing, the expensive function is the spell checker.  We will be creating 2 versions of the database, one with spell checker and one without so we can move forward with model building
while the spell checker compiles

In [74]:
text = data['SentimentText']
processedText = []
punctuation_counts = []


def preprocess(textdata):

    # Create Lemmatizer and Stemmer.
    wordLemm = WordNetLemmatizer()
    spell = SpellChecker()
    
    for tweet in textdata:
        punctuation_count = lambda l1,l2: sum([1 for x in l1 if x in l2])
        punctuation_counts.append(punctuation_count(tweet,set(punctuation)))
        tweet = tweet.lower()
        
        # Replace all URls with 'URL'
        tweet = re.sub('[^a-zA-Z]+', ' ', tweet)
        sequencePattern   = r"(.)\1\1+"
        seqReplacePattern = r"\1\1"
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
        
        tweetwords = ''
        for word in tweet.split():
            # Checking if the word is a stopword.
            #if word not in stopwordlist:
            if len(word)>1:
                # Spell check the word
                word = spell.correction(word)
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

print('Starting Function')
%time preprocess(text)
print('Ending Function')

tfidf_vectorizer = %time TfidfVectorizer(ngram_range=(1,3))

print('tfidf vectorizer started')
tfidf_vect_w_mat = %time tfidf_vectorizer.fit_transform(processedText)
print('tfidf vectorizer completed')



Starting Function
Wall time: 3h 5min 11s
Ending Function
Wall time: 0 ns
tfidf vectorizer started
Wall time: 5.97 s
tfidf vectorizer completed


In [51]:
tfidf_vect_w_mat

<31572x507414 sparse matrix of type '<class 'numpy.float64'>'
	with 1062004 stored elements in Compressed Sparse Row format>

In [77]:
cleaned_text = pd.DataFrame(processedText, columns = ['column_name'])

In [78]:
cleaned_text.head()

Unnamed: 0,column_name
0,kenichan ugh hate that quit foot in agony quit...
1,pandi no no em do no no no utweetme cho ti he he
2,wish it wa time to get train home today seems ...
3,at work forgot my lunch have to queue up with ...
4,janeejohnson if all say so trust you on that one


In [79]:
cleaned_text['punctuation_count'] = punctuation_counts

In [80]:
#Created a .csv for review
cleaned_text.to_csv('cleaned_text.csv')

In [75]:
tfidf_feature_names = %time tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame (tfidf_vect_w_mat.todense())

print('column names started')
%time tfidf_df.columns = tfidf_feature_names # This may be a very expensive operation, consider commenting out
print('column names completed')

tfidf_df['punctuation_count'] = punctuation_counts

Wall time: 1.18 s


MemoryError: Unable to allocate 1.07 TiB for an array with shape (105241, 1393267) and data type float64

In [None]:
# Drop columns
data = data.drop(['ï»¿ItemID','SentimentSource','SentimentText','X_Clean'],axis =1)


# create df that sentiment and clean text
tfid_data = pd.concat([data,df_], axis=1)

# Create a positive and Negative word Cloud

# 1= postive 
neg_tweets = tfid_data[tfid_data.Sentiment == 1]
neg_string = []
for t in tfid_data.X_Clean:
    neg_string.append(t)
neg_string = pd.Series(neg_string).str.cat(sep=' ')

wordcloud = WordCloud(width=1600, height=800,max_font_size=200).generate(neg_string)
plt.figure(figsize=(12,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# 0-=negative
pos_tweets = tfid_data[tfid_data.Sentiment == 0]
pos_string = []
for t in tfid_data.X_Clean:
    pos_string.append(t)
pos_string = pd.Series(pos_string).str.cat(sep=' ')
wordcloud = WordCloud(width=1600, height=800,max_font_size=200,colormap='magma').generate(pos_string) 
plt.figure(figsize=(12,10)) 
plt.imshow(wordcloud, interpolation="bilinear") 
plt.axis("off") 
plt.show()

# Amount of words to be used in the TFIDF
max_words = 2000


#-- tf-idf --> Word level ----------------------------------------------------
tfidf_vect_w = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             max_features=max_words)

tfidf_vect_w_mat = tfidf_vect_w.fit_transform(tfid_data.iloc[:,1])
tfidf_w_names = tfidf_vect_w.get_feature_names()

DF_tfidf_word = pd.DataFrame (tfidf_vect_w_mat.todense())


#-- Assign the column names
DF_tfidf_word.columns = tfidf_w_names


# Prepare data for training

X = np.asarray(DF_tfidf_word, dtype='float64') 

Y = tfid_data.iloc[:,0]
ttrain_y = tf.keras.utils.to_categorical(Y, num_classes=2, dtype='float32')


# Split the data into train and test data
train_x, test_x, train_y, test_y = model_selection.train_test_split(X,ttrain_y,test_size = 0.2, random_state = 0)


# Define your Neural Network
model = Sequential()
model.add(Dense(550, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(225, activation='sigmoid'))
model.add(Dropout(0.3))
model.add(Dense(2, activation='softmax'))
    
model.compile(loss='categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy'])   
    

# Fit your model
checkpointer = ModelCheckpoint(filepath="weights.hdf5", monitor = 'val_acc', verbose=1, save_best_only=True) 

history = model.fit(
                    train_x, 
                    train_y, 
                    epochs=5, 
                    batch_size=80, 
                    callbacks = [checkpointer], 
                    validation_data = (test_x, test_y)
                    )


# Save the model as twitter.h5
model_name = "twitter.h5"
model.save(model_name)
model = load_model(model_name)

y_pred = model.predict(test_x)
y_pred = np.argmax(y_pred, axis=1)

test_y=np.argmax(test_y, axis =1)


# 1= postive 
# 0-=negative
labels = ['negative', 'positive']

# Create a Confusion Matrix
cm = confusion_matrix(y_pred, test_y)

print(history.history.keys())


# Plot your training and validation curves
plt.subplots() # open a new plot
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'])
plt.show()

plt.subplots() # open a new plot
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'])
plt.show()




